diff --git a/.coveragerc b/.coveragerc
index ce73b59b11..cf9ec88a13 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -2,8 +2,7 @@
[run]
omit =
tests/*
- conda/*
- scripts/tests/*
+ scripts/*
concurrency =
multiprocessing
thread
diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000000..6b82ac0df8
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,4 @@
+[flake8]
+max-line-length = 100
+max-complexity = 18
+exclude = tests,__init__.py
diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml
new file mode 100644
index 0000000000..c0dc3368d3
--- /dev/null
+++ b/.github/workflows/unittests-gpu.yml
@@ -0,0 +1,60 @@
+name: continuous build - gpu
+
+on: [push, pull_request_target]
+
+defaults:
+ run:
+ shell: bash
+
+jobs:
+ unittest-gpu:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v2
+
+ - name: Install Linux dependencies
+ run: sudo apt-get install libopenblas-dev
+
+ - name: Setup python
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.7
+ architecture: x64
+
+ - name: Install Other Dependencies
+ run: |
+ python -m pip install --user --quiet --upgrade pip
+ python -m pip install --user --quiet -e .[extras]
+
+ - name: Configure AWS Credentials
+ uses: aws-actions/configure-aws-credentials@v1
+ with:
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws-region: us-east-1
+
+ - name: Extract branch name
+ shell: bash
+ run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
+ id: extract_branch
+
+    - name: Test project on AWS Batch (for push)
+ if: startsWith(steps.extract_branch.outputs.branch, 'PR-') != true
+ run: |
+ python ./tools/batch/submit-job.py --region us-east-1 --job-type g4dn.4x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "/batch_states/test.sh" --wait | tee > script.log
+
+    - name: Test project on AWS Batch (for pull request)
+ if: startsWith(steps.extract_branch.outputs.branch, 'PR-') == true
+ run: |
+ python ./tools/batch/submit-job.py --region us-east-1 --job-type g4dn.4x --source-ref ${{ github.event.pull_request.head.ref }} --work-dir tools/batch --remote https://github.com/${{ github.event.pull_request.head.repo.full_name }} --command "/batch_states/test.sh" --wait | tee > script.log
+
+ - name: Upload log file for AWS Batch test results
+ uses: actions/upload-artifact@v2
+ with:
+ name: GPU_Test_Results
+ path: script.log
+
+
diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
new file mode 100644
index 0000000000..ced8f9a1c8
--- /dev/null
+++ b/.github/workflows/unittests.yml
@@ -0,0 +1,47 @@
+name: continuous build
+
+on: [push, pull_request]
+
+defaults:
+ run:
+ shell: bash
+
+jobs:
+ unittest:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ # TODO Add windows test by using "windows-latest"
+ os: [macos-latest, ubuntu-latest]
+ python-version: [ '3.6', '3.7', '3.8']
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v2
+
+ # Install OS specific dependencies
+ - name: Install Linux dependencies
+ if: matrix.os == 'ubuntu-latest'
+ # TODO https://github.com/apache/incubator-mxnet/issues/18293
+ run: sudo apt-get install libopenblas-dev
+
+ - name: Setup python
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ architecture: x64
+ - name: Install Other Dependencies
+ run: |
+ python -m pip install --user --upgrade pip
+ python -m pip install --user setuptools pytest pytest-cov contextvars
+ python -m pip install --upgrade cython
+ python -m pip install --pre --user "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python
+ python -m pip install --user -e .[extras]
+ - name: Test project
+ run: |
+ python -m pytest --cov=./ --cov-report=xml --device="cpu" --durations=50 tests/
+ - name: Upload coverage to Codecov
+ uses: codecov/codecov-action@v1.0.10
+ with:
+ env_vars: OS,PYTHON
+
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 99f7dae7c9..0000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,10 +0,0 @@
-[submodule "scripts/word_embeddings/tools/extern/CLI11"]
- path = scripts/word_embeddings/tools/extern/CLI11
- url = https://github.com/CLIUtils/CLI11.git
-[submodule "scripts/word_embeddings/tools/extern/cnpy"]
- path = scripts/word_embeddings/tools/extern/cnpy
- url = https://github.com/leezu/cnpy
- branch = libzip
-[submodule "scripts/word_embeddings/tools/extern/sparsepp"]
- path = scripts/word_embeddings/tools/extern/sparsepp
- url = https://github.com/greg7mdp/sparsepp.git
diff --git a/.pytype.cfg b/.pytype.cfg
index 8220a41658..ebf2d9c586 100644
--- a/.pytype.cfg
+++ b/.pytype.cfg
@@ -5,4 +5,4 @@ inputs =
src/gluonnlp
# Python version (major.minor) of the target code.
-python_version = 3.5
+python_version = 3.6
diff --git a/CODEOWNERS b/CODEOWNERS
index 11af321c0e..43d8c57893 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,9 +1,9 @@
-# Watchers and contributors to Apache MXNet repo directories/packages/files
+# Watchers and contributors to DMLC GluonNLP repo directories/packages/files
# Please see documentation of use of CODEOWNERS file at
# https://help.github.com/articles/about-codeowners/ and
# https://github.com/blog/2392-introducing-code-owners
#
-# Anybody can add themselves or a team as additional watcher or contributor
+# Anybody can add themselves or a team as additional watcher or contributor
# to get notified about changes in a specific package.
# See https://help.github.com/articles/about-teams how to setup teams.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
deleted file mode 100644
index abb8a2119f..0000000000
--- a/CONTRIBUTING.md
+++ /dev/null
@@ -1 +0,0 @@
-Contribution guideline can be found at http://gluon-nlp.mxnet.io/community/contribute.html
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 5ebc05b4eb..0000000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,5 +0,0 @@
-recursive-include gluonnlp *.py
-include LICENSE
-include README.rst
-recursive-exclude tests *
-recursive-exclude scripts *
\ No newline at end of file
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 90b1b01e19..0000000000
--- a/Makefile
+++ /dev/null
@@ -1,113 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-ROOTDIR = $(CURDIR)
-MD2IPYNB = $(ROOTDIR)/docs/md2ipynb.py
-
-flake8:
- flake8 --exclude conda,*tests*,test_*.py,scripts/word_embeddings/tools/extern --count --select=E9,F63,F7,F82 --show-source --statistics $(lintdir)
-
-pylint:
- pylint --rcfile=$(ROOTDIR)/.pylintrc $(lintdir)
-
-pytype:
- pytype --config=$(ROOTDIR)/.pytype.cfg
-
-restruc:
- python setup.py check --restructuredtext --strict
-
-lint:
- make lintdir=$(lintdir) flake8
- make lintdir=$(lintdir) pylint
- make pytype
- make lintdir=$(lintdir) ratcheck
- make restruc
-
-ci/rat/apache-rat.jar:
- mkdir -p build
- svn co http://svn.apache.org/repos/asf/creadur/rat/tags/apache-rat-project-0.13/ ci/rat/apache-rat; \
- cd ci/rat/apache-rat/apache-rat; \
- mvn -Dmaven.test.skip=true install;
- cp ci/rat/apache-rat/apache-rat/target/apache-rat-0.13.jar ci/rat/apache-rat.jar
-
-ratcheck: ci/rat/apache-rat.jar
- exec 5>&1; \
- RAT_JAR=ci/rat/apache-rat.jar; \
- OUTPUT=$(java -jar $(RAT_JAR) -E ci/rat/rat-excludes -d $(lintdir) | tee >(cat - >&5)); \
- ERROR_MESSAGE="Printing headers for text files without a valid license header"; \
- echo "-------Process The Output-------"; \
- if [[ $OUTPUT =~ $ERROR_MESSAGE ]]; then \
- echo "ERROR: RAT Check detected files with unknown licenses. Please fix and run test again!"; \
- exit 1; \
- else \
- echo "SUCCESS: There are no files with an Unknown License."; \
- fi
-
-docs: compile_notebooks distribute
- make -C docs html SPHINXOPTS=-W
- for f in $(shell find docs/examples -type f -name '*.md' -print) ; do \
- FILE=`echo $$f | sed 's/docs\///g'` ; \
- DIR=`dirname $$FILE` ; \
- BASENAME=`basename $$FILE` ; \
- HTML_BASENAME=`echo $$BASENAME | sed 's/md/html/'` ; \
- IPYNB_BASENAME=`echo $$BASENAME | sed 's/md/ipynb/'` ; \
- TARGET_HTML="docs/_build/html/$$DIR/$$HTML_BASENAME" ; \
- echo "processing" $$BASENAME ; \
- sed -i "s/$$IPYNB_BASENAME/$$BASENAME/g" $$TARGET_HTML; \
- done;
- for f in $(shell find docs/model_zoo -type f -name '*.rst' -print) ; do \
- DIR=`dirname $$f` ; \
- BASENAME=`basename $$f` ; \
- HTML_BASENAME=`echo $$BASENAME | sed 's/rst/html/'` ; \
- TARGET_HTML="docs/_build/html/$$DIR/$$HTML_BASENAME" ; \
- echo "processing" $$BASENAME ; \
- sed -i "s/docs\/model_zoo/scripts/g" $$TARGET_HTML; \
- done;
- sed -i.bak 's/33\,150\,243/23\,141\,201/g' docs/_build/html/_static/material-design-lite-1.3.0/material.blue-deep_orange.min.css;
- sed -i.bak 's/2196f3/178dc9/g' docs/_build/html/_static/sphinx_materialdesign_theme.css;
-
-clean:
- git clean -ff -d -x --exclude="$(ROOTDIR)/tests/data/*" --exclude="$(ROOTDIR)/conda/"
-
-compile_notebooks:
- for f in $(shell find docs/examples -type f -name '*.md' -print) ; do \
- DIR=$$(dirname $$f) ; \
- BASENAME=$$(basename $$f) ; \
- TARGETNAME=$${BASENAME%.md}.ipynb ; \
- echo $$DIR $$BASENAME $$TARGETNAME; \
- cd $$DIR ; \
- if [ -f $$TARGETNAME ]; then \
- echo $$TARGETNAME exists. Skipping compilation of $$BASENAME in Makefile. ; \
- else \
- python $(MD2IPYNB) $$BASENAME ; \
- fi ; \
- cd - ; \
- done;
-
-dist_scripts:
- cd scripts && \
- find * -type d -prune | grep -v 'tests\|__pycache__' | xargs -t -n 1 -I{} zip -r {}.zip {}
-
-dist_notebooks:
- cd docs/examples && \
- find * -type d -prune | grep -v 'tests\|__pycache__' | xargs -t -n 1 -I{} zip -r {}.zip {} -x "*.md" -x "__pycache__" -x "*.pyc" -x "*.txt" -x "*.log" -x "*.params" -x "*.npz" -x "*.json"
-
-test:
- py.test -v --capture=no --durations=0 tests/unittest scripts
-
-distribute: dist_scripts dist_notebooks
- python setup.py sdist
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000..62e34d894e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,111 @@
+
+GluonNLP: Your Choice of Deep Learning for NLP
+
+GluonNLP is a toolkit that enables easy text preprocessing, dataset
+loading, and neural model building to help you speed up your Natural
+Language Processing (NLP) research.
+
+# Features
+
+For NLP Practitioners
+- Easy-to-use Data Pipeline
+- Automatically Train Models via AutoNLP (TODO)
+
+For Researchers
+- Pretrained Model Zoo
+- Programming with numpy-like API
+
+For Engineers
+- Fast Deployment
+ - [TVM](https://tvm.apache.org/) (TODO)
+- AWS Integration
+
+
+# Installation
+First of all, install the latest MXNet. You may use the following commands:
+
+```bash
+# Install the version with CUDA 10.0
+python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200802" -f https://dist.mxnet.io/python
+
+# Install the version with CUDA 10.1
+python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200802" -f https://dist.mxnet.io/python
+
+# Install the version with CUDA 10.2
+python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python
+
+# Install the cpu-only version
+python3 -m pip install -U --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python
+```
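+
+As a quick sanity check that MXNet installed correctly (any recent 2.0 nightly build
+should work), you can print the installed version:
+
+```bash
+python3 -c "import mxnet; print(mxnet.__version__)"
+```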
+
+
+To install GluonNLP, use
+
+```bash
+python3 -m pip install -U -e .
+
+# Also, you may install all the extra requirements via
+python3 -m pip install -U -e ."[extras]"
+```
+
+If you do not have the required permissions, you can also install into the user folder:
+
+```bash
+python3 -m pip install -U -e . --user
+```
+
+For Windows users, we recommend using the [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/about).
+
+
+# Access the Command-line Toolkits
+
+To help researchers and engineers, we provide command-line toolkits for
+downloading and preprocessing NLP datasets. For more details, you may refer to
+[GluonNLP Datasets](./scripts/datasets) and [GluonNLP Preprocessing Tools](./scripts/preprocess).
+
+```bash
+# CLI for downloading / preparing the dataset
+nlp_data help
+
+# CLI for accessing some common data preprocessing scripts
+nlp_preprocess help
+
+# Also, you can use `python -m` to access the toolkits
+python3 -m gluonnlp.cli.data help
+python3 -m gluonnlp.cli.preprocess help
+
+```
+
+### Frequently Asked Questions
+- **Question**: I cannot access the command-line toolkits. Running `nlp_data` reports `nlp_data: command not found`.
+
+  This usually happens because gluonnlp was installed into the user folder, so the
+  executables live in `~/.local/bin`. You can change the `PATH` variable to
+  also include `~/.local/bin`:
+
+ ```
+ export PATH=${PATH}:~/.local/bin
+ ```
+
+
+# Run Unittests
+You may go to [tests](tests) to learn how to run the unittests; a minimal example is sketched below.
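+
+The following is a minimal sketch of a local CPU test run, mirroring the commands used by the
+continuous-build CI workflow (individual tests may need additional dependencies):
+
+```bash
+# install the core test dependencies that CI uses
+python3 -m pip install --user setuptools pytest pytest-cov contextvars
+
+# run the unit tests on CPU with coverage, as the CI workflow does
+python3 -m pytest --cov=./ --cov-report=xml --device="cpu" --durations=50 tests/
+```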
+
+
+# Use Docker
+You can use Docker to launch a JupyterLab development environment with GluonNLP installed.
+
+```
+docker pull gluonai/gluon-nlp:gpu-latest
+docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=4g gluonai/gluon-nlp:gpu-latest
+```
+
+For more details, you can refer to the guidance in [tools/docker](tools/docker).
diff --git a/README.rst b/README.rst
deleted file mode 100644
index cf004dc838..0000000000
--- a/README.rst
+++ /dev/null
@@ -1,218 +0,0 @@
-.. raw:: html
-
- ![](https://github.com/dmlc/gluon-nlp/raw/be3bc8852155e935d68d397e0743715c54c3ce76/docs/_static/gluon_s2.png)
-
-
-.. raw:: html
-
-
-
-GluonNLP: Your Choice of Deep Learning for NLP
-
-.. raw:: html
-
-
-
-.. raw:: html
-
-
-
-
-
-
-GluonNLP is a toolkit that enables easy text preprocessing, datasets
-loading and neural models building to help you speed up your Natural
-Language Processing (NLP) research.
-
-- `Quick Start Guide `__
-- `Resources `__
-
-News
-====
-
-- Tutorial proposal for GluonNLP is accepted at `EMNLP 2019 `__, Hong Kong.
-
-- GluonNLP was featured in:
-
- - **KDD 2019 Alaska**! Check out our tutorial: `From Shallow to Deep Language Representations: Pre-training, Fine-tuning, and Beyond `__.
- - **JSALT 2019 in Montreal, 2019-6-14**! Checkout **https://jsalt19.mxnet.io**.
- - **AWS re:invent 2018 in Las Vegas, 2018-11-28**! Checkout `details `_.
- - **PyData 2018 NYC, 2018-10-18**! Checkout the `awesome talk `__ by Sneha Jha.
- - **KDD 2018 London, 2018-08-21, Apache MXNet Gluon tutorial**! Check out **https://kdd18.mxnet.io**.
-
-Installation
-============
-
-Make sure you have Python 3.5 or newer and a recent version of MXNet (our CI
-server runs the testsuite with Python 3.5).
-
-You can install ``MXNet`` and ``GluonNLP`` using pip.
-
-``GluonNLP`` is based on the most recent version of ``MXNet``.
-
-
-In particular, if you want to install the most recent ``MXNet`` release:
-
-::
-
- pip install --upgrade mxnet>=1.6.0
-
-Else, if you want to install the most recent ``MXNet`` nightly build:
-
-::
-
- pip install --pre --upgrade mxnet
-
-Then, you can install ``GluonNLP``:
-
-::
-
- pip install gluonnlp
-
-Please check more `installation details `_.
-
-Docs 📖
-=======
-
-GluonNLP documentation is available at `our
-website `__.
-
-Community
-=========
-
-GluonNLP is a community that believes in sharing.
-
-For questions, comments, and bug reports, `Github issues `__ is the best way to reach us.
-
-We now have a new Slack channel `here `__.
-(`register `__).
-
-How to Contribute
-=================
-
-GluonNLP community welcomes contributions from anyone!
-
-There are lots of opportunities for you to become our `contributors `__:
-
-- Ask or answer questions on `GitHub issues `__.
-- Propose ideas, or review proposed design ideas on `GitHub issues `__.
-- Improve the `documentation `__.
-- Contribute bug reports `GitHub issues `__.
-- Write new `scripts `__ to reproduce
- state-of-the-art results.
-- Write new `examples `__ to explain
- key ideas in NLP methods and models.
-- Write new `public datasets `__
- (license permitting).
-- Most importantly, if you have an idea of how to contribute, then do it!
-
-For a list of open starter tasks, check `good first issues `__.
-
-Also see our `contributing
-guide `__ on simple how-tos,
-contribution guidelines and more.
-
-Resources
-=========
-
-Check out how to use GluonNLP for your own research or projects.
-
-If you are new to Gluon, please check out our `60-minute crash course
-`__.
-
-For getting started quickly, refer to notebook runnable examples at
-`Examples. `__
-
-For advanced examples, check out our
-`Scripts. `__
-
-For experienced users, check out our
-`API Notes `__.
-
-Quick Start Guide
-=================
-
-`Dataset Loading `__
--------------------------------------------------------------------------------
-
-Load the Wikitext-2 dataset, for example:
-
-.. code:: python
-
- >>> import gluonnlp as nlp
- >>> train = nlp.data.WikiText2(segment='train')
- >>> train[0:5]
- ['=', 'Valkyria', 'Chronicles', 'III', '=']
-
-`Vocabulary Construction `__
--------------------------------------------------------------------------------------
-
-Build vocabulary based on the above dataset, for example:
-
-.. code:: python
-
- >>> vocab = nlp.Vocab(counter=nlp.data.Counter(train))
- >>> vocab
- Vocab(size=33280, unk="", reserved="['', '', '']")
-
-`Neural Models Building `__
-------------------------------------------------------------------------------------
-
-From the models package, apply a Standard RNN language model to the
-above dataset:
-
-.. code:: python
-
- >>> model = nlp.model.language_model.StandardRNN('lstm', len(vocab),
- ... 200, 200, 2, 0.5, True)
- >>> model
- StandardRNN(
- (embedding): HybridSequential(
- (0): Embedding(33280 -> 200, float32)
- (1): Dropout(p = 0.5, axes=())
- )
- (encoder): LSTM(200 -> 200.0, TNC, num_layers=2, dropout=0.5)
- (decoder): HybridSequential(
- (0): Dense(200 -> 33280, linear)
- )
- )
-
-`Word Embeddings Loading `__
------------------------------------------------------------------------------------------
-
-For example, load a GloVe word embedding, one of the state-of-the-art
-English word embeddings:
-
-.. code:: python
-
- >>> glove = nlp.embedding.create('glove', source='glove.6B.50d')
- # Obtain vectors for 'baby' in the GloVe word embedding
- >>> type(glove['baby'])
-
- >>> glove['baby'].shape
- (50,)
-
-
-Reference Paper
-===============
-
-The bibtex entry for the `reference paper `__ of GluonNLP is:
-
-.. code::
-
- @article{gluoncvnlp2020,
- author = {Jian Guo and He He and Tong He and Leonard Lausen and Mu Li and Haibin Lin and Xingjian Shi and Chenguang Wang and Junyuan Xie and Sheng Zha and Aston Zhang and Hang Zhang and Zhi Zhang and Zhongyue Zhang and Shuai Zheng and Yi Zhu},
- title = {GluonCV and GluonNLP: Deep Learning in Computer Vision and Natural Language Processing},
- journal = {Journal of Machine Learning Research},
- year = {2020},
- volume = {21},
- number = {23},
- pages = {1-7},
- url = {http://jmlr.org/papers/v21/19-429.html}
- }
-
-
-New to Deep Learning or NLP?
-============================
-
-For background knowledge of deep learning or NLP, please refer to the open source book `Dive into Deep Learning `__.
diff --git a/ci/batch/docker/Dockerfile b/ci/batch/docker/Dockerfile
deleted file mode 100644
index 8cc64125b5..0000000000
--- a/ci/batch/docker/Dockerfile
+++ /dev/null
@@ -1,27 +0,0 @@
-FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
-
- RUN apt-get update && apt-get install -y --no-install-recommends \
- build-essential \
- locales \
- cmake \
- git \
- curl \
- vim \
- unzip \
- sudo \
- ca-certificates \
- libjpeg-dev \
- libpng-dev \
- libfreetype6-dev \
- libxft-dev &&\
- rm -rf /var/lib/apt/lists/*
-
- RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
- chmod +x ~/miniconda.sh && \
- ~/miniconda.sh -b -p /opt/conda && \
- rm ~/miniconda.sh && \
- /opt/conda/bin/conda clean -ya
- ENV PATH /opt/conda/bin:$PATH
- RUN git clone https://github.com/dmlc/gluon-nlp
- WORKDIR gluon-nlp
- ADD gluon_nlp_job.sh .
diff --git a/ci/codecov.sh b/ci/codecov.sh
deleted file mode 100755
index 1ef332b1b3..0000000000
--- a/ci/codecov.sh
+++ /dev/null
@@ -1,1550 +0,0 @@
-#!/usr/bin/env bash
-
-# Apache License Version 2.0, January 2004
-# https://github.com/codecov/codecov-bash/blob/master/LICENSE
-
-
-set -e +o pipefail
-
-VERSION="0b37652"
-
-url="https://codecov.io"
-env="$CODECOV_ENV"
-service=""
-token=""
-search_in=""
-flags=""
-exit_with=0
-curlargs=""
-curlawsargs=""
-dump="0"
-clean="0"
-curl_s="-s"
-name="$CODECOV_NAME"
-include_cov=""
-exclude_cov=""
-ddp="$(echo ~)/Library/Developer/Xcode/DerivedData"
-xp=""
-files=""
-cacert="$CODECOV_CA_BUNDLE"
-gcov_ignore="-not -path './bower_components/**' -not -path './node_modules/**' -not -path './vendor/**'"
-gcov_include=""
-
-ft_gcov="1"
-ft_coveragepy="1"
-ft_fix="1"
-ft_search="1"
-ft_s3="1"
-ft_network="1"
-ft_xcodellvm="1"
-ft_xcodeplist="0"
-
-_git_root=$(git rev-parse --show-toplevel 2>/dev/null || hg root 2>/dev/null || echo $PWD)
-git_root="$_git_root"
-codecov_yml=""
-remote_addr=""
-if [ "$git_root" = "$PWD" ];
-then
- git_root="."
-fi
-
-url_o=""
-pr_o=""
-build_o=""
-commit_o=""
-search_in_o=""
-tag_o=""
-branch_o=""
-slug_o=""
-prefix_o=""
-
-commit="$VCS_COMMIT_ID"
-branch="$VCS_BRANCH_NAME"
-pr="$VCS_PULL_REQUEST"
-slug="$VCS_SLUG"
-tag="$VCS_TAG"
-build_url="$CI_BUILD_URL"
-build="$CI_BUILD_ID"
-job="$CI_JOB_ID"
-
-beta_xcode_partials=""
-
-proj_root="$git_root"
-gcov_exe="gcov"
-gcov_arg=""
-
-b="\033[0;36m"
-g="\033[0;32m"
-r="\033[0;31m"
-e="\033[0;90m"
-x="\033[0m"
-
-show_help() {
-cat << EOF
-
- Codecov Bash $VERSION
-
- Global report uploading tool for Codecov
- Documentation at https://docs.codecov.io/docs
- Contribute at https://github.com/codecov/codecov-bash
-
-
- -h Display this help and exit
- -f FILE Target file(s) to upload
-
- -f "path/to/file" only upload this file
- skips searching unless provided patterns below
-
- -f '!*.bar' ignore all files at pattern *.bar
- -f '*.foo' include all files at pattern *.foo
- Must use single quotes.
- This is non-exclusive, use -s "*.foo" to match specific paths.
-
- -s DIR Directory to search for coverage reports.
- Already searches project root and artifact folders.
- -t TOKEN Set the private repository token
- (option) set environment variable CODECOV_TOKEN=:uuid
-
- -t @/path/to/token_file
- -t uuid
-
- -n NAME Custom defined name of the upload. Visible in Codecov UI
-
- -e ENV Specify environment variables to be included with this build
- Also accepting environment variables: CODECOV_ENV=VAR,VAR2
-
- -e VAR,VAR2
-
- -X feature Toggle functionalities
-
- -X gcov Disable gcov
- -X coveragepy Disable python coverage
- -X fix Disable report fixing
- -X search Disable searching for reports
- -X xcode Disable xcode processing
- -X network Disable uploading the file network
-
- -R root dir Used when not in git/hg project to identify project root directory
- -y conf file Used to specify the location of the .codecov.yml config file
- -F flag Flag the upload to group coverage metrics
-
- -F unittests This upload is only unittests
- -F integration This upload is only integration tests
- -F ui,chrome This upload is Chrome - UI tests
-
- -c Move discovered coverage reports to the trash
- -Z Exit with 1 if not successful. Default will Exit with 0
-
- -- xcode --
- -D Custom Derived Data Path for Coverage.profdata and gcov processing
- Default '~/Library/Developer/Xcode/DerivedData'
- -J Specify packages to build coverage.
- This can significantly reduces time to build coverage reports.
-
- -J 'MyAppName' Will match "MyAppName" and "MyAppNameTests"
- -J '^ExampleApp$' Will match only "ExampleApp" not "ExampleAppTests"
-
- -- gcov --
- -g GLOB Paths to ignore during gcov gathering
- -G GLOB Paths to include during gcov gathering
- -p dir Project root directory
- Also used when preparing gcov
- -k prefix Prefix filepaths to help resolve path fixing: https://github.com/codecov/support/issues/472
- -x gcovexe gcov executable to run. Defaults to 'gcov'
- -a gcovargs extra arguments to pass to gcov
-
- -- Override CI Environment Variables --
- These variables are automatically detected by popular CI providers
-
- -B branch Specify the branch name
- -C sha Specify the commit sha
- -P pr Specify the pull request number
- -b build Specify the build number
- -T tag Specify the git tag
-
- -- Enterprise --
- -u URL Set the target url for Enterprise customers
- Not required when retrieving the bash uploader from your CCE
- (option) Set environment variable CODECOV_URL=https://my-hosted-codecov.com
- -r SLUG owner/repo slug used instead of the private repo token in Enterprise
- (option) set environment variable CODECOV_SLUG=:owner/:repo
- (option) set in your codecov.yml "codecov.slug"
- -S PATH File path to your cacert.pem file used to verify ssl with Codecov Enterprise (optional)
- (option) Set environment variable: CODECOV_CA_BUNDLE="/path/to/ca.pem"
- -U curlargs Extra curl arguments to communicate with Codecov. e.g., -U "--proxy http://http-proxy"
- -A curlargs Extra curl arguments to communicate with AWS.
-
- -- Debugging --
- -d Don't upload, but dump upload file to stdout
- -K Remove color from the output
- -v Verbose mode
-
-EOF
-}
-
-
-say() {
- echo -e "$1"
-}
-
-
-urlencode() {
- echo "$1" | curl -Gso /dev/null -w %{url_effective} --data-urlencode @- "" | cut -c 3- | sed -e 's/%0A//'
-}
-
-
-swiftcov() {
- _dir=$(dirname "$1" | sed 's/\(Build\).*/\1/g')
- for _type in app framework xctest
- do
- find "$_dir" -name "*.$_type" | while read f
- do
- _proj=${f##*/}
- _proj=${_proj%."$_type"}
- if [ "$2" = "" ] || [ "$(echo "$_proj" | grep -i "$2")" != "" ];
- then
- say " $g+$x Building reports for $_proj $_type"
- dest=$([ -f "$f/$_proj" ] && echo "$f/$_proj" || echo "$f/Contents/MacOS/$_proj")
- _proj_name=$(echo "$_proj" | sed -e 's/[[:space:]]//g')
- xcrun llvm-cov show $beta_xcode_partials -instr-profile "$1" "$dest" > "$_proj_name.$_type.coverage.txt" \
- || say " ${r}x>${x} llvm-cov failed to produce results for $dest"
- fi
- done
- done
-}
-
-
-# Credits to: https://gist.github.com/pkuczynski/8665367
-parse_yaml() {
- local prefix=$2
- local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
- sed -ne "s|^\($s\)\($w\)$s:$s\"\(.*\)\"$s\$|\1$fs\2$fs\3|p" \
- -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 |
- awk -F$fs '{
- indent = length($1)/2;
- vname[indent] = $2;
- for (i in vname) {if (i > indent) {delete vname[i]}}
- if (length($3) > 0) {
- vn=""; if (indent > 0) {vn=(vn)(vname[0])("_")}
- printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3);
- }
- }'
-}
-
-
-if [ $# != 0 ];
-then
- while getopts "a:A:b:B:cC:dD:e:f:F:g:G:hJ:k:Kn:p:P:r:R:y:s:S:t:T:u:U:vx:X:Z" o
- do
- case "$o" in
- "a")
- gcov_arg=$OPTARG
- ;;
- "A")
- curlawsargs="$OPTARG"
- ;;
- "b")
- build_o="$OPTARG"
- ;;
- "B")
- branch_o="$OPTARG"
- ;;
- "c")
- clean="1"
- ;;
- "C")
- commit_o="$OPTARG"
- ;;
- "d")
- dump="1"
- ;;
- "D")
- ddp="$OPTARG"
- ;;
- "e")
- env="$env,$OPTARG"
- ;;
- "f")
- if [ "${OPTARG::1}" = "!" ];
- then
- exclude_cov="$exclude_cov -not -path '${OPTARG:1}'"
-
- elif [[ "$OPTARG" = *"*"* ]];
- then
- include_cov="$include_cov -or -name '$OPTARG'"
-
- else
- ft_search=0
- if [ "$files" = "" ];
- then
- files="$OPTARG"
- else
- files="$files
-$OPTARG"
- fi
- fi
- ;;
- "F")
- if [ "$flags" = "" ];
- then
- flags="$OPTARG"
- else
- flags="$flags,$OPTARG"
- fi
- ;;
- "g")
- gcov_ignore="$gcov_ignore -not -path '$OPTARG'"
- ;;
- "G")
- gcov_include="$gcov_include -path '$OPTARG'"
- ;;
- "h")
- show_help
- exit 0;
- ;;
- "J")
- ft_xcodellvm="1"
- ft_xcodeplist="0"
- if [ "$xp" = "" ];
- then
- xp="$OPTARG"
- else
- xp="$xp\|$OPTARG"
- fi
- ;;
- "k")
- prefix_o=$(echo "$OPTARG" | sed -e 's:^/*::' -e 's:/*$::')
- ;;
- "K")
- b=""
- g=""
- r=""
- e=""
- x=""
- ;;
- "n")
- name="$OPTARG"
- ;;
- "p")
- proj_root="$OPTARG"
- ;;
- "P")
- pr_o="$OPTARG"
- ;;
- "r")
- slug_o="$OPTARG"
- ;;
- "R")
- git_root="$OPTARG"
- ;;
- "s")
- if [ "$search_in_o" = "" ];
- then
- search_in_o="$OPTARG"
- else
- search_in_o="$search_in_o $OPTARG"
- fi
- ;;
- "S")
- cacert="--cacert \"$OPTARG\""
- ;;
- "t")
- if [ "${OPTARG::1}" = "@" ];
- then
- token=$(cat "${OPTARG:1}" | tr -d ' \n')
- else
- token="$OPTARG"
- fi
- ;;
- "T")
- tag_o="$OPTARG"
- ;;
- "u")
- url_o=$(echo "$OPTARG" | sed -e 's/\/$//')
- ;;
- "U")
- curlargs="$OPTARG"
- ;;
- "v")
- set -x
- curl_s=""
- ;;
- "x")
- gcov_exe=$OPTARG
- ;;
- "X")
- if [ "$OPTARG" = "gcov" ];
- then
- ft_gcov="0"
- elif [ "$OPTARG" = "coveragepy" ] || [ "$OPTARG" = "py" ];
- then
- ft_coveragepy="0"
- elif [ "$OPTARG" = "xcodellvm" ];
- then
- ft_xcodellvm="1"
- ft_xcodeplist="0"
- elif [ "$OPTARG" = "fix" ] || [ "$OPTARG" = "fixes" ];
- then
- ft_fix="0"
- elif [ "$OPTARG" = "xcode" ];
- then
- ft_xcodellvm="0"
- ft_xcodeplist="0"
- elif [ "$OPTARG" = "search" ];
- then
- ft_search="0"
- elif [ "$OPTARG" = "xcodepartials" ];
- then
- beta_xcode_partials="-use-color"
- elif [ "$OPTARG" = "network" ];
- then
- ft_network="0"
- elif [ "$OPTARG" = "s3" ];
- then
- ft_s3="0"
- fi
- ;;
- "y")
- codecov_yml="$OPTARG"
- ;;
- "Z")
- exit_with=1
- ;;
- esac
- done
-fi
-
-say "
- _____ _
- / ____| | |
-| | ___ __| | ___ ___ _____ __
-| | / _ \\ / _\` |/ _ \\/ __/ _ \\ \\ / /
-| |___| (_) | (_| | __/ (_| (_) \\ V /
- \\_____\\___/ \\__,_|\\___|\\___\\___/ \\_/
- Bash-$VERSION
-
-"
-
-search_in="$proj_root"
-
-if [ "$JENKINS_URL" != "" ];
-then
- say "$e==>$x Jenkins CI detected."
- # https://wiki.jenkins-ci.org/display/JENKINS/Building+a+software+project
- # https://wiki.jenkins-ci.org/display/JENKINS/GitHub+pull+request+builder+plugin#GitHubpullrequestbuilderplugin-EnvironmentVariables
- service="jenkins"
-
- if [ "$ghprbSourceBranch" != "" ];
- then
- branch="$ghprbSourceBranch"
- elif [ "$GIT_BRANCH" != "" ];
- then
- branch="$GIT_BRANCH"
- elif [ "$BRANCH_NAME" != "" ];
- then
- branch="$BRANCH_NAME"
- fi
-
- if [ "$ghprbActualCommit" != "" ];
- then
- commit="$ghprbActualCommit"
- elif [ "$GIT_COMMIT" != "" ];
- then
- commit="$GIT_COMMIT"
- fi
-
- if [ "$ghprbPullId" != "" ];
- then
- pr="$ghprbPullId"
- elif [ "$CHANGE_ID" != "" ];
- then
- pr="$CHANGE_ID"
- fi
-
- build="$BUILD_NUMBER"
- build_url=$(urlencode "$BUILD_URL")
-
-elif [ "$CI" = "true" ] && [ "$TRAVIS" = "true" ] && [ "$SHIPPABLE" != "true" ];
-then
- say "$e==>$x Travis CI detected."
- # https://docs.travis-ci.com/user/environment-variables/
- service="travis"
- commit="${TRAVIS_PULL_REQUEST_SHA:-$TRAVIS_COMMIT}"
- build="$TRAVIS_JOB_NUMBER"
- pr="$TRAVIS_PULL_REQUEST"
- job="$TRAVIS_JOB_ID"
- slug="$TRAVIS_REPO_SLUG"
- env="$env,TRAVIS_OS_NAME"
- tag="$TRAVIS_TAG"
- if [ "$TRAVIS_BRANCH" != "$TRAVIS_TAG" ];
- then
- branch="$TRAVIS_BRANCH"
- fi
-
- language=$(printenv | grep "TRAVIS_.*_VERSION" | head -1)
- if [ "$language" != "" ];
- then
- env="$env,${language%=*}"
- fi
-
-elif [ "$DOCKER_REPO" != "" ];
-then
- say "$e==>$x Docker detected."
- # https://docs.docker.com/docker-cloud/builds/advanced/
- service="docker"
- branch="$SOURCE_BRANCH"
- commit="$SOURCE_COMMIT"
- slug="$DOCKER_REPO"
- tag="$CACHE_TAG"
- env="$env,IMAGE_NAME"
-
-elif [ "$CI" = "true" ] && [ "$CI_NAME" = "codeship" ];
-then
- say "$e==>$x Codeship CI detected."
- # https://www.codeship.io/documentation/continuous-integration/set-environment-variables/
- service="codeship"
- branch="$CI_BRANCH"
- build="$CI_BUILD_NUMBER"
- build_url=$(urlencode "$CI_BUILD_URL")
- commit="$CI_COMMIT_ID"
-
-elif [ ! -z "$CF_BUILD_URL" ] && [ ! -z "$CF_BUILD_ID" ];
-then
- say "$e==>$x Codefresh CI detected."
- # https://docs.codefresh.io/v1.0/docs/variables
- service="codefresh"
- branch="$CF_BRANCH"
- build="$CF_BUILD_ID"
- build_url=$(urlencode "$CF_BUILD_URL")
- commit="$CF_REVISION"
-
-elif [ "$TEAMCITY_VERSION" != "" ];
-then
- say "$e==>$x TeamCity CI detected."
- # https://confluence.jetbrains.com/display/TCD8/Predefined+Build+Parameters
- # https://confluence.jetbrains.com/plugins/servlet/mobile#content/view/74847298
- if [ "$TEAMCITY_BUILD_BRANCH" = '' ];
- then
- echo " Teamcity does not automatically make build parameters available as environment variables."
- echo " Add the following environment parameters to the build configuration"
- echo " env.TEAMCITY_BUILD_BRANCH = %teamcity.build.branch%"
- echo " env.TEAMCITY_BUILD_ID = %teamcity.build.id%"
- echo " env.TEAMCITY_BUILD_URL = %teamcity.serverUrl%/viewLog.html?buildId=%teamcity.build.id%"
- echo " env.TEAMCITY_BUILD_COMMIT = %system.build.vcs.number%"
- echo " env.TEAMCITY_BUILD_REPOSITORY = %vcsroot..url%"
- fi
- service="teamcity"
- branch="$TEAMCITY_BUILD_BRANCH"
- build="$TEAMCITY_BUILD_ID"
- build_url=$(urlencode "$TEAMCITY_BUILD_URL")
- if [ "$TEAMCITY_BUILD_COMMIT" != "" ];
- then
- commit="$TEAMCITY_BUILD_COMMIT"
- else
- commit="$BUILD_VCS_NUMBER"
- fi
- remote_addr="$TEAMCITY_BUILD_REPOSITORY"
-
-elif [ "$CI" = "true" ] && [ "$CIRCLECI" = "true" ];
-then
- say "$e==>$x Circle CI detected."
- # https://circleci.com/docs/environment-variables
- service="circleci"
- branch="$CIRCLE_BRANCH"
- build="$CIRCLE_BUILD_NUM"
- job="$CIRCLE_NODE_INDEX"
- if [ "$CIRCLE_PROJECT_REPONAME" != "" ];
- then
- slug="$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME"
- else
- # git@github.com:owner/repo.git
- slug="${CIRCLE_REPOSITORY_URL##*:}"
- # owner/repo.git
- slug="${slug%%.git}"
- fi
- pr="$CIRCLE_PR_NUMBER"
- commit="$CIRCLE_SHA1"
- search_in="$search_in $CIRCLE_ARTIFACTS $CIRCLE_TEST_REPORTS"
-
-elif [ "$BUDDYBUILD_BRANCH" != "" ];
-then
- say "$e==>$x buddybuild detected"
- # http://docs.buddybuild.com/v6/docs/custom-prebuild-and-postbuild-steps
- service="buddybuild"
- branch="$BUDDYBUILD_BRANCH"
- build="$BUDDYBUILD_BUILD_NUMBER"
- build_url="https://dashboard.buddybuild.com/public/apps/$BUDDYBUILD_APP_ID/build/$BUDDYBUILD_BUILD_ID"
- # BUDDYBUILD_TRIGGERED_BY
- if [ "$ddp" = "$(echo ~)/Library/Developer/Xcode/DerivedData" ];
- then
- ddp="/private/tmp/sandbox/${BUDDYBUILD_APP_ID}/bbtest"
- fi
-
-elif [ "${bamboo_planRepository_revision}" != "" ];
-then
- say "$e==>$x Bamboo detected"
- # https://confluence.atlassian.com/bamboo/bamboo-variables-289277087.html#Bamboovariables-Build-specificvariables
- service="bamboo"
- commit="${bamboo_planRepository_revision}"
- branch="${bamboo_planRepository_branch}"
- build="${bamboo_buildNumber}"
- build_url="${bamboo_buildResultsUrl}"
- remote_addr="${bamboo_planRepository_repositoryUrl}"
-
-elif [ "$CI" = "true" ] && [ "$BITRISE_IO" = "true" ];
-then
- # http://devcenter.bitrise.io/faq/available-environment-variables/
- say "$e==>$x Bitrise CI detected."
- service="bitrise"
- branch="$BITRISE_GIT_BRANCH"
- build="$BITRISE_BUILD_NUMBER"
- build_url=$(urlencode "$BITRISE_BUILD_URL")
- pr="$BITRISE_PULL_REQUEST"
- if [ "$GIT_CLONE_COMMIT_HASH" != "" ];
- then
- commit="$GIT_CLONE_COMMIT_HASH"
- fi
-
-elif [ "$CI" = "true" ] && [ "$SEMAPHORE" = "true" ];
-then
- say "$e==>$x Semaphore CI detected."
- # https://semaphoreapp.com/docs/available-environment-variables.html
- service="semaphore"
- branch="$BRANCH_NAME"
- build="$SEMAPHORE_BUILD_NUMBER"
- job="$SEMAPHORE_CURRENT_THREAD"
- pr="$PULL_REQUEST_NUMBER"
- slug="$SEMAPHORE_REPO_SLUG"
- commit="$REVISION"
- env="$env,SEMAPHORE_TRIGGER_SOURCE"
-
-elif [ "$CI" = "true" ] && [ "$BUILDKITE" = "true" ];
-then
- say "$e==>$x Buildkite CI detected."
- # https://buildkite.com/docs/guides/environment-variables
- service="buildkite"
- branch="$BUILDKITE_BRANCH"
- build="$BUILDKITE_BUILD_NUMBER"
- job="$BUILDKITE_JOB_ID"
- build_url=$(urlencode "$BUILDKITE_BUILD_URL")
- slug="$BUILDKITE_PROJECT_SLUG"
- commit="$BUILDKITE_COMMIT"
- if [[ "$BUILDKITE_PULL_REQUEST" != "false" ]]; then
- pr="$BUILDKITE_PULL_REQUEST"
- fi
- tag="$BUILDKITE_TAG"
-
-elif [ "$CI" = "drone" ] || [ "$DRONE" = "true" ];
-then
- say "$e==>$x Drone CI detected."
- # http://docs.drone.io/env.html
- # drone commits are not full shas
- service="drone.io"
- branch="$DRONE_BRANCH"
- build="$DRONE_BUILD_NUMBER"
- build_url=$(urlencode "${DRONE_BUILD_LINK}")
- pr="$DRONE_PULL_REQUEST"
- job="$DRONE_JOB_NUMBER"
- tag="$DRONE_TAG"
-
-elif [ "$HEROKU_TEST_RUN_BRANCH" != "" ];
-then
- say "$e==>$x Heroku CI detected."
- # https://devcenter.heroku.com/articles/heroku-ci#environment-variables
- service="heroku"
- branch="$HEROKU_TEST_RUN_BRANCH"
- build="$HEROKU_TEST_RUN_ID"
-
-elif [ "$CI" = "True" ] && [ "$APPVEYOR" = "True" ];
-then
- say "$e==>$x Appveyor CI detected."
- # http://www.appveyor.com/docs/environment-variables
- service="appveyor"
- branch="$APPVEYOR_REPO_BRANCH"
- build=$(urlencode "$APPVEYOR_JOB_ID")
- pr="$APPVEYOR_PULL_REQUEST_NUMBER"
- job="$APPVEYOR_ACCOUNT_NAME%2F$APPVEYOR_PROJECT_SLUG%2F$APPVEYOR_BUILD_VERSION"
- slug="$APPVEYOR_REPO_NAME"
- commit="$APPVEYOR_REPO_COMMIT"
-
-elif [ "$CI" = "true" ] && [ "$WERCKER_GIT_BRANCH" != "" ];
-then
- say "$e==>$x Wercker CI detected."
- # http://devcenter.wercker.com/articles/steps/variables.html
- service="wercker"
- branch="$WERCKER_GIT_BRANCH"
- build="$WERCKER_MAIN_PIPELINE_STARTED"
- slug="$WERCKER_GIT_OWNER/$WERCKER_GIT_REPOSITORY"
- commit="$WERCKER_GIT_COMMIT"
-
-elif [ "$CI" = "true" ] && [ "$MAGNUM" = "true" ];
-then
- say "$e==>$x Magnum CI detected."
- # https://magnum-ci.com/docs/environment
- service="magnum"
- branch="$CI_BRANCH"
- build="$CI_BUILD_NUMBER"
- commit="$CI_COMMIT"
-
-elif [ "$SHIPPABLE" = "true" ];
-then
- say "$e==>$x Shippable CI detected."
- # http://docs.shippable.com/ci_configure/
- service="shippable"
- branch=$([ "$HEAD_BRANCH" != "" ] && echo "$HEAD_BRANCH" || echo "$BRANCH")
- build="$BUILD_NUMBER"
- build_url=$(urlencode "$BUILD_URL")
- pr="$PULL_REQUEST"
- slug="$REPO_FULL_NAME"
- commit="$COMMIT"
-
-elif [ "$TDDIUM" = "true" ];
-then
- say "Solano CI detected."
- # http://docs.solanolabs.com/Setup/tddium-set-environment-variables/
- service="solano"
- commit="$TDDIUM_CURRENT_COMMIT"
- branch="$TDDIUM_CURRENT_BRANCH"
- build="$TDDIUM_TID"
- pr="$TDDIUM_PR_ID"
-
-elif [ "$GREENHOUSE" = "true" ];
-then
- say "$e==>$x Greenhouse CI detected."
- # http://docs.greenhouseci.com/docs/environment-variables-files
- service="greenhouse"
- branch="$GREENHOUSE_BRANCH"
- build="$GREENHOUSE_BUILD_NUMBER"
- build_url=$(urlencode "$GREENHOUSE_BUILD_URL")
- pr="$GREENHOUSE_PULL_REQUEST"
- commit="$GREENHOUSE_COMMIT"
- search_in="$search_in $GREENHOUSE_EXPORT_DIR"
-
-elif [ "$GITLAB_CI" != "" ];
-then
- say "$e==>$x GitLab CI detected."
- # http://doc.gitlab.com/ce/ci/variables/README.html
- service="gitlab"
- branch="${CI_BUILD_REF_NAME:-$CI_COMMIT_REF_NAME}"
- build="${CI_BUILD_ID:-$CI_JOB_ID}"
- remote_addr="${CI_BUILD_REPO:-$CI_REPOSITORY_URL}"
- commit="${CI_BUILD_REF:-$CI_COMMIT_SHA}"
-
-else
- say "${r}x>${x} No CI provider detected."
- say " Testing inside Docker? ${b}http://docs.codecov.io/docs/testing-with-docker${x}"
- say " Testing with Tox? ${b}https://docs.codecov.io/docs/python#section-testing-with-tox${x}"
-
-fi
-
-say " ${e}project root:${x} $git_root"
-
-# find branch, commit, repo from git command
-if [ "$GIT_BRANCH" != "" ];
-then
- branch="$GIT_BRANCH"
-
-elif [ "$branch" = "" ];
-then
- branch=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || hg branch 2>/dev/null || echo "")
- if [ "$branch" = "HEAD" ];
- then
- branch=""
- fi
-fi
-
-if [ "$commit_o" = "" ];
-then
- # merge commit -> actual commit
- mc=
- if [ -n "$pr" ] && [ "$pr" != false ];
- then
- mc=$(git show --no-patch --format="%P" 2>/dev/null || echo "")
- fi
- if [[ "$mc" =~ ^[a-z0-9]{40}[[:space:]][a-z0-9]{40}$ ]];
- then
- say " Fixing merge commit SHA"
- commit=$(echo "$mc" | cut -d' ' -f2)
- elif [ "$GIT_COMMIT" != "" ];
- then
- commit="$GIT_COMMIT"
- elif [ "$commit" = "" ];
- then
- commit=$(git log -1 --format="%H" 2>/dev/null || hg id -i --debug 2>/dev/null | tr -d '+' || echo "")
- fi
-else
- commit="$commit_o"
-fi
-
-if [ "$CODECOV_TOKEN" != "" ] && [ "$token" = "" ];
-then
- say "${e}-->${x} token set from env"
- token="$CODECOV_TOKEN"
-fi
-
-if [ "$CODECOV_URL" != "" ] && [ "$url_o" = "" ];
-then
- say "${e}-->${x} url set from env"
- url_o=$(echo "$CODECOV_URL" | sed -e 's/\/$//')
-fi
-
-if [ "$CODECOV_SLUG" != "" ];
-then
- say "${e}-->${x} slug set from env"
- slug_o="$CODECOV_SLUG"
-
-elif [ "$slug" = "" ];
-then
- if [ "$remote_addr" = "" ];
- then
- remote_addr=$(git config --get remote.origin.url || hg paths default || echo '')
- fi
- if [ "$remote_addr" != "" ];
- then
- if echo "$remote_addr" | grep -q "//"; then
- # https
- slug=$(echo "$remote_addr" | cut -d / -f 4,5 | sed -e 's/\.git$//')
- else
- # ssh
- slug=$(echo "$remote_addr" | cut -d : -f 2 | sed -e 's/\.git$//')
- fi
- fi
- if [ "$slug" = "/" ];
- then
- slug=""
- fi
-fi
-
-yaml=$(test -n "$codecov_yml" && echo "$codecov_yml" \
- || cd "$git_root" && \
- git ls-files "*codecov.yml" "*codecov.yaml" 2>/dev/null \
- || hg locate "*codecov.yml" "*codecov.yaml" 2>/dev/null \
- || cd $proj_root && find . -type f -name '*codecov.y*ml' -depth 1 2>/dev/null \
- || echo '')
-yaml=$(echo "$yaml" | head -1)
-
-if [ "$yaml" != "" ];
-then
- say " ${e}Yaml found at:${x} $yaml"
- config=$(parse_yaml "$git_root/$yaml" || echo '')
-
- # TODO validate the yaml here
-
- if [ "$(echo "$config" | grep 'codecov_token="')" != "" ] && [ "$token" = "" ];
- then
- say "${e}-->${x} token set from yaml"
- token="$(echo "$config" | grep 'codecov_token="' | sed -e 's/codecov_token="//' | sed -e 's/"\.*//')"
- fi
-
- if [ "$(echo "$config" | grep 'codecov_url="')" != "" ] && [ "$url_o" = "" ];
- then
- say "${e}-->${x} url set from yaml"
- url_o="$(echo "$config" | grep 'codecov_url="' | sed -e 's/codecov_url="//' | sed -e 's/"\.*//')"
- fi
-
- if [ "$(echo "$config" | grep 'codecov_slug="')" != "" ] && [ "$slug_o" = "" ];
- then
- say "${e}-->${x} slug set from yaml"
- slug_o="$(echo "$config" | grep 'codecov_slug="' | sed -e 's/codecov_slug="//' | sed -e 's/"\.*//')"
- fi
-else
- say " ${g}Yaml not found, that's ok! Learn more at${x} ${b}http://docs.codecov.io/docs/codecov-yaml${x}"
-
-fi
-
-if [ "$branch_o" != "" ];
-then
- branch=$(urlencode "$branch_o")
-else
- branch=$(urlencode "$branch")
-fi
-
-query="branch=$branch\
- &commit=$commit\
- &build=$([ "$build_o" = "" ] && echo "$build" || echo "$build_o")\
- &build_url=$build_url\
- &name=$(urlencode "$name")\
- &tag=$([ "$tag_o" = "" ] && echo "$tag" || echo "$tag_o")\
- &slug=$([ "$slug_o" = "" ] && urlencode "$slug" || urlencode "$slug_o")\
- &service=$service\
- &flags=$flags\
- &pr=$([ "$pr_o" = "" ] && echo "${pr##\#}" || echo "${pr_o##\#}")\
- &job=$job"
-
-if [ "$ft_search" = "1" ];
-then
- # detect bower comoponents location
- bower_components="bower_components"
- bower_rc=$(cd "$git_root" && cat .bowerrc 2>/dev/null || echo "")
- if [ "$bower_rc" != "" ];
- then
- bower_components=$(echo "$bower_rc" | tr -d '\n' | grep '"directory"' | cut -d'"' -f4 | sed -e 's/\/$//')
- if [ "$bower_components" = "" ];
- then
- bower_components="bower_components"
- fi
- fi
-
- # Swift Coverage
- if [ "$ft_xcodellvm" = "1" ] && [ -d "$ddp" ];
- then
- say "${e}==>${x} Processing Xcode reports via llvm-cov"
- say " DerivedData folder: $ddp"
- profdata_files=$(find "$ddp" -name '*.profdata' 2>/dev/null || echo '')
- if [ "$profdata_files" != "" ];
- then
- # xcode via profdata
- if [ "$xp" = "" ];
- then
- # xp=$(xcodebuild -showBuildSettings 2>/dev/null | grep -i "^\s*PRODUCT_NAME" | sed -e 's/.*= \(.*\)/\1/')
- # say " ${e}->${x} Speed up Xcode processing by adding ${e}-J '$xp'${x}"
- say " ${g}hint${x} Speed up Swift processing by using use ${g}-J 'AppName'${x} (regexp accepted)"
- say " ${g}hint${x} This will remove Pods/ from your report. Also ${b}https://docs.codecov.io/docs/ignoring-paths${x}"
- fi
- while read -r profdata;
- do
- if [ "$profdata" != "" ];
- then
- swiftcov "$profdata" "$xp"
- fi
- done <<< "$profdata_files"
- else
- say " ${e}->${x} No Swift coverage found"
- fi
-
- # Obj-C Gcov Coverage
- if [ "$ft_gcov" = "1" ];
- then
- say " ${e}->${x} Running $gcov_exe for Obj-C"
- bash -c "find $ddp -type f -name '*.gcda' $gcov_include $gcov_ignore -exec $gcov_exe -p $gcov_arg {} +" || true
- fi
- fi
-
- if [ "$ft_xcodeplist" = "1" ] && [ -d "$ddp" ];
- then
- say "${e}==>${x} Processing Xcode plists"
- plists_files=$(find "$ddp" -name '*.xccoverage' 2>/dev/null || echo '')
- if [ "$plists_files" != "" ];
- then
- while read -r plist;
- do
- if [ "$plist" != "" ];
- then
- say " ${g}Found${x} plist file at $plist"
- plutil -convert xml1 -o "$(basename "$plist").plist" -- $plist
- fi
- done <<< "$plists_files"
- fi
- fi
-
- # Gcov Coverage
- if [ "$ft_gcov" = "1" ];
- then
- say "${e}==>${x} Running gcov in $proj_root ${e}(disable via -X gcov)${x}"
- bash -c "find $proj_root -type f -name '*.gcno' $gcov_include $gcov_ignore -exec $gcov_exe -pb $gcov_arg {} +" || true
- else
- say "${e}==>${x} gcov disabled"
- fi
-
- # Python Coverage
- if [ "$ft_coveragepy" = "1" ];
- then
- if [ ! -f coverage.xml ];
- then
- if which coverage >/dev/null 2>&1;
- then
- say "${e}==>${x} Python coveragepy exists ${e}disable via -X coveragepy${x}"
-
- dotcoverage=$(find "$git_root" -name '.coverage' -or -name '.coverage.*' | head -1 || echo '')
- if [ "$dotcoverage" != "" ];
- then
- cd "$(dirname "$dotcoverage")"
- if [ ! -f .coverage ];
- then
- say " ${e}->${x} Running coverage combine"
- coverage combine -a
- fi
- say " ${e}->${x} Running coverage xml"
- if [ "$(coverage xml -i)" != "No data to report." ];
- then
- files="$files
-$PWD/coverage.xml"
- else
- say " ${r}No data to report.${x}"
- fi
- cd "$proj_root"
- else
- say " ${r}No .coverage file found.${x}"
- fi
- else
- say "${e}==>${x} Python coveragepy not found"
- fi
- fi
- else
- say "${e}==>${x} Python coveragepy disabled"
- fi
-
- if [ "$search_in_o" != "" ];
- then
- # location override
- search_in="$search_in_o"
- fi
-
- say "$e==>$x Searching for coverage reports in:"
- for _path in $search_in
- do
- say " ${g}+${x} $_path"
- done
-
- patterns="find $search_in \( \
- -name vendor \
- -or -name htmlcov \
- -or -name virtualenv \
- -or -name js/generated/coverage \
- -or -name .virtualenv \
- -or -name virtualenvs \
- -or -name .virtualenvs \
- -or -name .env \
- -or -name .envs \
- -or -name env \
- -or -name .yarn-cache \
- -or -name envs \
- -or -name .venv \
- -or -name .venvs \
- -or -name venv \
- -or -name venvs \
- -or -name .git \
- -or -name .hg \
- -or -name .tox \
- -or -name __pycache__ \
- -or -name '.egg-info*' \
- -or -name '$bower_components' \
- -or -name node_modules \
- -or -name 'conftest_*.c.gcov' \
- \) -prune -or \
- -type f \( -name '*coverage*.*' \
- -or -name 'nosetests.xml' \
- -or -name 'jacoco*.xml' \
- -or -name 'clover.xml' \
- -or -name 'report.xml' \
- -or -name '*.codecov.*' \
- -or -name 'codecov.*' \
- -or -name 'cobertura.xml' \
- -or -name 'excoveralls.json' \
- -or -name 'luacov.report.out' \
- -or -name 'coverage-final.json' \
- -or -name 'naxsi.info' \
- -or -name 'lcov.info' \
- -or -name 'lcov.dat' \
- -or -name '*.lcov' \
- -or -name '*.clover' \
- -or -name 'cover.out' \
- -or -name 'gcov.info' \
- -or -name '*.gcov' \
- -or -name '*.lst' \
- $include_cov \) \
- $exclude_cov \
- -not -name '*.profdata' \
- -not -name 'coverage-summary.json' \
- -not -name 'phpunit-code-coverage.xml' \
- -not -name '*/classycle/report.xml' \
- -not -name 'remapInstanbul.coverage*.json' \
- -not -name 'phpunit-coverage.xml' \
- -not -name '*codecov.yml' \
- -not -name '*.serialized' \
- -not -name '.coverage*' \
- -not -name '.*coveragerc' \
- -not -name '*.sh' \
- -not -name '*.bat' \
- -not -name '*.ps1' \
- -not -name '*.env' \
- -not -name '*.cmake' \
- -not -name '*.dox' \
- -not -name '*.ec' \
- -not -name '*.rst' \
- -not -name '*.h' \
- -not -name '*.scss' \
- -not -name '*.o' \
- -not -name '*.proto' \
- -not -name '*.sbt' \
- -not -name '*.xcoverage.*' \
- -not -name '*.gz' \
- -not -name '*.conf' \
- -not -name '*.p12' \
- -not -name '*.csv' \
- -not -name '*.rsp' \
- -not -name '*.m4' \
- -not -name '*.pem' \
- -not -name '*~' \
- -not -name '*.exe' \
- -not -name '*.am' \
- -not -name '*.template' \
- -not -name '*.cp' \
- -not -name '*.bw' \
- -not -name '*.crt' \
- -not -name '*.log' \
- -not -name '*.cmake' \
- -not -name '*.pth' \
- -not -name '*.in' \
- -not -name '*.jar*' \
- -not -name '*.pom*' \
- -not -name '*.png' \
- -not -name '*.jpg' \
- -not -name '*.sql' \
- -not -name '*.jpeg' \
- -not -name '*.svg' \
- -not -name '*.gif' \
- -not -name '*.csv' \
- -not -name '*.snapshot' \
- -not -name '*.mak*' \
- -not -name '*.bash' \
- -not -name '*.data' \
- -not -name '*.py' \
- -not -name '*.class' \
- -not -name '*.xcconfig' \
- -not -name '*.ec' \
- -not -name '*.coverage' \
- -not -name '*.pyc' \
- -not -name '*.cfg' \
- -not -name '*.egg' \
- -not -name '*.ru' \
- -not -name '*.css' \
- -not -name '*.less' \
- -not -name '*.pyo' \
- -not -name '*.whl' \
- -not -name '*.html' \
- -not -name '*.ftl' \
- -not -name '*.erb' \
- -not -name '*.rb' \
- -not -name '*.js' \
- -not -name '*.jade' \
- -not -name '*.db' \
- -not -name '*.md' \
- -not -name '*.cpp' \
- -not -name '*.gradle' \
- -not -name '*.tar.tz' \
- -not -name '*.scss' \
- -not -name 'include.lst' \
- -not -name 'fullLocaleNames.lst' \
- -not -name 'inputFiles.lst' \
- -not -name 'createdFiles.lst' \
- -not -name 'scoverage.measurements.*' \
- -not -name 'test_*_coverage.txt' \
- -not -name 'testrunner-coverage*' \
- -print 2>/dev/null"
- files=$(eval "$patterns" || echo '')
-
-elif [ "$include_cov" != "" ];
-then
- files=$(eval "find $search_in -type f \( ${include_cov:5} \)$exclude_cov 2>/dev/null" || echo '')
-fi
-
-num_of_files=$(echo "$files" | wc -l | tr -d ' ')
-if [ "$num_of_files" != '' ] && [ "$files" != '' ];
-then
- say " ${e}->${x} Found $num_of_files reports"
-fi
-
-# no files found
-if [ "$files" = "" ];
-then
- say "${r}-->${x} No coverage report found."
- say " Please visit ${b}http://docs.codecov.io/docs/supported-languages${x}"
- exit ${exit_with};
-fi
-
-if [ "$ft_network" == "1" ];
-then
- say "${e}==>${x} Detecting git/mercurial file structure"
- network=$(cd "$git_root" && git ls-files 2>/dev/null || hg locate 2>/dev/null || echo "")
- if [ "$network" = "" ];
- then
- network=$(find "$git_root" \( \
- -name virtualenv \
- -name .virtualenv \
- -name virtualenvs \
- -name .virtualenvs \
- -name '*.png' \
- -name '*.gif' \
- -name '*.jpg' \
- -name '*.jpeg' \
- -name '*.md' \
- -name .env \
- -name .envs \
- -name env \
- -name envs \
- -name .venv \
- -name .venvs \
- -name venv \
- -name venvs \
- -name .git \
- -name .egg-info \
- -name shunit2-2.1.6 \
- -name vendor \
- -name __pycache__ \
- -name node_modules \
- -path '*/$bower_components/*' \
- -path '*/target/delombok/*' \
- -path '*/build/lib/*' \
- -path '*/js/generated/coverage/*' \
- \) -prune -or \
- -type f -print 2>/dev/null || echo '')
- fi
-
- if [ "$prefix_o" != "" ];
- then
- network=$(echo "$network" | awk "{print \"$prefix_o/\"\$0}")
- fi
-fi
-
-upload_file=`mktemp /tmp/codecov.XXXXXX`
-adjustments_file=`mktemp /tmp/codecov.adjustments.XXXXXX`
-
-cleanup() {
- rm -f $upload_file $adjustments_file $upload_file.gz
-}
-
-trap cleanup INT ABRT TERM
-
-if [ "$env" != "" ];
-then
- inc_env=""
- say "${e}==>${x} Appending build variables"
- for varname in $(echo "$env" | tr ',' ' ')
- do
- if [ "$varname" != "" ];
- then
- say " ${g}+${x} $varname"
- inc_env="${inc_env}${varname}=$(eval echo "\$${varname}")
-"
- fi
- done
-
-echo "$inc_env<<<<<< ENV" >> $upload_file
-fi
-
-# Append git file list
-# write discovered yaml location
-echo "$yaml" >> $upload_file
-if [ "$ft_network" == "1" ];
-then
- i="woff|eot|otf" # fonts
- i="$i|gif|png|jpg|jpeg|psd" # images
- i="$i|ptt|pptx|numbers|pages|md|txt|xlsx|docx|doc|pdf|html|csv" # docs
- i="$i|yml|yaml|.gitignore" # supporting docs
- echo "$network" | grep -vwE "($i)$" >> $upload_file
-fi
-echo "<<<<<< network" >> $upload_file
-
-fr=0
-say "${e}==>${x} Reading reports"
-while IFS='' read -r file;
-do
- # read the coverage file
- if [ "$(echo "$file" | tr -d ' ')" != '' ];
- then
- if [ -f "$file" ];
- then
- report_len=$(wc -c < "$file")
- if [ "$report_len" -ne 0 ];
- then
- say " ${g}+${x} $file ${e}bytes=$(echo "$report_len" | tr -d ' ')${x}"
- # append to to upload
- _filename=$(basename "$file")
- if [ "${_filename##*.}" = 'gcov' ];
- then
- echo "# path=$(echo "$file.reduced" | sed "s|^$git_root/||")" >> $upload_file
- # get file name
- head -1 $file >> $upload_file
- # 1. remove source code
- # 2. remove ending bracket lines
- # 3. remove whitespace
- # 4. remove contextual lines
- # 5. remove function names
- awk -F': *' '{print $1":"$2":"}' $file \
- | sed '\/: *} *$/d' \
- | sed 's/^ *//' \
- | sed '/^-/d' \
- | sed 's/^function.*/func/' >> $upload_file
- else
- echo "# path=$(echo "$file" | sed "s|^$git_root/||")" >> $upload_file
- cat "$file" >> $upload_file
- fi
- echo "<<<<<< EOF" >> $upload_file
- fr=1
- if [ "$clean" = "1" ];
- then
- rm "$file"
- fi
- else
- say " ${r}-${x} Skipping empty file $file"
- fi
- else
- say " ${r}-${x} file not found at $file"
- fi
- fi
-done <<< "$(echo -e "$files")"
-
-if [ "$fr" = "0" ];
-then
- say "${r}-->${x} No coverage data found."
- say " Please visit ${b}http://docs.codecov.io/docs/supported-languages${x}"
- say " search for your projects language to learn how to collect reports."
- exit ${exit_with};
-fi
-
-if [ "$ft_fix" = "1" ];
-then
- say "${e}==>${x} Appending adjustments"
- say " ${b}http://docs.codecov.io/docs/fixing-reports${x}"
-
- empty_line='^[[:space:]]*$'
- # //
- syntax_comment='^[[:space:]]*//.*'
- # /* or */
- syntax_comment_block='^[[:space:]]*(\/\*|\*\/)[[:space:]]*$'
- # { or }
- syntax_bracket='^[[:space:]]*[\{\}][[:space:]]*(//.*)?$'
- # [ or ]
- syntax_list='^[[:space:]]*[][][[:space:]]*(//.*)?$'
-
- skip_dirs="-not -path '*/$bower_components/*' \
- -not -path '*/node_modules/*'"
-
- cut_and_join() {
- awk 'BEGIN { FS=":" }
- $3 ~ /\/\*/ || $3 ~ /\*\// { print $0 ; next }
- $1!=key { if (key!="") print out ; key=$1 ; out=$1":"$2 ; next }
- { out=out","$2 }
- END { print out }' 2>/dev/null
- }
-
- if echo "$network" | grep -m1 '.kt$' 1>/dev/null;
- then
- # skip brackets and comments
- find "$git_root" -type f \
- -name '*.kt' \
- -exec \
- grep -nIHE -e $syntax_bracket \
- -e $syntax_comment_block {} \; \
- | cut_and_join \
- >> $adjustments_file \
- || echo ''
-
- # last line in file
- find "$git_root" -type f \
- -name '*.kt' -exec \
- wc -l {} \; \
- | while read l; do echo "EOF: $l"; done \
- 2>/dev/null \
- >> $adjustments_file \
- || echo ''
-
- fi
-
- if echo "$network" | grep -m1 '.go$' 1>/dev/null;
- then
- # skip empty lines, comments, and brackets
- find "$git_root" -not -path '*/vendor/*' \
- -type f \
- -name '*.go' \
- -exec \
- grep -nIHE \
- -e $empty_line \
- -e $syntax_comment \
- -e $syntax_comment_block \
- -e $syntax_bracket \
- {} \; \
- | cut_and_join \
- >> $adjustments_file \
- || echo ''
- fi
-
- if echo "$network" | grep -m1 '.dart$' 1>/dev/null;
- then
- # skip brackets
- find "$git_root" -type f \
- -name '*.dart' \
- -exec \
- grep -nIHE \
- -e $syntax_bracket \
- {} \; \
- | cut_and_join \
- >> $adjustments_file \
- || echo ''
- fi
-
- if echo "$network" | grep -m1 '.php$' 1>/dev/null;
- then
- # skip empty lines, comments, and brackets
- find "$git_root" -not -path "*/vendor/*" \
- -type f \
- -name '*.php' \
- -exec \
- grep -nIHE \
- -e $syntax_list \
- -e $syntax_bracket \
- -e '^[[:space:]]*\);[[:space:]]*(//.*)?$' \
- {} \; \
- | cut_and_join \
- >> $adjustments_file \
- || echo ''
- fi
-
- if echo "$network" | grep -m1 '\(.cpp\|.h\|.cxx\|.c\|.hpp\|.m\)$' 1>/dev/null;
- then
- # skip brackets
- find "$git_root" -type f \
- $skip_dirs \
- \( \
- -name '*.h' \
- -or -name '*.cpp' \
- -or -name '*.cxx' \
- -or -name '*.m' \
- -or -name '*.c' \
- -or -name '*.hpp' \
- \) -exec \
- grep -nIHE \
- -e $empty_line \
- -e $syntax_bracket \
- -e '// LCOV_EXCL' \
- {} \; \
- | cut_and_join \
- >> $adjustments_file \
- || echo ''
-
- # skip brackets
- find "$git_root" -type f \
- $skip_dirs \
- \( \
- -name '*.h' \
- -or -name '*.cpp' \
- -or -name '*.cxx' \
- -or -name '*.m' \
- -or -name '*.c' \
- -or -name '*.hpp' \
- \) -exec \
- grep -nIH '// LCOV_EXCL' \
- {} \; \
- >> $adjustments_file \
- || echo ''
-
- fi
-
- found=$(cat $adjustments_file | tr -d ' ')
-
- if [ "$found" != "" ];
- then
- say " ${g}+${x} Found adjustments"
- echo "# path=fixes" >> $upload_file
- cat $adjustments_file >> $upload_file
- echo "<<<<<< EOF" >> $upload_file
- rm -rf $adjustments_file
- else
- say " ${e}->${x} No adjustments found"
- fi
-fi
-
-if [ "$url_o" != "" ];
-then
- url="$url_o"
-fi
-
-if [ "$dump" != "0" ];
-then
- # trim whitespace from query
- say " ${e}->${x} Dumping upload file (no upload)"
- echo "$url/upload/v4?$(echo "package=bash-$VERSION&token=$token&$query" | tr -d ' ')"
- cat $upload_file
-else
-
- say "${e}==>${x} Gzipping contents"
- gzip -nf9 $upload_file
-
- query=$(echo "${query}" | tr -d ' ')
- say "${e}==>${x} Uploading reports"
- say " ${e}url:${x} $url"
- say " ${e}query:${x} $query"
-
- # now add token to query
- query=$(echo "package=bash-$VERSION&token=$token&$query" | tr -d ' ')
-
- if [ "$ft_s3" = "1" ];
- then
- i="0"
- while [ $i -lt 4 ]
- do
- i=$[$i+1]
- say " ${e}->${x} Pinging Codecov"
- res=$(curl $curl_s -X POST $curlargs $cacert \
- -H 'X-Reduced-Redundancy: false' \
- -H 'X-Content-Type: application/x-gzip' \
- "$url/upload/v4?$query" || true)
-      # a good reply is "https://codecov.io" + "\n" + "https://codecov.s3.amazonaws.com/..."
- status=$(echo "$res" | head -1 | grep 'HTTP ' | cut -d' ' -f2)
- if [ "$status" = "" ];
- then
- s3target=$(echo "$res" | sed -n 2p)
- say " ${e}->${x} Uploading"
- s3=$(curl $curl_s -fiX PUT $curlawsargs \
- --data-binary @$upload_file.gz \
- -H 'Content-Type: application/x-gzip' \
- -H 'Content-Encoding: gzip' \
- -H 'x-amz-acl: public-read' \
- "$s3target" || true)
- if [ "$s3" != "" ];
- then
- say " ${g}->${x} View reports at ${b}$(echo "$res" | sed -n 1p)${x}"
- exit 0
- else
- say " ${r}X>${x} Failed to upload"
- fi
- elif [ "$status" = "400" ];
- then
- # 400 Error
- say "${g}${res}${x}"
- exit ${exit_with}
- fi
- say " ${e}->${x} Sleeping for 30s and trying again..."
- sleep 30
- done
- fi
-
- say " ${e}->${x} Uploading to Codecov"
- i="0"
- while [ $i -lt 4 ]
- do
- i=$[$i+1]
-
- res=$(curl $curl_s -X POST $curlargs $cacert \
- --data-binary @$upload_file.gz \
- -H 'Content-Type: text/plain' \
- -H 'Content-Encoding: gzip' \
- -H 'X-Content-Encoding: gzip' \
- -H 'Accept: text/plain' \
- "$url/upload/v2?$query" || echo 'HTTP 500')
- # HTTP 200
- # http://....
- status=$(echo "$res" | head -1 | cut -d' ' -f2)
- if [ "$status" = "" ];
- then
- say " View reports at ${b}$(echo "$res" | head -2 | tail -1)${x}"
- exit 0
-
- elif [ "${status:0:1}" = "5" ];
- then
- say " ${e}->${x} Sleeping for 30s and trying again..."
- sleep 30
-
- else
- say " ${g}${res}${x}"
- exit 0
- exit ${exit_with}
- fi
-
- done
-
- say " ${r}X> Failed to upload coverage reports${x}"
-fi
-
-exit ${exit_with}
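
Aside (this sketch is not part of the diff): the cut_and_join awk helper in the deleted codecov script above is fairly dense. A rough Python rendering of its main behaviour -- collapsing "path:line:match" grep output into one "path:l1,l2,..." entry per file for the report "fixes" section, and ignoring the script's special handling of lines containing /* or */ -- could look like this:

from collections import OrderedDict

def cut_and_join(grep_lines):
    # Group the matched line numbers by file path, preserving first-seen order.
    grouped = OrderedDict()
    for line in grep_lines:
        path, lineno = line.split(':', 2)[:2]
        grouped.setdefault(path, []).append(lineno)
    return ['{}:{}'.format(path, ','.join(nums)) for path, nums in grouped.items()]

print(cut_and_join(['A.kt:3: {', 'A.kt:7: }', 'B.kt:12: {']))
# ['A.kt:3,7', 'B.kt:12']
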
diff --git a/ci/jenkins/Jenkinsfile_py3-master_cpu_unittest b/ci/jenkins/Jenkinsfile_py3-master_cpu_unittest
deleted file mode 100644
index fb87760de6..0000000000
--- a/ci/jenkins/Jenkinsfile_py3-master_cpu_unittest
+++ /dev/null
@@ -1,69 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Sanity', [
- build_steps.sanity_lint('gluon-nlp-cpu-py3-master', 'cpu/py3-master', 'src/gluonnlp')
- ])
-
- utils.parallel_stage('Tests', [
- build_steps.test_unittest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'tests/unittest', 'src/gluonnlp',
- 'not (gpu or serial or skip_master)',
- 4, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'tests/unittest', 'src/gluonnlp',
- 'not (gpu or skip_master) and serial',
- 0, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'not (gpu or serial or integration or skip_master)',
- 4, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- '(not (gpu or integration or skip_master)) and serial',
- 0, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'not (gpu or serial or skip_master) and integration',
- 4, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'not (gpu or skip_master) and serial and integration',
- 0, false, false)
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/Jenkinsfile_py3-master_gpu_doc b/ci/jenkins/Jenkinsfile_py3-master_gpu_doc
deleted file mode 100644
index 82d6cc5fee..0000000000
--- a/ci/jenkins/Jenkinsfile_py3-master_gpu_doc
+++ /dev/null
@@ -1,168 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Doc Test', [
- build_steps.test_doctest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'src/gluonnlp', 'src/gluonnlp', 4)
- ])
-
- // Compile example notebooks, Doctest & Create Website
- node { // Single node parallelism
- ws('gluon-nlp-cpu-py3-master') {
- stage("Prepare conda environment for website") {
- utils.init_git()
- // Require a full environment here due to sphinx build step
- // after compiling and downloading the notebooks
- sh 'source ci/prepare_clean_env.sh cpu/py3-master'
- }
-
- stage("Create Website") {
- def tests = [:]
- for (f in findFiles(glob: '**/docs/examples/*/*.md')) {
- def md_file = f.toString() // Convert FileWrapper to String
- def short_name = md_file["docs/examples/".length()..-1]
- tests[short_name] = { ->
- def base_name = md_file[0..-4] + ''
- def ipynb_file = base_name + '.ipynb'
- def stdout_file = base_name + '.stdout.log'
- def stderr_file = base_name + '.stderr.log'
- stage(short_name) { // remove common path from name
- // Submit AWS Batch jobs for each example notebook
- // The converted notebooks and the conversion logs are
- // saved to S3 and retrieved on the CI server once the jobs
- // finished.
-
- if (env.BRANCH_NAME.startsWith('PR-')){
- sh """
- set +e
- conda activate ./conda/cpu/py3-master
-
- python3 ci/batch/submit-job.py --region us-east-1 --wait \
- --timeout 1800 --saved-output ./docs/examples --conda-env docker/py3 \
- --name GluonNLP-${env.BRANCH_NAME}-${env.BUILD_NUMBER} \
- --save-path batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/docs/examples \
- --work-dir . --source-ref refs/pull/${env.CHANGE_ID}/head \
- --command \"(python3 docs/md2ipynb.py ${md_file} | tee ${stdout_file}) 3>&1 1>&2 2>&3 | tee ${stderr_file} \"
- BATCH_EXIT_CODE=\$?
-
- aws s3api wait object-exists --bucket gluon-nlp-staging \
- --key batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stderr_file}
- aws s3 cp s3://gluon-nlp-staging/batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stderr_file} ${stderr_file}
- cat ${stderr_file}
-
- aws s3api wait object-exists --bucket gluon-nlp-staging \
- --key batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stdout_file}
- aws s3 cp s3://gluon-nlp-staging/batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stdout_file} ${stdout_file}
- cat ${stdout_file}
-
- if [ \$BATCH_EXIT_CODE -ne 0 ]; then
- echo AWS Batch Task Failed
- else
- aws s3api wait object-exists --bucket gluon-nlp-staging \
- --key batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${ipynb_file}
- aws s3 cp s3://gluon-nlp-staging/batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${ipynb_file} ${ipynb_file}
- fi
-
- exit \$BATCH_EXIT_CODE
- """
- } else {
- sh """
- set +e
- conda activate ./conda/cpu/py3-master
-
- python3 ci/batch/submit-job.py --region us-east-1 --wait \
- --timeout 1800 --saved-output ./docs/examples --conda-env docker/py3 \
- --name GluonNLP-${env.BRANCH_NAME}-${env.BUILD_NUMBER} \
- --save-path batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/docs/examples \
- --work-dir . --source-ref ${env.BRANCH_NAME} \
- --command \"(python3 docs/md2ipynb.py ${md_file} | tee ${stdout_file}) 3>&1 1>&2 2>&3 | tee ${stderr_file} \"
- BATCH_EXIT_CODE=\$?
-
- aws s3api wait object-exists --bucket gluon-nlp-staging \
- --key batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stderr_file}
- aws s3 cp s3://gluon-nlp-staging/batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stderr_file} ${stderr_file}
- cat ${stderr_file}
-
- aws s3api wait object-exists --bucket gluon-nlp-staging \
- --key batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stdout_file}
- aws s3 cp s3://gluon-nlp-staging/batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stdout_file} ${stdout_file}
- cat ${stdout_file}
-
- if [ \$BATCH_EXIT_CODE -ne 0 ]; then
- echo AWS Batch Task Failed
- else
- aws s3api wait object-exists --bucket gluon-nlp-staging \
- --key batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${ipynb_file}
- aws s3 cp s3://gluon-nlp-staging/batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${ipynb_file} ${ipynb_file}
- fi
-
- exit \$BATCH_EXIT_CODE
- """
- }
- }
- }
- }
-
- parallel tests
- }
-
- stage("Upload Website") {
- if (env.BRANCH_NAME.startsWith('PR-')){
- bucket = 'gluon-nlp-staging'
- path = env.BRANCH_NAME+'/'+env.BUILD_NUMBER
- } else {
- bucket = 'gluon-nlp'
- path = env.BRANCH_NAME
- }
- sh """
- conda activate ./conda/cpu/py3-master
- make docs
- ci/upload_doc.sh ${bucket} ${path}
- """
- }
- }
- }
-
- utils.parallel_stage('Documentation', [
- build_steps.website_linkcheck('gluon-nlp-cpu-py3-master', 'cpu/py3-master')
- ])
-
- utils.parallel_stage('Deploy', [
- build_steps.post_website_link()
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/Jenkinsfile_py3-master_gpu_integration b/ci/jenkins/Jenkinsfile_py3-master_gpu_integration
deleted file mode 100644
index 31002e4bdd..0000000000
--- a/ci/jenkins/Jenkinsfile_py3-master_gpu_integration
+++ /dev/null
@@ -1,53 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Sanity', [
- build_steps.sanity_lint('gluon-nlp-gpu-py3-master', 'gpu/py3-master', 'scripts')
- ])
-
- utils.parallel_stage('Scripts', [
- build_steps.test_unittest('gluon-nlp-gpu-py3-master', 'gpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and (not (serial or skip_master)) and integration',
- 4, true, true),
- build_steps.test_unittest('gluon-nlp-gpu-py3-master', 'gpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and serial and integration and (not skip_master)',
- 0, true, true)
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/Jenkinsfile_py3-master_gpu_unittest b/ci/jenkins/Jenkinsfile_py3-master_gpu_unittest
deleted file mode 100644
index 6275e40d58..0000000000
--- a/ci/jenkins/Jenkinsfile_py3-master_gpu_unittest
+++ /dev/null
@@ -1,61 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Sanity', [
- build_steps.sanity_lint('gluon-nlp-gpu-py3-master', 'gpu/py3-master', 'src/gluonnlp')
- ])
-
- utils.parallel_stage('Tests', [
- build_steps.test_unittest('gluon-nlp-gpu-py3-master', 'gpu/py3-master',
- 'tests/unittest', 'src/gluonnlp',
- 'gpu and (not (serial or skip_master))',
- 4, true, false),
- build_steps.test_unittest('gluon-nlp-gpu-py3-master', 'gpu/py3-master',
- 'tests/unittest', 'src/gluonnlp',
- 'gpu and serial and not skip_master',
- 0, true, false),
- build_steps.test_unittest('gluon-nlp-gpu-py3-master', 'gpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and (not (serial or skip_master or integration))',
- 4, true, false),
- build_steps.test_unittest('gluon-nlp-gpu-py3-master', 'gpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and serial and not (skip_master or integration)',
- 0, true, false)
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/Jenkinsfile_py3_cpu_unittest b/ci/jenkins/Jenkinsfile_py3_cpu_unittest
deleted file mode 100644
index 6d518fdbfd..0000000000
--- a/ci/jenkins/Jenkinsfile_py3_cpu_unittest
+++ /dev/null
@@ -1,69 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Sanity', [
- build_steps.sanity_lint('gluon-nlp-cpu-py3', 'cpu/py3', 'src/gluonnlp')
- ])
-
- utils.parallel_stage('Tests', [
- build_steps.test_unittest('gluon-nlp-cpu-py3', 'cpu/py3',
- 'tests/unittest', 'src/gluonnlp',
- 'not (gpu or serial)',
- 4, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3', 'cpu/py3',
- 'tests/unittest', 'src/gluonnlp',
- '(not gpu) and serial',
- 0, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3', 'cpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- 'not (gpu or serial or integration)',
- 4, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3', 'cpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- '(not (gpu or integration)) and serial',
- 0, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3', 'cpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- 'not (gpu or serial) and integration',
- 4, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3', 'cpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- '(not gpu) and serial and integration',
- 0, false, false)
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/Jenkinsfile_py3_gpu_integration b/ci/jenkins/Jenkinsfile_py3_gpu_integration
deleted file mode 100644
index e683f5f14d..0000000000
--- a/ci/jenkins/Jenkinsfile_py3_gpu_integration
+++ /dev/null
@@ -1,53 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Sanity', [
- build_steps.sanity_lint('gluon-nlp-gpu-py3', 'gpu/py3', 'scripts')
- ])
-
- utils.parallel_stage('Scripts', [
- build_steps.test_unittest('gluon-nlp-gpu-py3', 'gpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and (not serial) and integration',
- 4, true, true),
- build_steps.test_unittest('gluon-nlp-gpu-py3', 'gpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and serial and integration',
- 0, true, true)
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/Jenkinsfile_py3_gpu_unittest b/ci/jenkins/Jenkinsfile_py3_gpu_unittest
deleted file mode 100644
index 8430ca0a36..0000000000
--- a/ci/jenkins/Jenkinsfile_py3_gpu_unittest
+++ /dev/null
@@ -1,61 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Sanity', [
- build_steps.sanity_lint('gluon-nlp-gpu-py3', 'gpu/py3', 'src/gluonnlp')
- ])
-
- utils.parallel_stage('Tests', [
- build_steps.test_unittest('gluon-nlp-gpu-py3', 'gpu/py3',
- 'tests/unittest', 'src/gluonnlp',
- 'gpu and not serial',
- 4, true, false),
- build_steps.test_unittest('gluon-nlp-gpu-py3', 'gpu/py3',
- 'tests/unittest', 'src/gluonnlp',
- 'gpu and serial',
- 0, true, false),
- build_steps.test_unittest('gluon-nlp-gpu-py3', 'gpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and not (serial or integration)',
- 4, true, false),
- build_steps.test_unittest('gluon-nlp-gpu-py3', 'gpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and serial and not integration',
- 0, true, false)
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/build_steps.groovy b/ci/jenkins/build_steps.groovy
deleted file mode 100644
index 63bd59e81d..0000000000
--- a/ci/jenkins/build_steps.groovy
+++ /dev/null
@@ -1,127 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// This file contains the steps that will be used in the
-// Jenkins pipelines
-
-utils = load('ci/jenkins/utils.groovy')
-
-def sanity_lint(workspace_name, conda_env_name, path) {
- return ['Lint': {
- node {
- ws(workspace_name) {
- timeout(time: max_time, unit: 'MINUTES') {
- utils.init_git()
- sh """
- set -ex
- source ci/prepare_clean_env.sh ${conda_env_name}
- make lintdir=${path} lint
- set +ex
- """
- }
- }
- }
- }]
-}
-
-def test_unittest(workspace_name, conda_env_name,
- test_path, cov_path,
- mark,
- threads, gpu, skip_report) {
- capture_flag = env.BRANCH_NAME.startsWith('PR-')?'':'--capture=no'
- node_type = gpu?NODE_LINUX_GPU:NODE_LINUX_CPU
- return ["${conda_env_name}: ${test_path} -m '${mark}'": {
- node(node_type) {
- ws(workspace_name) {
- timeout(time: max_time, unit: 'MINUTES') {
- utils.init_git()
- sh """
- set -ex
- source ci/prepare_clean_env.sh ${conda_env_name}
- pytest -v ${capture_flag} -n ${threads} -m '${mark}' --durations=30 --cov ${cov_path} --cov-report=term --cov-report xml ${test_path}
- set +ex
- """
- if (!skip_report) utils.publish_test_coverage('GluonNLPCodeCov')
- }
- }
- }
- }]
-}
-
-def test_doctest(workspace_name, conda_env_name,
- test_path, cov_path, threads) {
- capture_flag = env.BRANCH_NAME.startsWith('PR-')?'':'--capture=no'
- return ["${conda_env_name}: doctest ${test_path}": {
- node(NODE_LINUX_CPU) {
- ws(workspace_name) {
- timeout(time: max_time, unit: 'MINUTES') {
- utils.init_git()
- sh """
- set -ex
- source ci/prepare_clean_env.sh ${conda_env_name}
- pytest -v ${capture_flag} -n ${threads} --durations=30 --cov ${cov_path} --cov-report=term --cov-report xml --doctest-modules ${test_path}
- set +ex
- """
- utils.publish_test_coverage('GluonNLPCodeCov')
- }
- }
- }
- }]
-}
-
-def website_linkcheck(workspace_name, conda_env_name) {
- return ["${conda_env_name}: website link check": {
- node(NODE_LINUX_CPU) {
- ws(workspace_name) {
- timeout(time: max_time, unit: 'MINUTES') {
- utils.init_git()
- sh """
- set -ex
- source ci/prepare_clean_env.sh ${conda_env_name}
- make distribute
- set +ex
- """
- linkcheck_errors = sh returnStdout: true, script: """
- conda activate ./conda/${conda_env_name}
- """
- linkcheck_errors = linkcheck_errors.split('\n').findAll {it ==~ '/^(line *[0-9]*) broken.*$/'}
- linkcheck_errors = linkcheck_errors.join('\n')
- linkcheck_errors = linkcheck_errors.trim()
- if (linkcheck_errors && env.BRANCH_NAME.startsWith("PR-")) {
- pullRequest.comment("Found link check problems in job ${env.BRANCH_NAME}/${env.BUILD_NUMBER}:\n"+linkcheck_errors)
- }
- }
- }
- }
- }]
-}
-
-def post_website_link() {
- return ["Deploy: ": {
- node {
- timeout(time: max_time, unit: 'MINUTES') {
- if (env.BRANCH_NAME.startsWith("PR-")) {
- pullRequest.comment("Job ${env.BRANCH_NAME}/${env.BUILD_NUMBER} is complete. \nDocs are uploaded to http://gluon-nlp-staging.s3-accelerate.dualstack.amazonaws.com/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/index.html")
- }
- }
- }
- }]
-}
-
-return this
diff --git a/ci/jenkins/utils.groovy b/ci/jenkins/utils.groovy
deleted file mode 100644
index ddbde419d5..0000000000
--- a/ci/jenkins/utils.groovy
+++ /dev/null
@@ -1,214 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// initialize source codes
-def init_git() {
- deleteDir()
- retry(5) {
- try {
-      // Make sure to wait long enough for the api.github.com request quota. Important: Don't increase the number of
-      // retries, as this will increase the number of requests and worsen the throttling
- timeout(time: 15, unit: 'MINUTES') {
- checkout scm
- sh 'git clean -xdff'
- sh 'git reset --hard'
- sh 'git submodule update --init --recursive'
- sh 'git submodule foreach --recursive git clean -ffxd'
- sh 'git submodule foreach --recursive git reset --hard'
- }
- } catch (exc) {
- deleteDir()
- error "Failed to fetch source codes with ${exc}"
- sleep 2
- }
- }
-}
-
-
-def get_git_commit_hash() {
- lastCommitMessage = sh (script: "git log -1 --pretty=%B", returnStdout: true)
- lastCommitMessage = lastCommitMessage.trim()
- if (lastCommitMessage.startsWith("Merge commit '") && lastCommitMessage.endsWith("' into HEAD")) {
- // Merge commit applied by Jenkins, skip that commit
- git_commit_hash = sh (script: "git rev-parse @~", returnStdout: true)
- } else {
- git_commit_hash = sh (script: "git rev-parse @", returnStdout: true)
- }
- return git_commit_hash.trim()
-}
-
-def publish_test_coverage(codecov_credential) {
-    // CodeCov's auto detection has trouble with our CI's PR validation due to the merging strategy
- git_commit_hash = get_git_commit_hash()
-
- if (env.CHANGE_ID) {
- // PR execution
- codecovArgs = "-B ${env.CHANGE_TARGET} -C ${git_commit_hash} -P ${env.CHANGE_ID}"
- } else {
- // Branch execution
- codecovArgs = "-B ${env.BRANCH_NAME} -C ${git_commit_hash}"
- }
-
- // To make sure we never fail because test coverage reporting is not available
- // Fall back to our own copy of the bash helper if it failed to download the public version
- withCredentials([string(credentialsId: codecov_credential, variable: 'CODECOV_TOKEN')]) {
- sh "(curl --retry 10 -s https://codecov.io/bash | bash -s - ${codecovArgs}) || (curl --retry 10 -s https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data/codecov-bash.txt | bash -s - ${codecovArgs}) || true"
- }
-}
-
-// Allow publishing to GitHub with a custom context (the status shown under a PR)
-// Credit to https://plugins.jenkins.io/github
-def get_repo_url() {
- checkout scm
- return sh(returnStdout: true, script: "git config --get remote.origin.url").trim()
-}
-
-def update_github_commit_status(state, message) {
- node {
- // NOTE: https://issues.jenkins-ci.org/browse/JENKINS-39482
- //The GitHubCommitStatusSetter requires that the Git Server is defined under
- //*Manage Jenkins > Configure System > GitHub > GitHub Servers*.
- //Otherwise the GitHubCommitStatusSetter is not able to resolve the repository name
- //properly and you would see an empty list of repos:
- //[Set GitHub commit status (universal)] PENDING on repos [] (sha:xxxxxxx) with context:test/mycontext
- //See https://cwiki.apache.org/confluence/display/MXNET/Troubleshooting#Troubleshooting-GitHubcommit/PRstatusdoesnotgetpublished
-
- echo "Publishing commit status..."
-
- repoUrl = get_repo_url()
- echo "repoUrl=${repoUrl}"
-
- commitSha = get_git_commit_hash()
- echo "commitSha=${commitSha}"
-
- context = get_github_context()
- echo "context=${context}"
-
- // a few attempts need to be made: https://github.com/apache/incubator-mxnet/issues/11654
- for (int attempt = 1; attempt <= 3; attempt++) {
- echo "Sending GitHub status attempt ${attempt}..."
-
- step([
- $class: 'GitHubCommitStatusSetter',
- reposSource: [$class: "ManuallyEnteredRepositorySource", url: repoUrl],
- contextSource: [$class: "ManuallyEnteredCommitContextSource", context: context],
- commitShaSource: [$class: "ManuallyEnteredShaSource", sha: commitSha],
- statusBackrefSource: [$class: "ManuallyEnteredBackrefSource", backref: "${env.RUN_DISPLAY_URL}"],
- errorHandlers: [[$class: 'ShallowAnyErrorHandler']],
- statusResultSource: [
- $class: 'ConditionalStatusResultSource',
- results: [[$class: "AnyBuildResult", message: message, state: state]]
- ]
- ])
-
- if (attempt <= 2) {
- sleep 1
- }
- }
-
- echo "Publishing commit status done."
-
- }
-}
-
-def get_github_context() {
- // Since we use multi-branch pipelines, Jenkins appends the branch name to the job name
- if (env.BRANCH_NAME) {
- short_job_name = JOB_NAME.substring(0, JOB_NAME.lastIndexOf('/'))
- } else {
- short_job_name = JOB_NAME
- }
-
- return "ci/jenkins/${short_job_name}"
-}
-
-def parallel_stage(stage_name, steps) {
-    // Allow passing an array of steps that will be executed in parallel in a stage
- new_map = [:]
-
- for (def step in steps) {
- new_map = new_map << step
- }
-
- stage(stage_name) {
- parallel new_map
- }
-}
-
-def assign_node_labels(args) {
-    // This function allows assigning instance labels to the generalized placeholders.
- // This serves two purposes:
- // 1. Allow generalized placeholders (e.g. NODE_WINDOWS_CPU) in the job definition
-    // in order to abstract away the underlying node label. This allows scheduling a job
- // onto a different node for testing or security reasons. This could be, for example,
- // when you want to test a new set of slaves on separate labels or when a job should
- // only be run on restricted slaves
- // 2. Restrict the allowed job types within a Jenkinsfile. For example, a UNIX-CPU-only
-    // Jenkinsfile should not be allowed access to Windows or GPU instances. This prevents
- // users from just copy&pasting something into an existing Jenkinsfile without
- // knowing about the limitations.
- NODE_LINUX_GPU = args.linux_gpu
- NODE_LINUX_CPU = args.linux_cpu
-}
-
-def main_wrapper(args) {
-    // Main Jenkinsfile pipeline wrapper handler that allows wrapping core logic in a format
- // that supports proper failure handling
- // args:
- // - core_logic: Jenkins pipeline containing core execution logic
- // - failure_handler: Failure handler
-
- // assign any caught errors here
- err = null
- try {
- update_github_commit_status('PENDING', 'Job has been enqueued')
-
- timestamps {
- args['core_logic']()
- }
-
- // set build status to success at the end
- currentBuild.result = "SUCCESS"
- update_github_commit_status('SUCCESS', 'Job succeeded')
- } catch (caughtError) {
- node {
- sh "echo caught ${caughtError}"
- err = caughtError
- currentBuild.result = "FAILURE"
- update_github_commit_status('FAILURE', 'Job failed')
- }
- } finally {
- timestamps {
- node {
- // Call failure handler
- args['failure_handler']()
-
- // Clean workspace to reduce space requirements
- cleanWs()
-
- // Remember to rethrow so the build is marked as failing
- if (err) {
- throw err
- }
- }
- }
- }
-}
-
-return this
diff --git a/ci/prepare_clean_env.sh b/ci/prepare_clean_env.sh
deleted file mode 100755
index 1a224c418a..0000000000
--- a/ci/prepare_clean_env.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-env_name=$1
-
-echo Preparing clean environment on $(hostname) in $(ls -id $(pwd))
-
-export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64
-export CUDA_VISIBLE_DEVICES=$EXECUTOR_NUMBER
-export CONDA_ENVS_PATH=$PWD/conda
-export CONDA_PKGS_DIRS=$PWD/conda/pkgs
-export MXNET_HOME=$PWD/tests/data
-export HOROVOD_WITHOUT_TENSORFLOW=1
-export HOROVOD_WITHOUT_PYTORCH=1
-export HOROVOD_WITH_MXNET=1
-
-make clean
-conda env update --prune -p conda/${env_name} -f env/${env_name}.yml
-conda activate ./conda/${env_name}
-conda list
-printenv
-
-pip install -v -e .
-pip install horovod --no-cache-dir -U
-python -m spacy download en
-python -m spacy download de
-python -m nltk.downloader all
diff --git a/ci/rat/rat-excludes b/ci/rat/rat-excludes
deleted file mode 100755
index 3d6d00f7e8..0000000000
--- a/ci/rat/rat-excludes
+++ /dev/null
@@ -1,55 +0,0 @@
-\..*
-.*css
-\\.*
-.*ipynb
-.*html
-.*json
-.*txt
-3rdparty/*
-R-package/*
-trunk/*
-.*\\.m
-.*\\.mk
-.*\\.R
-.*svg
-.*cfg
-.*config
-.*rst
-__init__.py
-build/*
-.*\\.t
-MANIFEST
-Changes
-.*csv
-.*names
-CODEOWNERS
-snap.python
-bbox.pyx
-cpu_nms.pyx
-gpu_nms.pyx
-nms_kernel.cu
-_mask.pyx
-coco.py
-base.pyi
-special_functions-inl.h
-erfinv-inl.h
-im2col.cuh
-im2col.h
-pool.h
-dataset.cPickle
-image-classification/*
-rat-excludes
-apache-rat-tasks/*
-moderngpu/*
-deformable_im2col.cuh
-deformable_im2col.h
-REQUIRE
-Project.toml
-include/*
-.*.iml
-.*.json.ref
-searchtools_custom.js
-theme.conf
-LICENSE.binary.dependencies
-multi-bleu-detok.perl
-multi-bleu.perl
diff --git a/ci/upload_doc.sh b/ci/upload_doc.sh
deleted file mode 100755
index efa5e5d904..0000000000
--- a/ci/upload_doc.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-bucket=$1
-path=$2
-echo "Uploading doc to s3://${bucket}/${path}/"
-aws s3 sync --delete docs/_build/html/ s3://${bucket}/${path}/ --acl public-read
-echo "Uploaded doc to http://${bucket}.s3-accelerate.dualstack.amazonaws.com/${path}/index.html"
diff --git a/codecov.yml b/codecov.yml
deleted file mode 100644
index fcc1c6dece..0000000000
--- a/codecov.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-codecov:
- notify:
- require_ci_to_pass: yes
- ci:
- - ci.mxnet.io
-
-coverage:
- precision: 2
- round: down
- range: "70...100"
-
- status:
- project: yes
- patch: yes
- changes: no
-
-parsers:
- gcov:
- branch_detection:
- conditional: yes
- loop: yes
- method: no
- macro: no
-
-comment:
- layout: "header, reach, diff, files"
- behavior: default
- require_changes: no
- require_base: no
- require_head: no
diff --git a/conftest.py b/conftest.py
index 8c9e442716..04efde9756 100644
--- a/conftest.py
+++ b/conftest.py
@@ -97,7 +97,7 @@ def pytest_configure():
'use MXNET_MODULE_SEED={} to reproduce.'.format(seed))
np.random.seed(seed)
- mx.random.seed(seed)
+ mx.npx.random.seed(seed)
random.seed(seed)
# The MXNET_TEST_SEED environment variable will override MXNET_MODULE_SEED for tests with
@@ -197,6 +197,7 @@ def test_not_ok_with_random_data():
def hybridize(request):
return request.param
+
@pytest.fixture(autouse=True)
def doctest(doctest_namespace):
doctest_namespace['np'] = np
@@ -205,3 +206,10 @@ def doctest(doctest_namespace):
doctest_namespace['gluon'] = mx.gluon
import doctest
doctest.ELLIPSIS_MARKER = '-etc-'
+
+def pytest_addoption(parser):
+ parser.addoption("--device", action="append", default=[], help="list of device choices to run the tests. ex: mx.gpu() (For GPU test only)")
+
+def pytest_generate_tests(metafunc):
+ if 'ctx' in metafunc.fixturenames:
+ metafunc.parametrize("ctx", [getattr(mx, device)() for device in metafunc.config.option.device])
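
For illustration (not part of the diff): a minimal sketch of a test that consumes the new ctx fixture parametrized above. The file name is hypothetical; when no --device option is given, the parameter set is empty and pytest simply skips such tests.

# hypothetical test module, e.g. tests/test_device_option.py
import mxnet as mx

def test_ones_on_requested_device(ctx):
    # ctx is supplied by pytest_generate_tests from the --device values,
    # e.g. `pytest --device cpu --device gpu tests/test_device_option.py`
    x = mx.np.ones((2, 3), ctx=ctx)
    assert x.shape == (2, 3)
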
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
index f812baec3a..51f1f7df1c 100644
--- a/docs/_static/custom.css
+++ b/docs/_static/custom.css
@@ -20,9 +20,11 @@
}
@media (max-width: 650px) {
-.install .option, .install .title {
- width: 90%;
-}
-.install .title {
- margin-top: 1em;
+ .install .option, .install .title {
+ width: 90%;
+ }
+
+ .install .title {
+ margin-top: 1em;
+ }
}
diff --git a/docs/api/data.batchify.rst b/docs/api/data.batchify.rst
deleted file mode 100644
index 7a7eecd378..0000000000
--- a/docs/api/data.batchify.rst
+++ /dev/null
@@ -1,47 +0,0 @@
-gluonnlp.data.batchify
-======================
-
-Batchify functions can be used to transform a dataset into mini-batches that can be processed
-efficiently.
-
-.. currentmodule:: gluonnlp.data.batchify
-
-Batch Loaders
--------------
-
-.. autosummary::
- :nosignatures:
-
- Stack
- Pad
- List
- Tuple
- NamedTuple
- Dict
-
-
-Language Modeling
------------------
-
-.. autosummary::
- :nosignatures:
-
- CorpusBatchify
- CorpusBPTTBatchify
- StreamBPTTBatchify
-
-Embedding Training
-------------------
-
-.. autosummary::
- :nosignatures:
-
- EmbeddingCenterContextBatchify
-
-API Reference
--------------
-
-.. automodule:: gluonnlp.data.batchify
- :members:
- :imported-members:
- :special-members: __call__, __iter__
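
For reference (not part of the diff), a small sketch of the legacy gluonnlp.data.batchify usage that this removed page documented; it is written from memory against the pre-refactor 0.x API, so treat the exact signatures as approximate.

import gluonnlp as nlp

batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(pad_val=0),  # pad the variable-length token id lists
    nlp.data.batchify.Stack())         # stack the scalar labels
samples = [([1, 2, 3], 0), ([4, 5], 1)]
token_ids, labels = batchify_fn(samples)
# token_ids has shape (2, 3); labels has shape (2,)
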
diff --git a/docs/api/data.rst b/docs/api/data.rst
index 78a13e9b79..540dc977f9 100644
--- a/docs/api/data.rst
+++ b/docs/api/data.rst
@@ -5,294 +5,11 @@ GluonNLP Toolkit provides tools for building efficient data pipelines for NLP ta
.. currentmodule:: gluonnlp.data
-Public Datasets
----------------
-
-Popular datasets for NLP tasks are provided in gluonnlp.
-By default, all built-in datasets are automatically downloaded from a public repo and
-reside in ~/.mxnet/datasets/.
-
-
-Language modeling
-~~~~~~~~~~~~~~~~~
-
-`WikiText `_
-is a popular language modeling dataset from Salesforce.
-It is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia.
-The dataset is available under the Creative Commons Attribution-ShareAlike License.
-
-`Google 1 Billion Words `_
-is a popular language modeling dataset.
-It is a collection of over 0.8 billion tokens extracted from the WMT11 website.
-The dataset is available under Apache License.
-
-.. autosummary::
- :nosignatures:
-
- WikiText2
- WikiText103
- WikiText2Raw
- WikiText103Raw
- GBWStream
-
-
-Text Classification
-~~~~~~~~~~~~~~~~~~~
-
-`IMDB `_ is a popular dataset for binary sentiment classification.
-It provides a set of 25,000 highly polar movie reviews for training, 25,000 for testing, and additional unlabeled data.
-
-`MR `_ is a movie-review data set of 10,662 sentences labeled with respect to their overall sentiment polarity (positive or negative).
-
-`SST-1 `_ is an extension of the MR data set. However, training/test splits are provided and labels are fine-grained (very positive, positive, neutral, negative, very negative). The training and test data sets have 237,107 and 2,210 sentences respectively.
-
-SST-2 is the same as SST-1 with neutral sentences removed and only binary sentiment polarity is considered: very positive is considered positive, and very negative is considered negative.
-
-`SUBJ `_ is a Subjectivity data set for sentiment analysis. Sentences labeled with respect to their subjectivity status (subjective or objective).
-
-`TREC `_ is a question classification data set of about 6,000 questions, each labeled with one of six question types (e.g., person, location, or numeric value).
-
-CR is customer reviews of various products (cameras, MP3s etc.). Sentences are labeled with respect to their overall sentiment polarities (positive or negative).
-
-`MPQA `_ is an opinion polarity detection subtask. Sentences are labeled with respect to their overall sentiment polarities (positive or negative).
-
-.. autosummary::
- :nosignatures:
-
- IMDB
- MR
- SST_1
- SST_2
- SUBJ
- TREC
- CR
- MPQA
-
-
-Word Embedding Evaluation Datasets
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-There are a number of commonly used datasets for intrinsic evaluation for word embeddings.
-
-The similarity-based evaluation datasets include:
-
-.. autosummary::
- :nosignatures:
-
- WordSim353
- MEN
- RadinskyMTurk
- RareWords
- SimLex999
- SimVerb3500
- SemEval17Task2
- BakerVerb143
- YangPowersVerb130
-
-Analogy-based evaluation datasets include:
-
-.. autosummary::
- :nosignatures:
-
- GoogleAnalogyTestSet
- BiggerAnalogyTestSet
-
-
-CoNLL Datasets
-~~~~~~~~~~~~~~
-The `CoNLL `_ datasets are from a series of annual
-competitions held at the top tier conference of the same name. The conference is organized by SIGNLL.
-
-These datasets include data for the shared tasks, such as part-of-speech (POS) tagging, chunking,
-named entity recognition (NER), semantic role labeling (SRL), etc.
-
-We provide built-in support for CoNLL 2000 -- 2002, 2004, as well as the Universal Dependencies
-dataset which is used in the 2017 and 2018 competitions.
-
-.. autosummary::
- :nosignatures:
-
- CoNLL2000
- CoNLL2001
- CoNLL2002
- CoNLL2004
- UniversalDependencies21
-
-
-Machine Translation Datasets
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autosummary::
- :nosignatures:
-
- IWSLT2015
- WMT2014
- WMT2014BPE
- WMT2016
- WMT2016BPE
-
-
-Intent Classification and Slot Labeling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autosummary::
- :nosignatures:
-
- ATISDataset
- SNIPSDataset
-
-
-Question Answering
-~~~~~~~~~~~~~~~~~~
-
-`Stanford Question Answering Dataset (SQuAD) `_ is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.
-
-.. autosummary::
- :nosignatures:
-
- SQuAD
-
-
-GLUE Benchmark
-~~~~~~~~~~~~~~
-
-The `General Language Understanding Evaluation (GLUE) benchmark `_ is a collection of resources for training, evaluating, and analyzing natural language understanding systems.
-
-.. autosummary::
- :nosignatures:
-
- GlueCoLA
- GlueSST2
- GlueSTSB
- GlueQQP
- GlueRTE
- GlueMNLI
- GlueQNLI
- GlueWNLI
- GlueMRPC
-
-
-SuperGLUE Benchmark
-~~~~~~~~~~~~~~~~~~~~
-
-The `SuperGLUE Benchmark `_ is a benchmark styled after GLUE with a new set of more difficult language understanding tasks.
-
-.. autosummary::
- :nosignatures:
-
- SuperGlueRTE
- SuperGlueCB
- SuperGlueWSC
- SuperGlueWiC
- SuperGlueCOPA
- SuperGlueMultiRC
- SuperGlueBoolQ
- SuperGlueReCoRD
- SuperGlueAXb
- SuperGlueAXg
-
-
-Datasets
---------
-
-Dataset API for processing common text formats. The following classes can be used or subclassed to
-load custom datasets.
-
-.. autosummary::
- :nosignatures:
-
- TextLineDataset
- CorpusDataset
- TSVDataset
-
-
-DataStreams
------------
-
-DataStream API for streaming and processing common text formats. The following classes can be used or subclassed to
-stream large custom data.
-
-.. autosummary::
- :nosignatures:
-
- DataStream
- SimpleDataStream
- DatasetStream
- SimpleDatasetStream
- PrefetchingStream
-
-Transforms
-----------
-
-Text data transformation functions. They can be used for processing text sequences in conjunction
-with the `Dataset.transform` method.
-
-.. autosummary::
- :nosignatures:
-
- ClipSequence
- PadSequence
- SacreMosesTokenizer
- SpacyTokenizer
- SacreMosesDetokenizer
- BERTTokenizer
- BERTSentenceTransform
-
-Samplers
---------
-
-Samplers determine how to iterate through datasets. The below samplers and batch samplers can help
-iterate through sequence data.
-
-.. autosummary::
- :nosignatures:
-
- SortedSampler
- FixedBucketSampler
- SortedBucketSampler
- SplitSampler
-
-The `FixedBucketSampler` uses the following bucket scheme classes to generate bucket keys.
-
-.. autosummary::
- :nosignatures:
-
- ConstWidthBucket
- LinearWidthBucket
- ExpWidthBucket
-
-DataLoaders
------------
-
-DataLoaders load data from a dataset and return mini-batches of data.
-
-.. autosummary::
- :nosignatures:
-
- ShardedDataLoader
- DatasetLoader
-
-Utilities
----------
-
-Miscellaneous utility classes and functions for processing text and sequence data.
-
-.. autosummary::
- :nosignatures:
-
- Counter
- count_tokens
- concat_sequence
- slice_sequence
- train_valid_split
- register
- create
- list_datasets
API Reference
-------------
.. automodule:: gluonnlp.data
- :members:
- :imported-members:
- :special-members: __iter__, __call__
+ :members:
+ :imported-members:
+ :special-members: __contains__, __getitem__, __setitem__
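
As background (not part of the diff), the datasets, samplers and utilities listed above belong to the legacy 0.x API whose documentation is removed here; a rough usage sketch from memory, with names and defaults treated as approximate:

import gluonnlp as nlp

counter = nlp.data.count_tokens(['as', 'green', 'as', 'grass'])  # a collections.Counter
sampler = nlp.data.FixedBucketSampler(lengths=[10, 24, 7, 100], batch_size=2, num_buckets=2)
for batch_indices in sampler:
    pass  # each batch groups samples of similar length to limit padding
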
diff --git a/docs/api/embedding.rst b/docs/api/embedding.rst
index b31d4db3f5..9f5830b16e 100644
--- a/docs/api/embedding.rst
+++ b/docs/api/embedding.rst
@@ -7,8 +7,7 @@ GluonNLP Toolkit provides tools for working with embeddings.
This page describes the ``gluonnlp`` APIs for text embedding, such as loading
pre-trained embedding vectors for text tokens and storing them in the
-``mxnet.ndarray.NDArray`` format as well as utilities for intrinsic evaluation
-of text embeddings.
+``numpy.ndarray`` format.
Pre-trained Embeddings
@@ -18,32 +17,9 @@ Pre-trained Embeddings
.. autosummary::
:nosignatures:
- register
- create
list_sources
- TokenEmbedding
- GloVe
- FastText
- Word2Vec
-
-
-Intrinsic evaluation
---------------------
-
-.. currentmodule:: gluonnlp.embedding.evaluation
-.. autosummary::
- :nosignatures:
-
- register
- create
- list_evaluation_functions
- WordEmbeddingSimilarityFunction
- WordEmbeddingAnalogyFunction
- CosineSimilarity
- ThreeCosAdd
- ThreeCosMul
- WordEmbeddingSimilarity
- WordEmbeddingAnalogy
+ load_embeddings
+ get_fasttext_model
API Reference
@@ -54,7 +30,4 @@ API Reference
:imported-members:
:special-members: __contains__, __getitem__, __setitem__
-.. automodule:: gluonnlp.embedding.evaluation
- :members:
- :imported-members:
- :special-members: __contains__, __getitem__, __setitem__
+
diff --git a/docs/api/index.rst b/docs/api/index.rst
index 4d9a7e76ae..5cc27a6e00 100644
--- a/docs/api/index.rst
+++ b/docs/api/index.rst
@@ -4,13 +4,7 @@ API Documentation
.. toctree::
:maxdepth: 2
- vocab
- embedding
data
- data.batchify
- model
- model.train
- loss
- initializer
- optimizer
+ embedding
+ models
utils
diff --git a/docs/api/initializer.rst b/docs/api/initializer.rst
deleted file mode 100644
index 5c104e7244..0000000000
--- a/docs/api/initializer.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-gluonnlp.initializer
-======================
-
-This page describes initializers that are useful for multiple NLP model architectures.
-
-.. currentmodule:: gluonnlp.initializer
-
-Highway Bias Initializer
---------------------------
-
-We now provide the Highway bias initializer defined in the following work.
-
-.. code-block:: none
-
- @inproceedings{srivastava2015training,
- title={Training very deep networks},
- author={Srivastava, Rupesh K and Greff, Klaus and Schmidhuber, J{\"u}rgen},
- booktitle={Advances in neural information processing systems},
- pages={2377--2385},
- year={2015}}
-
-.. autosummary::
- :nosignatures:
-
- HighwayBias
-
-API Reference
--------------
-
-.. automodule:: gluonnlp.initializer
- :members:
- :imported-members:
diff --git a/docs/api/loss.rst b/docs/api/loss.rst
deleted file mode 100644
index 12acfc645c..0000000000
--- a/docs/api/loss.rst
+++ /dev/null
@@ -1,51 +0,0 @@
-gluonnlp.loss
-=============
-
-GluonNLP Toolkit provides tools for easily setting up task-specific losses.
-
-.. currentmodule:: gluonnlp.loss
-
-Masked Loss
------------
-
-.. autosummary::
- :nosignatures:
-
- MaskedSoftmaxCrossEntropyLoss
-
-
-Label Smoothing
----------------
-
-.. autosummary::
- :nosignatures:
-
- LabelSmoothing
-
-
-Activation Regularizers
------------------------
-
-Activation regularization and temporal activation regularization defined in the following work:
-
-.. code-block:: none
-
- @article{merity2017revisiting,
- title={Revisiting Activation Regularization for Language RNNs},
- author={Merity, Stephen and McCann, Bryan and Socher, Richard},
- journal={arXiv preprint arXiv:1708.01009},
- year={2017}}
-
-.. autosummary::
- :nosignatures:
-
- ActivationRegularizationLoss
- TemporalActivationRegularizationLoss
-
-
-API Reference
--------------
-
-.. automodule:: gluonnlp.loss
- :members:
- :imported-members:
diff --git a/docs/api/model.rst b/docs/api/model.rst
deleted file mode 100644
index 8cc594bb87..0000000000
--- a/docs/api/model.rst
+++ /dev/null
@@ -1,170 +0,0 @@
-gluonnlp.model
-==============
-
-GluonNLP Toolkit supplies models for common NLP tasks with pre-trained weights. By default,
-all requested pre-trained weights are downloaded from a public repo and stored in ~/.mxnet/models/.
-
-.. currentmodule:: gluonnlp.model
-
-Model Registry
---------------
-
-The model registry provides an easy interface to obtain pre-defined and pre-trained models.
-
-.. autosummary::
- :nosignatures:
-
- get_model
-
-The `get_model` function returns a pre-defined model given the name of a
-registered model. The following sections of this page present a list of
-registered names for each model category.
-
-Information about pretrained models
------------------------------------
-
-.. autosummary::
- :nosignatures:
-
- list_models
-
-Language Modeling
------------------
-
-Components
-
-.. autosummary::
- :nosignatures:
-
- AWDRNN
- BiLMEncoder
- LSTMPCellWithClip
- StandardRNN
- BigRNN
-
-Pre-defined models
-
-.. autosummary::
- :nosignatures:
-
- awd_lstm_lm_1150
- awd_lstm_lm_600
- standard_lstm_lm_200
- standard_lstm_lm_650
- standard_lstm_lm_1500
- big_rnn_lm_2048_512
-
-Machine Translation
--------------------
-
-.. autosummary::
- :nosignatures:
-
- Seq2SeqEncoder
- TransformerEncoder
- TransformerEncoderCell
- PositionwiseFFN
-
-.. autosummary::
- :nosignatures:
-
- transformer_en_de_512
-
-Bidirectional Encoder Representations from Transformers
--------------------------------------------------------
-
-Components
-
-.. autosummary::
- :nosignatures:
-
- BERTModel
- BERTEncoder
-
-Pre-defined models
-
-.. autosummary::
- :nosignatures:
-
- bert_12_768_12
- bert_24_1024_16
-
-Convolutional Encoder
----------------------
-
-.. autosummary::
- :nosignatures:
-
- ConvolutionalEncoder
-
-ELMo
-----
-
-Components
-
-.. autosummary::
- :nosignatures:
-
- ELMoBiLM
- ELMoCharacterEncoder
-
-Pre-defined models
-
-.. autosummary::
- :nosignatures:
-
- elmo_2x1024_128_2048cnn_1xhighway
- elmo_2x2048_256_2048cnn_1xhighway
- elmo_2x4096_512_2048cnn_2xhighway
-
-Highway Network
------------------
-
-.. autosummary::
- :nosignatures:
-
- Highway
-
-Attention Cell
---------------
-
-.. autosummary::
- :nosignatures:
-
- AttentionCell
- MultiHeadAttentionCell
- MLPAttentionCell
- DotProductAttentionCell
-
-Sequence Sampling
------------------
-
-.. autosummary::
- :nosignatures:
-
- BeamSearchScorer
- BeamSearchSampler
- SequenceSampler
-
-
-Other Modeling Utilities
-------------------------
-
-.. autosummary::
- :nosignatures:
-
- WeightDropParameter
- apply_weight_drop
- L2Normalization
- GELU
- ISDense
- NCEDense
- SparseISDense
- SparseNCEDense
-
-API Reference
--------------
-
-.. automodule:: gluonnlp.model
- :members:
- :imported-members:
diff --git a/docs/api/model.train.rst b/docs/api/model.train.rst
deleted file mode 100644
index 500ab60c72..0000000000
--- a/docs/api/model.train.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-gluonnlp.model.train
-=====================
-
-GluonNLP Toolkit supplies train-mode versions of models, since these models behave differently during training
-and inference, e.g., the number and type of the outputs from the forward pass are different.
-
-.. currentmodule:: gluonnlp.model.train
-
-Language Modeling
------------------
-
-.. autosummary::
- :nosignatures:
-
- AWDRNN
- StandardRNN
- CacheCell
- get_cache_model
- BigRNN
-
-
-
-Word Embeddings
----------------
-
-.. autosummary::
- :nosignatures:
-
- EmbeddingModel
- CSREmbeddingModel
- FasttextEmbeddingModel
-
-
-API Reference
--------------
-
-.. automodule:: gluonnlp.model.train
- :members:
- :imported-members:
diff --git a/docs/api/models.rst b/docs/api/models.rst
new file mode 100644
index 0000000000..a2623ce4b9
--- /dev/null
+++ b/docs/api/models.rst
@@ -0,0 +1,15 @@
+gluonnlp.models
+===============
+
+GluonNLP Toolkit supplies models for common NLP tasks with pre-trained weights. By default,
+all requested pre-trained weights are downloaded from a public repo and stored in ~/.mxnet/models/.
+
+.. currentmodule:: gluonnlp.models
+
+API Reference
+-------------
+
+.. automodule:: gluonnlp.models
+ :members:
+ :imported-members:
+ :special-members: __contains__, __getitem__, __setitem__
diff --git a/docs/api/optimizer.rst b/docs/api/optimizer.rst
deleted file mode 100644
index 8bf3f7e214..0000000000
--- a/docs/api/optimizer.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-gluonnlp.optimizer
-======================
-
-GluonNLP provides some special optimizers for training in natural language processing.
-
-.. currentmodule:: gluonnlp.optimizer
-
-BERTAdam Optimizer
---------------------------
-
-The Adam optimizer with weight decay regularization for BERT.
-
-.. autosummary::
- :nosignatures:
-
- BERTAdam
-
-API Reference
--------------
-
-.. automodule:: gluonnlp.optimizer
- :members:
- :imported-members:
diff --git a/docs/api/utils.rst b/docs/api/utils.rst
index 58c1aa008f..e672814d8d 100644
--- a/docs/api/utils.rst
+++ b/docs/api/utils.rst
@@ -5,49 +5,10 @@ GluonNLP Toolkit provides tools for easily setting up task specific loss.
.. currentmodule:: gluonnlp.utils
-
-File Handling
--------------
-
-.. autosummary::
- :nosignatures:
-
- glob
- mkdir
-
-
-Parameter and Training
-----------------------
-
-.. autosummary::
- :nosignatures:
-
- clip_grad_global_norm
-
-
-Serialization and Deserialization
----------------------------------
-
-.. autosummary::
- :nosignatures:
-
- load_parameters
- load_states
- save_parameters
- save_states
-
-Setting Seed
----------------------------------
-
-.. autosummary::
- :nosignatures:
-
- set_seed
-
-
API Reference
-------------
.. automodule:: gluonnlp.utils
- :members:
- :imported-members:
+ :members:
+ :imported-members:
+ :special-members: __contains__, __getitem__, __setitem__
diff --git a/docs/api/vocab.rst b/docs/api/vocab.rst
deleted file mode 100644
index 15efa47367..0000000000
--- a/docs/api/vocab.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-gluonnlp.vocab
-==============
-
-This page describes the ``gluonnlp.Vocab`` class for text data numericalization
-and the subword functionality provided in ``gluonnlp.vocab``.
-
-
-Vocabulary
-----------
-
-The vocabulary builds indices for text tokens, and token embeddings can be
-attached to it. The input counter, whose keys are the candidate tokens, may
-be obtained via :func:`gluonnlp.data.count_tokens`.
-
-.. currentmodule:: gluonnlp
-.. autosummary::
- :nosignatures:
-
- Vocab
-
-
-Subword functionality
----------------------
-
-When using a vocabulary of fixed size, out of vocabulary words may be
-encountered. However, words are composed of characters, allowing intelligent
-fallbacks for out of vocabulary words based on subword units such as the
-characters or ngrams in a word. :class:`gluonnlp.vocab.SubwordFunction` provides
-an API to map words to their subword units. :doc:`model.train` contains
-models that make use of subword information to compute word embeddings.
-
-.. currentmodule:: gluonnlp.vocab
-.. autosummary::
- :nosignatures:
-
- SubwordFunction
- ByteSubwords
- NGramHashes
-
-
-ELMo Character-level Vocabulary
--------------------------------
-
-In the original ELMo pre-trained models, the character-level vocabulary relies on UTF-8 encoding in a specific setting.
-We provide the following vocabulary class to remain consistent with the ELMo pre-trained models.
-
-.. currentmodule:: gluonnlp.vocab
-.. autosummary::
- :nosignatures:
-
- ELMoCharVocab
-
-
-BERT Vocabulary
-----------------
-
-The vocabulary for BERT, inherited from :class:`gluonnlp.Vocab`, provides some additional special tokens for ease of use.
-
-.. currentmodule:: gluonnlp.vocab
-.. autosummary::
- :nosignatures:
-
- BERTVocab
-
-
-API Reference
--------------
-
-.. automodule:: gluonnlp
- :members:
- :imported-members:
- :special-members: __call__, __len__
-
-.. automodule:: gluonnlp.vocab
- :exclude-members: Vocab
- :members:
- :imported-members:
- :special-members: __call__, __len__
diff --git a/docs/conf.py b/docs/conf.py
index c48d913674..f707e1277f 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -63,6 +63,7 @@
nbsphinx_kernel_name = 'python3'
nbsphinx_allow_errors = True
nbsphinx_timeout = 1200
+nbsphinx_execute = 'never'
html_sourcelink_suffix = ''
html_context = {
@@ -172,8 +173,8 @@
'header_links' : [
('Install', 'install/install-more', False, ''),
('API', 'api/index', False, ''),
- ('Community', 'community/index', False, ''),
- ('Contribute', 'community/contribute', False, ''),
+ ('Community', 'website/index', False, ''),
+ ('Contribute', 'website/contribute', False, ''),
('GitHub', 'https://github.com/dmlc/gluon-nlp/', True, 'fab fa-github'),
],
@@ -209,7 +210,7 @@
intersphinx_mapping = {
'python': ('https://docs.python.org/{.major}'.format(sys.version_info), None),
- 'mxnet': ('https://beta.mxnet.io/', None),
+ 'mxnet': ('https://mxnet.apache.org/api/python/docs/', None),
'numpy': ('http://docs.scipy.org/doc/numpy/', None),
'scipy': ('http://docs.scipy.org/doc/scipy/reference', None),
'matplotlib': ('http://matplotlib.org/', None),
diff --git a/docs/examples/index.rst b/docs/examples/index.rst
index 0aba428b61..74178c748f 100644
--- a/docs/examples/index.rst
+++ b/docs/examples/index.rst
@@ -3,32 +3,8 @@ Tutorials
Interested in getting started in a new NLP area? Here are some tutorials to help get started.
-Data Loading and Vocabularies
------------------------------
-.. container:: cards
-
- .. card::
- :title: Data Loading APIs
- :link: notes/data_api.html
-
- Basics on how to load and process the sentiment dataset to form batches that can be processed efficiently.
-
- .. card::
- :title: Vocabulary APIs
- :link: notes/vocab_emb.html
-
- Learn how to write simple code to create indices for tokens.
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- notes/index
-
-
-Representation Learning
+Embedding
-----------------------
.. container:: cards
@@ -40,26 +16,6 @@ Representation Learning
Basics on how to use word embedding with vocab in GluonNLP and apply it on word similarity and
analogy problems.
- .. card::
- :title: Word Embeddings Training and Evaluation
- :link: word_embedding/word_embedding_training.html
-
- Learn how to train fastText and word2vec embeddings on your own dataset, and determine
- embedding quality through intrinsic evaluation.
-
- .. card::
- :title: Extracting Sentence Features with Pre-trained ELMo
- :link: sentence_embedding/elmo_sentence_representation.html
-
- See how to use GluonNLP's API to automatically download the pre-trained ELMo model, and extract features from it.
-
- .. card::
- :title: Fine-tuning Pre-trained BERT Models
- :link: sentence_embedding/bert.html
-
- See how to use GluonNLP to fine-tune a sentence pair classification model with
- pre-trained BERT parameters.
-
.. toctree::
:hidden:
@@ -68,91 +24,3 @@ Representation Learning
word_embedding/index
-Language Modeling
------------------
-
-.. container:: cards
-
- .. card::
- :title: LSTM-based Language Models
- :link: language_model/use_pretrained_lm.html
-
- Learn what a language model is, what it can do, and how to train a word-level language model
- with truncated back-propagation-through-time (BPTT).
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- language_model/index
-
-
-Machine Translation
--------------------
-
-.. container:: cards
-
- .. card::
- :title: Training GNMT on IWSLT 2015 Dataset
- :link: machine_translation/gnmt.html
-
- Learn how to train Google Neural Machine Translation, a sequence-to-sequence model with attention.
-
- .. card::
- :title: Using Pre-trained Transformer
- :link: machine_translation/transformer.html
-
- Learn how to use a pre-trained transformer translation model for English-German translation.
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- machine_translation/index
-
-
-Sentiment Analysis
-------------------
-
-.. container:: cards
-
- .. card::
- :title: Fine-tuning LSTM-based Language Model
- :link: sentiment_analysis/sentiment_analysis.html
-
- See how to fine-tune a pre-trained language model to perform sentiment analysis on movie reviews.
-
- .. card::
- :title: Training Structured Self-attentive Sentence Embedding
- :link: sentiment_analysis/self_attentive_sentence_embedding.html
-
- See how to use GluonNLP to build a more advanced model structure for extracting sentence
- embeddings to predict Yelp review ratings.
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- sentiment_analysis/index
-
-
-Text Generation
----------------
-
-.. container:: cards
-
- .. card::
- :title: Sequence Generation with Beam Search Sampler and Sequence Sampler
- :link: sequence_sampling/sequence_sampling.html
-
- Learn how to generate sentences from a pre-trained language model through sampling and beam
- search.
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- sequence_sampling/index
diff --git a/docs/examples/language_model/cache_model.png b/docs/examples/language_model/cache_model.png
deleted file mode 100644
index b3c06026d8..0000000000
Binary files a/docs/examples/language_model/cache_model.png and /dev/null differ
diff --git a/docs/examples/language_model/index.rst b/docs/examples/language_model/index.rst
deleted file mode 100644
index 9696673062..0000000000
--- a/docs/examples/language_model/index.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Language Modeling
-=================
-
-.. container:: cards
-
- .. card::
- :title: Using Pre-trained Language Model
- :link: use_pretrained_lm.html
-
- Learn what a language model is, what it can do, and how to use a pre-trained language model.
-
- .. card::
- :title: Train your own LSTM based Language Model
- :link: train_language_model.html
-
- Learn how to train a word-level language model
- with truncated back-propagation-through-time (BPTT).
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- use_pretrained_lm.ipynb
- train_language_model.ipynb
-
-
diff --git a/docs/examples/language_model/language_model_intro.png b/docs/examples/language_model/language_model_intro.png
deleted file mode 100644
index ec9af278ff..0000000000
Binary files a/docs/examples/language_model/language_model_intro.png and /dev/null differ
diff --git a/docs/examples/language_model/train_language_model.md b/docs/examples/language_model/train_language_model.md
deleted file mode 100644
index 4bd6178dd3..0000000000
--- a/docs/examples/language_model/train_language_model.md
+++ /dev/null
@@ -1,292 +0,0 @@
-# Train your own LSTM based Language Model
-
-Now let's go through the step-by-step process on how to train your own
-language model using GluonNLP.
-
-## Preparation
-
-We'll start by taking care of
-our basic dependencies and setting up our environment.
-
-Firstly, we import the required modules for GluonNLP and the LM.
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-
-import glob
-import time
-import math
-
-import mxnet as mx
-from mxnet import gluon, autograd
-from mxnet.gluon.utils import download
-
-import gluonnlp as nlp
-nlp.utils.check_version('0.7.0')
-```
-
-Then we set up the environment for GluonNLP.
-
-Please note that in the following code, `num_gpus` should be set according to how many NVIDIA GPUs are available on the target machine.
-
-```{.python .input}
-num_gpus = 1
-context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus else [mx.cpu()]
-log_interval = 200
-```
-
-Next, we set up the hyperparameters for the LM we are using.
-
-Note that BPTT stands for "backpropagation through time," and LR stands for learning rate. More information on truncated BPTT can be found [here](https://en.wikipedia.org/wiki/Backpropagation_through_time).
-
-```{.python .input}
-batch_size = 20 * len(context)
-lr = 20
-epochs = 3
-bptt = 35
-grad_clip = 0.25
-```
-
-## Loading the dataset
-
-Now, we load the dataset, extract the vocabulary, numericalize, and batchify in order to perform truncated BPTT.
-
-```{.python .input}
-dataset_name = 'wikitext-2'
-
-# Load the dataset
-train_dataset, val_dataset, test_dataset = [
- nlp.data.WikiText2(
- segment=segment, bos=None, eos='<eos>', skip_empty=False)
- for segment in ['train', 'val', 'test']
-]
-
-# Extract the vocabulary and numericalize with "Counter"
-vocab = nlp.Vocab(
- nlp.data.Counter(train_dataset), padding_token=None, bos_token=None)
-
-# Batchify for BPTT
-bptt_batchify = nlp.data.batchify.CorpusBPTTBatchify(
- vocab, bptt, batch_size, last_batch='discard')
-train_data, val_data, test_data = [
- bptt_batchify(x) for x in [train_dataset, val_dataset, test_dataset]
-]
-```
-
-And then we load the pre-defined language model architecture like so:
-
-```{.python .input}
-model_name = 'standard_lstm_lm_200'
-model, vocab = nlp.model.get_model(model_name, vocab=vocab, dataset_name=None)
-print(model)
-print(vocab)
-
-# Initialize the model
-model.initialize(mx.init.Xavier(), ctx=context)
-
-# Initialize the trainer and optimizer and specify some hyperparameters
-trainer = gluon.Trainer(model.collect_params(), 'sgd', {
- 'learning_rate': lr,
- 'momentum': 0,
- 'wd': 0
-})
-
-# Specify the loss function, in this case, cross-entropy with softmax.
-loss = gluon.loss.SoftmaxCrossEntropyLoss()
-```
-
-## Training the LM
-
-Now that everything is ready, we can start training the model.
-
-We first define a helper function for detaching the gradients on specific states for easier truncated BPTT.
-
-```{.python .input}
-def detach(hidden):
- if isinstance(hidden, (tuple, list)):
- hidden = [detach(i) for i in hidden]
- else:
- hidden = hidden.detach()
- return hidden
-```
-
-And then a helper evaluation function.
-
-```{.python .input}
-# Note that ctx is short for context
-def evaluate(model, data_source, batch_size, ctx):
- total_L = 0.0
- ntotal = 0
- hidden = model.begin_state(
- batch_size=batch_size, func=mx.nd.zeros, ctx=ctx)
- for i, (data, target) in enumerate(data_source):
- data = data.as_in_context(ctx)
- target = target.as_in_context(ctx)
- output, hidden = model(data, hidden)
- hidden = detach(hidden)
- L = loss(output.reshape(-3, -1), target.reshape(-1))
- total_L += mx.nd.sum(L).asscalar()
- ntotal += L.size
- return total_L / ntotal
-```
-
-### The main training loop
-
-Our loss function will be the standard cross-entropy loss function used for multi-class classification, applied at each time step to compare the model's predictions to the true next word in the sequence.
-We can calculate gradients with respect to our parameters using truncated BPTT.
-In this case, we'll back propagate for $35$ time steps, updating our weights with stochastic gradient descent and a learning rate of $20$; these correspond to the hyperparameters that we specified earlier in the notebook.
-
-
-
-```{.python .input}
-# Function for actually training the model
-def train(model, train_data, val_data, test_data, epochs, lr):
- best_val = float("Inf")
- start_train_time = time.time()
- parameters = model.collect_params().values()
-
- for epoch in range(epochs):
- total_L = 0.0
- start_epoch_time = time.time()
- start_log_interval_time = time.time()
- hiddens = [model.begin_state(batch_size//len(context), func=mx.nd.zeros, ctx=ctx)
- for ctx in context]
-
- for i, (data, target) in enumerate(train_data):
- data_list = gluon.utils.split_and_load(data, context,
- batch_axis=1, even_split=True)
- target_list = gluon.utils.split_and_load(target, context,
- batch_axis=1, even_split=True)
- hiddens = detach(hiddens)
- L = 0
- Ls = []
-
- with autograd.record():
- for j, (X, y, h) in enumerate(zip(data_list, target_list, hiddens)):
- output, h = model(X, h)
- batch_L = loss(output.reshape(-3, -1), y.reshape(-1,))
- L = L + batch_L.as_in_context(context[0]) / (len(context) * X.size)
- Ls.append(batch_L / (len(context) * X.size))
- hiddens[j] = h
- L.backward()
- grads = [p.grad(x.context) for p in parameters for x in data_list]
- gluon.utils.clip_global_norm(grads, grad_clip)
-
- trainer.step(1)
-
- total_L += sum([mx.nd.sum(l).asscalar() for l in Ls])
-
- if i % log_interval == 0 and i > 0:
- cur_L = total_L / log_interval
- print('[Epoch %d Batch %d/%d] loss %.2f, ppl %.2f, '
- 'throughput %.2f samples/s'%(
- epoch, i, len(train_data), cur_L, math.exp(cur_L),
- batch_size * log_interval / (time.time() - start_log_interval_time)))
- total_L = 0.0
- start_log_interval_time = time.time()
-
- mx.nd.waitall()
-
- print('[Epoch %d] throughput %.2f samples/s'%(
- epoch, len(train_data)*batch_size / (time.time() - start_epoch_time)))
-
- val_L = evaluate(model, val_data, batch_size, context[0])
- print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
- epoch, time.time()-start_epoch_time, val_L, math.exp(val_L)))
-
- if val_L < best_val:
- best_val = val_L
- test_L = evaluate(model, test_data, batch_size, context[0])
- model.save_parameters('{}_{}-{}.params'.format(model_name, dataset_name, epoch))
- print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
- else:
- lr = lr*0.25
- print('Learning rate now %f'%(lr))
- trainer.set_learning_rate(lr)
-
- print('Total training throughput %.2f samples/s'%(
- (batch_size * len(train_data) * epochs) /
- (time.time() - start_train_time)))
-```
-
-We can now actually perform the training:
-
-```{.python .input}
-train(model, train_data, val_data, test_data, epochs, lr)
-```
-
-## Using your own dataset
-
-When we train a language model, we fit to the statistics of a given dataset.
-While many papers focus on a few standard datasets, such as WikiText or the Penn Tree Bank, that's just to provide a standard benchmark for the purpose of comparing models against one another.
-In general, for any given use case, you'll want to train your own language model using a dataset of your own choice.
-Here, for demonstration, we'll grab some `.txt` files corresponding to Sherlock Holmes novels.
-
-We first download the new dataset.
-
-```{.python .input}
-TRAIN_PATH = "./sherlockholmes.train.txt"
-VALID_PATH = "./sherlockholmes.valid.txt"
-TEST_PATH = "./sherlockholmes.test.txt"
-PREDICT_PATH = "./tinyshakespeare/input.txt"
-download(
- "https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.train.txt",
- TRAIN_PATH,
- sha1_hash="d65a52baaf32df613d4942e0254c81cff37da5e8")
-download(
- "https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.valid.txt",
- VALID_PATH,
- sha1_hash="71133db736a0ff6d5f024bb64b4a0672b31fc6b3")
-download(
- "https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.test.txt",
- TEST_PATH,
- sha1_hash="b7ccc4778fd3296c515a3c21ed79e9c2ee249f70")
-download(
- "https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt",
- PREDICT_PATH,
- sha1_hash="04486597058d11dcc2c556b1d0433891eb639d2e")
-
-print(glob.glob("sherlockholmes.*.txt"))
-```
-
-Then we specify the tokenizer and batchify the dataset.
-
-```{.python .input}
-import nltk
-moses_tokenizer = nlp.data.SacreMosesTokenizer()
-
-sherlockholmes_datasets = [
- nlp.data.CorpusDataset(
- 'sherlockholmes.{}.txt'.format(name),
- sample_splitter=nltk.tokenize.sent_tokenize,
- tokenizer=moses_tokenizer,
- flatten=True,
- eos='<eos>') for name in ['train', 'valid', 'test']
-]
-
-sherlockholmes_train_data, sherlockholmes_val_data, sherlockholmes_test_data = [
- bptt_batchify(dataset) for dataset in sherlockholmes_datasets
-]
-```
-
-We set up the evaluation to see whether the model we trained on the previous dataset does well on the new one.
-
-```{.python .input}
-sherlockholmes_L = evaluate(model, sherlockholmes_val_data, batch_size,
- context[0])
-print('Best validation loss %.2f, test ppl %.2f' %
- (sherlockholmes_L, math.exp(sherlockholmes_L)))
-```
-
-Or we have the option of training the model on the new dataset with just one line of code.
-
-```{.python .input}
-train(
- model,
- sherlockholmes_train_data, # This is your input training data, we leave batchifying and tokenizing as an exercise for the reader
- sherlockholmes_val_data,
- sherlockholmes_test_data, # This would be your test data, again left as an exercise for the reader
- epochs=3,
- lr=20)
-```
diff --git a/docs/examples/language_model/use_pretrained_lm.md b/docs/examples/language_model/use_pretrained_lm.md
deleted file mode 100644
index c6351c872c..0000000000
--- a/docs/examples/language_model/use_pretrained_lm.md
+++ /dev/null
@@ -1,217 +0,0 @@
-# Using Pre-trained Language Model
-
-A statistical language model is simply a probability distribution over sequences of words or characters [1].
-In this tutorial, we'll restrict our attention to word-based language models.
-Given a reliable language model, we can answer questions like *which among the following strings are we more likely to encounter?*
-
-1. "On Monday, Mr. Lamar’s “DAMN.” took home an even more elusive honor,
-one that may never have even seemed within reach: the Pulitzer Prize."
-1. "Frog zealot flagged xylophone the bean wallaby anaphylaxis extraneous
-porpoise into deleterious carrot banana apricot."
-
-Even if we've never seen either of these sentences in our entire lives, and even though no rapper has previously been
-awarded a Pulitzer Prize, we wouldn't be shocked to see the first sentence in the New York Times.
-By comparison, we can all agree that the second sentence, consisting of incoherent babble, is comparatively unlikely.
-A statistical language model can assign precise probabilities to each of these and other strings of words.
-
-Given a large corpus of text, we can estimate (or, in this case, train) a language model $\hat{p}(x_1, ..., x_n)$.
-And given such a model, we can sample strings $\mathbf{x} \sim \hat{p}(x_1, ..., x_n)$, generating new strings according to their estimated probability.
-Among other useful applications, we can use language models to score candidate transcriptions from speech recognition models, giving preference to sentences that seem more probable (at the expense of those deemed anomalous).
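-
-To make this concrete, here is a toy sketch (added purely for illustration; the bigram probabilities below are made up and the smoothing is deliberately crude) that scores word sequences with the chain rule, $\log \hat{p}(x_1, ..., x_n) = \sum_t \log \hat{p}(x_t \mid x_{t-1})$, under a hypothetical bigram model:
-
-```{.python .input}
-import math
-
-# A made-up bigram model: p(word | previous word). A real model would
-# estimate these probabilities from a large corpus.
-bigram_probs = {
-    ('<bos>', 'the'): 0.4, ('the', 'cat'): 0.3, ('cat', 'sat'): 0.25,
-    ('<bos>', 'frog'): 0.001, ('frog', 'zealot'): 0.0001,
-}
-unseen_prob = 1e-6  # crude smoothing for bigrams the model has never seen
-
-def sentence_log_prob(tokens):
-    """Chain rule: log p(x_1..x_n) = sum_t log p(x_t | x_{t-1})."""
-    log_p = 0.0
-    for prev, cur in zip(['<bos>'] + tokens[:-1], tokens):
-        log_p += math.log(bigram_probs.get((prev, cur), unseen_prob))
-    return log_p
-
-print(sentence_log_prob(['the', 'cat', 'sat']))      # plausible sequence, higher log-probability
-print(sentence_log_prob(['frog', 'zealot', 'sat']))  # incoherent babble, much lower log-probability
-```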
-
-These days recurrent neural networks (RNNs) are the preferred method for language models. In this notebook, we will go through an example of using GluonNLP to
-
-(i) implement a typical LSTM language model architecture
-(ii) train the language model on a corpus of real data
-(iii) bring in your own dataset for training
-(iv) grab off-the-shelf pre-trained state-of-the-art language models (e.g., the AWD language model) using GluonNLP.
-
-## What is a language model (LM)?
-
-The standard approach to language modeling consists of training a model that, given a trailing window of text, predicts the next word in the sequence.
-When we train the model we feed in the inputs $x_1, x_2, ...$ and try at each time step to predict the corresponding next word $x_2, ..., x_{n+1}$.
-To generate text from a language model, we can iteratively predict the next word, and then feed this word as an input to the model at the subsequent time step. The image included below demonstrates this idea.
-
-![Language model](language_model_intro.png)
-
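-In code, this shifted-target setup looks like the following toy sketch (illustrative only; the token sequence is made up):
-
-```{.python .input}
-tokens = ['<bos>', 'the', 'cat', 'sat', 'on', 'the', 'mat', '<eos>']
-
-# At each time step the model reads x_1, ..., x_t and is trained to predict x_{t+1},
-# so the targets are simply the inputs shifted by one position.
-inputs = tokens[:-1]   # x_1, ..., x_n
-targets = tokens[1:]   # x_2, ..., x_{n+1}
-
-for x, y in zip(inputs, targets):
-    print('input: {:>6}  ->  target: {}'.format(x, y))
-```
-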
-## Using a pre-trained AWD LSTM language model
-
-The AWD LSTM language model is a state-of-the-art RNN language model [1]. The main technique it leverages is weight dropout on the recurrent hidden-to-hidden matrices, which prevents overfitting of the recurrent connections.
-
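-Roughly speaking, weight dropout (also known as DropConnect) zeroes out individual entries of the hidden-to-hidden weight matrix on each forward pass, instead of dropping activations. The following NumPy sketch is only an illustration of that idea, not the actual GluonNLP implementation:
-
-```{.python .input}
-import numpy as np
-
-rng = np.random.RandomState(0)
-
-def weight_drop(w_hh, rate=0.5):
-    """DropConnect: randomly zero entries of the recurrent weight matrix
-    and rescale the survivors (training-time behavior)."""
-    mask = rng.uniform(size=w_hh.shape) >= rate
-    return w_hh * mask / (1.0 - rate)
-
-w_hh = rng.randn(4, 4)   # a toy hidden-to-hidden matrix
-h_prev = rng.randn(4)    # previous hidden state
-h_new = np.tanh(weight_drop(w_hh).dot(h_prev))
-print(h_new)
-```
-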
-### Load the vocabulary and the pre-trained model
-
-```{.python .input}
-import warnings
-import math
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-
-warnings.filterwarnings('ignore')
-nlp.utils.check_version('0.7.0')
-
-num_gpus = 1
-context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus else [mx.cpu()]
-log_interval = 200
-
-batch_size = 20 * len(context)
-lr = 20
-epochs = 3
-bptt = 35
-grad_clip = 0.25
-
-dataset_name = 'wikitext-2'
-
-# Load the dataset
-train_dataset, val_dataset, test_dataset = [
- nlp.data.WikiText2(
- segment=segment, bos=None, eos='<eos>', skip_empty=False)
- for segment in ['train', 'val', 'test']
-]
-
-vocab = nlp.Vocab(
- nlp.data.Counter(train_dataset), padding_token=None, bos_token=None)
-
-
-# Batchify for BPTT
-bptt_batchify = nlp.data.batchify.CorpusBPTTBatchify(
- vocab, bptt, batch_size, last_batch='discard')
-train_data, val_data, test_data = [
- bptt_batchify(x) for x in [train_dataset, val_dataset, test_dataset]
-]
-
-awd_model_name = 'awd_lstm_lm_1150'
-awd_model, vocab = nlp.model.get_model(
- awd_model_name,
- vocab=vocab,
- dataset_name=dataset_name,
- pretrained=True,
- ctx=context[0])
-
-print(awd_model)
-print(vocab)
-```
-
-### Evaluate the pre-trained model on the validation and test datasets
-
-```{.python .input}
-# Specify the loss function, in this case, cross-entropy with softmax.
-loss = gluon.loss.SoftmaxCrossEntropyLoss()
-
-
-def detach(hidden):
- if isinstance(hidden, (tuple, list)):
- hidden = [detach(i) for i in hidden]
- else:
- hidden = hidden.detach()
- return hidden
-
-
-# Note that ctx is short for context
-def evaluate(model, data_source, batch_size, ctx):
- total_L = 0.0
- ntotal = 0
- hidden = model.begin_state(
- batch_size=batch_size, func=mx.nd.zeros, ctx=ctx)
- for i, (data, target) in enumerate(data_source):
- data = data.as_in_context(ctx)
- target = target.as_in_context(ctx)
- output, hidden = model(data, hidden)
- hidden = detach(hidden)
- L = loss(output.reshape(-3, -1), target.reshape(-1))
- total_L += mx.nd.sum(L).asscalar()
- ntotal += L.size
- return total_L / ntotal
-
-
-val_L = evaluate(awd_model, val_data, batch_size, context[0])
-test_L = evaluate(awd_model, test_data, batch_size, context[0])
-
-print('Best validation loss %.2f, val ppl %.2f' % (val_L, math.exp(val_L)))
-print('Best test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
-```
-
-## Using a cache LSTM LM
-
-The cache LSTM language model [2] adds a cache-like memory to neural network language models. It can be used in conjunction with the aforementioned AWD LSTM language model or other LSTM models.
-It exploits the hidden outputs to define a probability distribution over the words in the cache,
-and it achieves state-of-the-art results at inference time.
-
-
-
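-Concretely, the cache stores the hidden states of the last `window` time steps together with the words that followed them, scores each cached entry by $\exp(\theta\, h_t^\top h_i)$, and interpolates the resulting distribution with the model's softmax using the interpolation weight (`lambdas` in the next cell). The NumPy sketch below is only an illustration of that interpolation, not the actual implementation returned by `get_cache_model`:
-
-```{.python .input}
-import numpy as np
-
-def cache_probs(h_t, cache_hiddens, cache_words, vocab_size, theta):
-    """Pointer-style distribution over the words stored in the cache."""
-    scores = np.exp(theta * cache_hiddens.dot(h_t))  # similarity to each cached hidden state
-    p = np.zeros(vocab_size)
-    for word_id, s in zip(cache_words, scores):
-        p[word_id] += s
-    return p / p.sum()
-
-rng = np.random.RandomState(0)
-vocab_size, hidden_size, cache_window = 10, 8, 4
-h_t = rng.randn(hidden_size)
-cache_hiddens = rng.randn(cache_window, hidden_size)         # hidden states of recent steps
-cache_words = rng.randint(0, vocab_size, size=cache_window)  # the word that followed each of them
-p_model = np.full(vocab_size, 1.0 / vocab_size)              # stand-in for the LM softmax output
-
-p_cache = cache_probs(h_t, cache_hiddens, cache_words, vocab_size, theta=0.662)
-lambda_cache = 0.1279   # corresponds to `lambdas` in the next cell
-p_final = (1 - lambda_cache) * p_model + lambda_cache * p_cache
-print(p_final.sum())    # still a valid probability distribution
-```
-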
-### Load the pre-trained model and define the hyperparameters
-
-```{.python .input}
-window = 2
-theta = 0.662
-lambdas = 0.1279
-bptt = 2000
-cache_model = nlp.model.train.get_cache_model(name=awd_model_name,
- dataset_name=dataset_name,
- window=window,
- theta=theta,
- lambdas=lambdas,
- ctx=context[0])
-
-print(cache_model)
-```
-
-### Define specific get_batch and evaluation helper functions for the cache model
-
-Note that these helper functions are very similar to the ones we defined above, but differ slightly to work with the cache model.
-
-```{.python .input}
-val_test_batch_size = 1
-val_test_batchify = nlp.data.batchify.CorpusBatchify(vocab, val_test_batch_size)
-val_data = val_test_batchify(val_dataset)
-test_data = val_test_batchify(test_dataset)
-```
-
-```{.python .input}
-def get_batch(data_source, i, seq_len=None):
- seq_len = min(seq_len if seq_len else bptt, len(data_source) - 1 - i)
- data = data_source[i:i + seq_len]
- target = data_source[i + 1:i + 1 + seq_len]
- return data, target
-```
-
-```{.python .input}
-def evaluate_cache(model, data_source, batch_size, ctx):
- total_L = 0.0
- hidden = model.begin_state(
- batch_size=batch_size, func=mx.nd.zeros, ctx=ctx)
- next_word_history = None
- cache_history = None
- for i in range(0, len(data_source) - 1, bptt):
- if i > 0:
- print('Batch %d, ppl %f' % (i, math.exp(total_L / i)))
- if i == bptt:
- return total_L / i
- data, target = get_batch(data_source, i)
- data = data.as_in_context(ctx)
- target = target.as_in_context(ctx)
- L = 0
- outs, next_word_history, cache_history, hidden = model(
- data, target, next_word_history, cache_history, hidden)
- for out in outs:
- L += (-mx.nd.log(out)).asscalar()
- total_L += L / data.shape[1]
- hidden = detach(hidden)
- return total_L / len(data_source)
-```
-
-### Evaluate the pre-trained model on the validation and test datasets
-
-```{.python .input}
-val_L = evaluate_cache(cache_model, val_data, val_test_batch_size, context[0])
-test_L = evaluate_cache(cache_model, test_data, val_test_batch_size, context[0])
-
-print('Best validation loss %.2f, val ppl %.2f'%(val_L, math.exp(val_L)))
-print('Best test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
-```
-
-
-## References
-
-[1] Merity, S., et al. “Regularizing and optimizing LSTM language models”. ICLR 2018
-
-[2] Grave, E., et al. “Improving neural language models with a continuous cache”. ICLR 2017
diff --git a/docs/examples/machine_translation/dataprocessor.py b/docs/examples/machine_translation/dataprocessor.py
deleted file mode 100644
index 8e5e63f4d4..0000000000
--- a/docs/examples/machine_translation/dataprocessor.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Data preprocessing for transformer."""
-
-import os
-import io
-import time
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-import nmt
-import hyperparameters as hparams
-
-def cache_dataset(dataset, prefix):
- """Cache the processed npy dataset the dataset into a npz
-
- Parameters
- ----------
- dataset : SimpleDataset
- prefix : str
- """
- if not os.path.exists(nmt._constants.CACHE_PATH):
- os.makedirs(nmt._constants.CACHE_PATH)
- src_data = np.concatenate([e[0] for e in dataset])
- tgt_data = np.concatenate([e[1] for e in dataset])
- src_cumlen = np.cumsum([0]+[len(e[0]) for e in dataset])
- tgt_cumlen = np.cumsum([0]+[len(e[1]) for e in dataset])
- np.savez(os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz'),
- src_data=src_data, tgt_data=tgt_data,
- src_cumlen=src_cumlen, tgt_cumlen=tgt_cumlen)
-
-
-def load_cached_dataset(prefix):
- cached_file_path = os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz')
- if os.path.exists(cached_file_path):
- print('Loading dataset...')
- npz_data = np.load(cached_file_path)
- src_data, tgt_data, src_cumlen, tgt_cumlen = [npz_data[n] for n in
- ['src_data', 'tgt_data', 'src_cumlen', 'tgt_cumlen']]
- src_data = np.array([src_data[low:high] for low, high in zip(src_cumlen[:-1], src_cumlen[1:])])
- tgt_data = np.array([tgt_data[low:high] for low, high in zip(tgt_cumlen[:-1], tgt_cumlen[1:])])
- return gluon.data.ArrayDataset(np.array(src_data), np.array(tgt_data))
- else:
- return None
-
-
-class TrainValDataTransform(object):
- """Transform the machine translation dataset.
-
- Clip the source and target sentences to the maximum length. For the source sentence, append the
- EOS. For the target sentence, append BOS and EOS.
-
- Parameters
- ----------
- src_vocab : Vocab
- tgt_vocab : Vocab
- src_max_len : int
- tgt_max_len : int
- """
-
- def __init__(self, src_vocab, tgt_vocab, src_max_len=None, tgt_max_len=None):
- self._src_vocab = src_vocab
- self._tgt_vocab = tgt_vocab
- self._src_max_len = src_max_len
- self._tgt_max_len = tgt_max_len
-
- def __call__(self, src, tgt):
- if self._src_max_len:
- src_sentence = self._src_vocab[src.split()[:self._src_max_len]]
- else:
- src_sentence = self._src_vocab[src.split()]
- if self._tgt_max_len:
- tgt_sentence = self._tgt_vocab[tgt.split()[:self._tgt_max_len]]
- else:
- tgt_sentence = self._tgt_vocab[tgt.split()]
- src_sentence.append(self._src_vocab[self._src_vocab.eos_token])
- tgt_sentence.insert(0, self._tgt_vocab[self._tgt_vocab.bos_token])
- tgt_sentence.append(self._tgt_vocab[self._tgt_vocab.eos_token])
- src_npy = np.array(src_sentence, dtype=np.int32)
- tgt_npy = np.array(tgt_sentence, dtype=np.int32)
- return src_npy, tgt_npy
-
-
-def process_dataset(dataset, src_vocab, tgt_vocab, src_max_len=-1, tgt_max_len=-1):
- start = time.time()
- dataset_processed = dataset.transform(TrainValDataTransform(src_vocab, tgt_vocab,
- src_max_len,
- tgt_max_len), lazy=False)
- end = time.time()
- print('Processing Time spent: {}'.format(end - start))
- return dataset_processed
-
-
-def load_translation_data(dataset, src_lang='en', tgt_lang='de'):
- """Load translation dataset
-
- Parameters
- ----------
- dataset : str
- src_lang : str, default 'en'
- tgt_lang : str, default 'de'
-
- Returns
- -------
-
- """
- if dataset == 'WMT2014BPE':
- common_prefix = 'WMT2014BPE_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- hparams.src_max_len, hparams.tgt_max_len)
- data_train = nlp.data.WMT2014BPE('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = nlp.data.WMT2014BPE('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = nlp.data.WMT2014BPE('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang,
- full=False)
- elif dataset == 'TOY':
- common_prefix = 'TOY_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- hparams.src_max_len, hparams.tgt_max_len)
- data_train = nmt.dataset.TOY('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = nmt.dataset.TOY('val', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = nmt.dataset.TOY('test', src_lang=src_lang, tgt_lang=tgt_lang)
- else:
- raise NotImplementedError
- src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab
- data_train_processed = load_cached_dataset(common_prefix + '_train')
- if not data_train_processed:
- data_train_processed = process_dataset(data_train, src_vocab, tgt_vocab,
- hparams.src_max_len, hparams.tgt_max_len)
- cache_dataset(data_train_processed, common_prefix + '_train')
- data_val_processed = load_cached_dataset(common_prefix + '_val')
- if not data_val_processed:
- data_val_processed = process_dataset(data_val, src_vocab, tgt_vocab)
- cache_dataset(data_val_processed, common_prefix + '_val')
- data_test_processed = load_cached_dataset(common_prefix + '_' + str(False) + '_test')
- if not data_test_processed:
- data_test_processed = process_dataset(data_test, src_vocab, tgt_vocab)
- cache_dataset(data_test_processed, common_prefix + '_' + str(False) + '_test')
- fetch_tgt_sentence = lambda src, tgt: tgt
- if dataset == 'WMT2014BPE':
- val_text = nlp.data.WMT2014('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang)
- test_text = nlp.data.WMT2014('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang,
- full=False)
- elif dataset == 'TOY':
- val_text = data_val
- test_text = data_test
- else:
- raise NotImplementedError
- val_tgt_sentences = list(val_text.transform(fetch_tgt_sentence))
- test_tgt_sentences = list(test_text.transform(fetch_tgt_sentence))
- return data_train_processed, data_val_processed, data_test_processed, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab
-
-
-def get_data_lengths(dataset):
- return list(dataset.transform(lambda srg, tgt: (len(srg), len(tgt))))
diff --git a/docs/examples/machine_translation/gnmt.md b/docs/examples/machine_translation/gnmt.md
deleted file mode 100644
index edfc566a46..0000000000
--- a/docs/examples/machine_translation/gnmt.md
+++ /dev/null
@@ -1,531 +0,0 @@
-# Training GNMT on IWSLT 2015 Dataset
-
-In this notebook, we are going to train Google NMT on IWSLT 2015 English-Vietnamese
-Dataset. The building process includes four key steps:
-
-1. Load and preprocess the dataset
-
-2. Create a sampler and `DataLoader`
-
-3. Build the actual model
-
-4. Write the training algorithm
-
-This tutorial will guide you through each of the steps and explain briefly how each works. Please remember to click the download button at the top of the page to download the necessary files to follow this tutorial.
-
-## Setup
-
-Firstly, we need to set up the environment and import the necessary modules. For this tutorial, a GPU is strongly recommended.
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-
-import argparse
-import time
-import random
-import os
-import io
-import logging
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-import nmt
-nlp.utils.check_version('0.7.0')
-```
-
-Next, we need to specify the hyperparameters for the dataset, the model, and for training and testing time.
-
-```{.python .input}
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-ctx = mx.gpu(0)
-
-# parameters for dataset
-dataset = 'IWSLT2015'
-src_lang, tgt_lang = 'en', 'vi'
-src_max_len, tgt_max_len = 50, 50
-
-# parameters for model
-num_hidden = 512
-num_layers = 2
-num_bi_layers = 1
-dropout = 0.2
-
-# parameters for training
-batch_size, test_batch_size = 128, 32
-num_buckets = 5
-epochs = 1
-clip = 5
-lr = 0.001
-lr_update_factor = 0.5
-log_interval = 10
-save_dir = 'gnmt_en_vi_u512'
-
-#parameters for testing
-beam_size = 10
-lp_alpha = 1.0
-lp_k = 5
-
-nmt.utils.logging_config(save_dir)
-```
-
-## Loading and processing the dataset
-
-The following shows how to process the dataset and cache the processed dataset
-for future use. The processing steps include the following:
-
-1. Clipping the source and target sequences
-2. Splitting the string input to a list of tokens
-3. Mapping the string token onto its integer index in the vocabulary
-4. Appending the end-of-sentence (EOS) token to the source sentence and adding BOS and EOS tokens to the target sentence
-
-
-Firstly, we load and cache the dataset with the two helper functions `cache_dataset` and `load_cached_dataset`. The functions are straightforward and well commented so no further explanation will be given.
-
-```{.python .input}
-def cache_dataset(dataset, prefix):
- """Cache the processed npy dataset the dataset into an npz file
-
- Parameters
- ----------
- dataset : gluon.data.SimpleDataset
- prefix : str
- """
- if not os.path.exists(nmt._constants.CACHE_PATH):
- os.makedirs(nmt._constants.CACHE_PATH)
- src_data = np.concatenate([e[0] for e in dataset])
- tgt_data = np.concatenate([e[1] for e in dataset])
- src_cumlen = np.cumsum([0]+[len(e[0]) for e in dataset])
- tgt_cumlen = np.cumsum([0]+[len(e[1]) for e in dataset])
- np.savez(os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz'),
- src_data=src_data, tgt_data=tgt_data,
- src_cumlen=src_cumlen, tgt_cumlen=tgt_cumlen)
-
-
-def load_cached_dataset(prefix):
- cached_file_path = os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz')
- if os.path.exists(cached_file_path):
- print('Load cached data from {}'.format(cached_file_path))
- npz_data = np.load(cached_file_path)
- src_data, tgt_data, src_cumlen, tgt_cumlen = [npz_data[n] for n in
- ['src_data', 'tgt_data', 'src_cumlen', 'tgt_cumlen']]
- src_data = np.array([src_data[low:high] for low, high in zip(src_cumlen[:-1], src_cumlen[1:])])
- tgt_data = np.array([tgt_data[low:high] for low, high in zip(tgt_cumlen[:-1], tgt_cumlen[1:])])
- return gluon.data.ArrayDataset(np.array(src_data), np.array(tgt_data))
- else:
- return None
-
-```
-
-Next, we write the class `TrainValDataTransform` to have easy access to transforming and clipping the source and target sentences. This class also adds the EOS and BOS tokens for cleaner data. Please refer to the comments in the code for more details.
-
-```{.python .input}
-class TrainValDataTransform(object):
- """Transform the machine translation dataset.
-
- Clip the source and target sentences to the maximum length. For the source sentence, append the
- EOS. For the target sentence, append BOS and EOS.
-
- Parameters
- ----------
- src_vocab : Vocab
- tgt_vocab : Vocab
- src_max_len : int
- tgt_max_len : int
- """
-
- def __init__(self, src_vocab, tgt_vocab, src_max_len, tgt_max_len):
- # On initialization of the class, we set the class variables
- self._src_vocab = src_vocab
- self._tgt_vocab = tgt_vocab
- self._src_max_len = src_max_len
- self._tgt_max_len = tgt_max_len
-
- def __call__(self, src, tgt):
- # On actual calling of the class, we perform the clipping then the appending of the EOS and BOS tokens.
- if self._src_max_len > 0:
- src_sentence = self._src_vocab[src.split()[:self._src_max_len]]
- else:
- src_sentence = self._src_vocab[src.split()]
- if self._tgt_max_len > 0:
- tgt_sentence = self._tgt_vocab[tgt.split()[:self._tgt_max_len]]
- else:
- tgt_sentence = self._tgt_vocab[tgt.split()]
- src_sentence.append(self._src_vocab[self._src_vocab.eos_token])
- tgt_sentence.insert(0, self._tgt_vocab[self._tgt_vocab.bos_token])
- tgt_sentence.append(self._tgt_vocab[self._tgt_vocab.eos_token])
- src_npy = np.array(src_sentence, dtype=np.int32)
- tgt_npy = np.array(tgt_sentence, dtype=np.int32)
- return src_npy, tgt_npy
-```
-
-We leverage the class written above to create a helper function that processes the dataset in very few lines of code.
-
-```{.python .input}
-def process_dataset(dataset, src_vocab, tgt_vocab, src_max_len=-1, tgt_max_len=-1):
- start = time.time()
- dataset_processed = dataset.transform(TrainValDataTransform(src_vocab, tgt_vocab,
- src_max_len,
- tgt_max_len), lazy=False)
- end = time.time()
- print('Processing time spent: {}'.format(end - start))
- return dataset_processed
-```
-
-Here we define a function `load_translation_data` that combines all the above steps to load the data, check if it's been processed, and if not, process the data. The method returns all of the required data for training, validating, and testing our model. Please refer to the comments in the code for more information on what each piece does.
-
-```{.python .input}
-def load_translation_data(dataset, src_lang='en', tgt_lang='vi'):
- """Load translation dataset
-
- Parameters
- ----------
- dataset : str
- src_lang : str, default 'en'
- tgt_lang : str, default 'vi'
-
- Returns
- -------
- data_train_processed : Dataset
- The preprocessed training sentence pairs
- data_val_processed : Dataset
- The preprocessed validation sentence pairs
- data_test_processed : Dataset
- The preprocessed test sentence pairs
- val_tgt_sentences : list
- The target sentences in the validation set
- test_tgt_sentences : list
- The target sentences in the test set
- src_vocab : Vocab
- Vocabulary of the source language
- tgt_vocab : Vocab
- Vocabulary of the target language
- """
- common_prefix = 'IWSLT2015_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- src_max_len, tgt_max_len)
-
- # Load the three datasets from files
- data_train = nlp.data.IWSLT2015('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = nlp.data.IWSLT2015('val', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = nlp.data.IWSLT2015('test', src_lang=src_lang, tgt_lang=tgt_lang)
- src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab
- data_train_processed = load_cached_dataset(common_prefix + '_train')
-
- # Check if each dataset has been processed or not, and if not, process and cache them.
- if not data_train_processed:
- data_train_processed = process_dataset(data_train, src_vocab, tgt_vocab,
- src_max_len, tgt_max_len)
- cache_dataset(data_train_processed, common_prefix + '_train')
- data_val_processed = load_cached_dataset(common_prefix + '_val')
- if not data_val_processed:
- data_val_processed = process_dataset(data_val, src_vocab, tgt_vocab)
- cache_dataset(data_val_processed, common_prefix + '_val')
- data_test_processed = load_cached_dataset(common_prefix + '_test')
- if not data_test_processed:
- data_test_processed = process_dataset(data_test, src_vocab, tgt_vocab)
- cache_dataset(data_test_processed, common_prefix + '_test')
-
- # Pull out the target sentences for both test and validation
- fetch_tgt_sentence = lambda src, tgt: tgt.split()
- val_tgt_sentences = list(data_val.transform(fetch_tgt_sentence))
- test_tgt_sentences = list(data_test.transform(fetch_tgt_sentence))
-
- # Return all of the necessary pieces we can extract from the data for training our model
- return data_train_processed, data_val_processed, data_test_processed, \
- val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab
-```
-
-We define one last helper function, `get_data_lengths`, to get the lengths of the datasets, again for cleaner code later.
-```{.python .input}
-def get_data_lengths(dataset):
- return list(dataset.transform(lambda srg, tgt: (len(srg), len(tgt))))
-
-```
-
-And for the last step of processing, we leverage all of our helper functions to keep the main code concise, at roughly 15-20 lines. This performs all of the aforementioned processing and stores the necessary information in memory for training our model.
-
-```{.python .input}
-data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab\
- = load_translation_data(dataset=dataset, src_lang=src_lang, tgt_lang=tgt_lang)
-data_train_lengths = get_data_lengths(data_train)
-data_val_lengths = get_data_lengths(data_val)
-data_test_lengths = get_data_lengths(data_test)
-
-with io.open(os.path.join(save_dir, 'val_gt.txt'), 'w', encoding='utf-8') as of:
- for ele in val_tgt_sentences:
- of.write(' '.join(ele) + '\n')
-
-with io.open(os.path.join(save_dir, 'test_gt.txt'), 'w', encoding='utf-8') as of:
- for ele in test_tgt_sentences:
- of.write(' '.join(ele) + '\n')
-
-
-data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False)
-data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_val)])
-data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_test)])
-```
-
-## Sampler and `DataLoader` construction
-
-Now, we have obtained and stored all of the relevant data information. The next step
-is to construct the sampler and `DataLoader`. The first step is to use the `batchify`
-function, which pads and stacks sequences to form mini-batches.
-
-```{.python .input}
-train_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Stack(dtype='float32'),
- nlp.data.batchify.Stack(dtype='float32'))
-test_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Stack(dtype='float32'),
- nlp.data.batchify.Stack(dtype='float32'),
- nlp.data.batchify.Stack())
-```
-
-We can then construct bucketing samplers, which generate batches by grouping
-sequences with similar lengths. Here, the bucketing scheme is empirically determined.
-
-```{.python .input}
-bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
-train_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_train_lengths,
- batch_size=batch_size,
- num_buckets=num_buckets,
- shuffle=True,
- bucket_scheme=bucket_scheme)
-logging.info('Train Batch Sampler:\n{}'.format(train_batch_sampler.stats()))
-val_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_val_lengths,
- batch_size=test_batch_size,
- num_buckets=num_buckets,
- shuffle=False)
-logging.info('Valid Batch Sampler:\n{}'.format(val_batch_sampler.stats()))
-test_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_test_lengths,
- batch_size=test_batch_size,
- num_buckets=num_buckets,
- shuffle=False)
-logging.info('Test Batch Sampler:\n{}'.format(test_batch_sampler.stats()))
-```
-
-Given the samplers, we can create a `DataLoader`, which is iterable. This is simply a data construct (an iterator) that feeds the model one batch at a time. For more information refer to [this](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/datasets.html) page.
-
-```{.python .input}
-train_data_loader = gluon.data.DataLoader(data_train,
- batch_sampler=train_batch_sampler,
- batchify_fn=train_batchify_fn,
- num_workers=4)
-val_data_loader = gluon.data.DataLoader(data_val,
- batch_sampler=val_batch_sampler,
- batchify_fn=test_batchify_fn,
- num_workers=4)
-test_data_loader = gluon.data.DataLoader(data_test,
- batch_sampler=test_batch_sampler,
- batchify_fn=test_batchify_fn,
- num_workers=4)
-```
-
-## Building the GNMT model
-
-After obtaining the DataLoader, we can finally build the model. The GNMT encoder and decoder
-can be easily constructed by calling the `get_gnmt_encoder_decoder` function. Then, we
-feed the encoder and decoder to the `NMTModel` to construct the GNMT model.
-
-`model.hybridize` allows computation to be done using the symbolic backend. To understand what it means to be "hybridized," please refer to [this](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/hybrid.html) page on MXNet hybridization and its advantages.
-
-```{.python .input}
-encoder, decoder, one_step_ahead_decoder = nmt.gnmt.get_gnmt_encoder_decoder(
- hidden_size=num_hidden, dropout=dropout, num_layers=num_layers,
- num_bi_layers=num_bi_layers)
-model = nlp.model.translation.NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder,
- decoder=decoder, one_step_ahead_decoder=one_step_ahead_decoder,
- embed_size=num_hidden, prefix='gnmt_')
-model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
-static_alloc = True
-model.hybridize(static_alloc=static_alloc)
-logging.info(model)
-
-# Due to the paddings, we need to mask out the losses corresponding to padding tokens.
-loss_function = nlp.loss.MaskedSoftmaxCELoss()
-loss_function.hybridize(static_alloc=static_alloc)
-```
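-
-To see why the masking matters, here is a tiny NumPy illustration (added for clarity; it mimics the masking idea, not the exact reduction used by `MaskedSoftmaxCELoss`) of ignoring the loss terms that fall on padding positions:
-
-```{.python .input}
-import numpy as np
-
-# Toy per-token losses for a batch of two sequences padded to length 5
-token_losses = np.array([[1.2, 0.8, 0.5, 0.3, 0.9],
-                         [0.7, 0.6, 1.1, 0.4, 0.2]])
-valid_length = np.array([3, 5])  # true (unpadded) lengths
-
-# Mask that is 1 for real tokens and 0 for padding
-positions = np.arange(token_losses.shape[1])
-mask = (positions[None, :] < valid_length[:, None]).astype(token_losses.dtype)
-
-# Average the loss over the valid tokens only
-masked_loss = (token_losses * mask).sum(axis=1) / valid_length
-print(masked_loss)
-```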
-
-Here, we build the `BeamSearchTranslator` and define a predetermined `BeamSearchScorer` as the heuristic mechanism for the search. For more information on beam search and its applications to NLP, see [here](https://en.wikipedia.org/wiki/Beam_search).
-
-```{.python .input}
-translator = nmt.translation.BeamSearchTranslator(model=model, beam_size=beam_size,
- scorer=nlp.model.BeamSearchScorer(alpha=lp_alpha,
- K=lp_k),
- max_length=tgt_max_len + 100)
-logging.info('Use beam_size={}, alpha={}, K={}'.format(beam_size, lp_alpha, lp_k))
-```
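-
-As a rough sketch of the scoring idea (assuming the GNMT-style length penalty that the `alpha` and `K` arguments above suggest; this is an illustration, not the `BeamSearchScorer` source), candidate scores are length-normalized log-probabilities:
-
-```{.python .input}
-alpha, K = 1.0, 5   # the lp_alpha and lp_k values set in the hyperparameter cell above
-
-def length_penalty(length):
-    """GNMT-style length penalty: ((K + length) / (K + 1)) ** alpha."""
-    return ((K + length) / (K + 1.0)) ** alpha
-
-def normalized_score(log_prob, length):
-    return log_prob / length_penalty(length)
-
-# Normalization shrinks the magnitude of the (negative) log-probability of longer
-# candidates, so beam search does not systematically favor very short translations.
-log_p, length = -9.0, 10
-print('raw: %.2f  normalized: %.2f' % (log_p, normalized_score(log_p, length)))
-```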
-
-We define the evaluation function as shown in the code block below. The `evaluate` function uses the beam
-search translator to generate outputs for the validation and testing datasets. Please refer to the comments in the code for more information on what each piece does. In addition, we add the `write_sentences` helper method to easily output the sentences.
-
-```{.python .input}
-def evaluate(data_loader):
- """Evaluate given the data loader
-
- Parameters
- ----------
- data_loader : gluon.data.DataLoader
-
- Returns
- -------
- avg_loss : float
- Average loss
- real_translation_out : list of list of str
- The translation output
- """
- translation_out = []
- all_inst_ids = []
- avg_loss_denom = 0
- avg_loss = 0.0
-
- for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \
- in enumerate(data_loader):
- src_seq = src_seq.as_in_context(ctx)
- tgt_seq = tgt_seq.as_in_context(ctx)
- src_valid_length = src_valid_length.as_in_context(ctx)
- tgt_valid_length = tgt_valid_length.as_in_context(ctx)
-
- # Calculate Loss
- out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
- loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar()
- all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
- avg_loss += loss * (tgt_seq.shape[1] - 1)
- avg_loss_denom += (tgt_seq.shape[1] - 1)
-
- # Translate the sequences and score them
- samples, _, sample_valid_length =\
- translator.translate(src_seq=src_seq, src_valid_length=src_valid_length)
- max_score_sample = samples[:, 0, :].asnumpy()
- sample_valid_length = sample_valid_length[:, 0].asnumpy()
-
- # Iterate through the tokens and stitch the tokens together for the sentence
- for i in range(max_score_sample.shape[0]):
- translation_out.append(
- [tgt_vocab.idx_to_token[ele] for ele in
- max_score_sample[i][1:(sample_valid_length[i] - 1)]])
-
- # Calculate the average loss and initialize a None-filled translation list
- avg_loss = avg_loss / avg_loss_denom
- real_translation_out = [None for _ in range(len(all_inst_ids))]
-
- # Combine all the words/tokens into a sentence for the final translation
- for ind, sentence in zip(all_inst_ids, translation_out):
- real_translation_out[ind] = sentence
-
- # Return the loss and the translation
- return avg_loss, real_translation_out
-
-
-def write_sentences(sentences, file_path):
- with io.open(file_path, 'w', encoding='utf-8') as of:
- for sent in sentences:
- of.write(' '.join(sent) + '\n')
-```
-
-## Training
-
-Before entering the training stage, we need to create a trainer for updating the
-parameters based on the loss. In the following example, we create a trainer that uses the ADAM
-optimizer.
-
-```{.python .input}
-trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr})
-```
-
-We can then write the training loop. During training, we evaluate on the validation and testing datasets every epoch, and record the
-parameters that give the highest [Bilingual Evaluation Understudy (BLEU)](https://www.aclweb.org/anthology/P02-1040.pdf) score on the validation dataset. Before
-performing forward and backward computation, we first use the `as_in_context` function to copy
-the mini-batch to the GPU. The statement `with mx.autograd.record()` tells Gluon's
-backend to compute the gradients for the part inside the block.
-
-```{.python .input}
-best_valid_bleu = 0.0
-
-# Run through each epoch
-for epoch_id in range(epochs):
- log_avg_loss = 0
- log_avg_gnorm = 0
- log_wc = 0
- log_start_time = time.time()
-
- # Iterate through each batch
- for batch_id, (src_seq, tgt_seq, src_valid_length, tgt_valid_length)\
- in enumerate(train_data_loader):
-
- src_seq = src_seq.as_in_context(ctx)
- tgt_seq = tgt_seq.as_in_context(ctx)
- src_valid_length = src_valid_length.as_in_context(ctx)
- tgt_valid_length = tgt_valid_length.as_in_context(ctx)
-
- # Compute gradients and losses
- with mx.autograd.record():
- out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
- loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean()
- loss = loss * (tgt_seq.shape[1] - 1) / (tgt_valid_length - 1).mean()
- loss.backward()
-
- grads = [p.grad(ctx) for p in model.collect_params().values()]
- gnorm = gluon.utils.clip_global_norm(grads, clip)
- trainer.step(1)
- src_wc = src_valid_length.sum().asscalar()
- tgt_wc = (tgt_valid_length - 1).sum().asscalar()
- step_loss = loss.asscalar()
- log_avg_loss += step_loss
- log_avg_gnorm += gnorm
- log_wc += src_wc + tgt_wc
- if (batch_id + 1) % log_interval == 0:
- wps = log_wc / (time.time() - log_start_time)
- logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, gnorm={:.4f}, '
- 'throughput={:.2f}K wps, wc={:.2f}K'
- .format(epoch_id, batch_id + 1, len(train_data_loader),
- log_avg_loss / log_interval,
- np.exp(log_avg_loss / log_interval),
- log_avg_gnorm / log_interval,
- wps / 1000, log_wc / 1000))
- log_start_time = time.time()
- log_avg_loss = 0
- log_avg_gnorm = 0
- log_wc = 0
-
- # Evaluate the losses on validation and test datasets and find the corresponding BLEU score and log it
- valid_loss, valid_translation_out = evaluate(val_data_loader)
- valid_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([val_tgt_sentences], valid_translation_out)
- logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
- .format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
- test_loss, test_translation_out = evaluate(test_data_loader)
- test_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([test_tgt_sentences], test_translation_out)
- logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
- .format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100))
-
- # Output the sentences we predicted on the validation and test datasets
- write_sentences(valid_translation_out,
- os.path.join(save_dir, 'epoch{:d}_valid_out.txt').format(epoch_id))
- write_sentences(test_translation_out,
- os.path.join(save_dir, 'epoch{:d}_test_out.txt').format(epoch_id))
-
- # Save the model if the BLEU score is better than the previous best
- if valid_bleu_score > best_valid_bleu:
- best_valid_bleu = valid_bleu_score
- save_path = os.path.join(save_dir, 'valid_best.params')
- logging.info('Save best parameters to {}'.format(save_path))
- model.save_parameters(save_path)
-
- # Update the learning rate based on the number of epochs that have passed
- if epoch_id + 1 >= (epochs * 2) // 3:
- new_lr = trainer.learning_rate * lr_update_factor
- logging.info('Learning rate change to {}'.format(new_lr))
- trainer.set_learning_rate(new_lr)
-```
-
-## Conclusion
-In this notebook, we have shown how to train a GNMT model on the IWSLT 2015 English-Vietnamese dataset using the Gluon NLP toolkit.
-The complete training script can be found [here](https://github.com/dmlc/gluon-nlp/blob/master/scripts/machine_translation/train_gnmt.py).
-The code sequence to reproduce the results can be seen on the [machine translation page](http://gluon-nlp.mxnet.io/model_zoo/machine_translation/index.html).
diff --git a/docs/examples/machine_translation/hyperparameters.py b/docs/examples/machine_translation/hyperparameters.py
deleted file mode 100644
index f0e31a5949..0000000000
--- a/docs/examples/machine_translation/hyperparameters.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Hyperparameters for transformer."""
-
-import nmt
-
-# parameters for dataset
-src_lang = 'en'
-tgt_lang = 'de'
-src_max_len = -1
-tgt_max_len = -1
-
-# parameters for model
-num_units = 512
-hidden_size = 2048
-dropout = 0.1
-epsilon = 0.1
-num_layers = 6
-num_heads = 8
-scaled = True
-
-# parameters for training
-optimizer = 'adam'
-epochs = 3
-batch_size = 2700
-test_batch_size = 256
-num_accumulated = 1
-lr = 2
-warmup_steps = 1
-save_dir = 'transformer_en_de_u512'
-average_start = 1
-num_buckets = 20
-log_interval = 10
-bleu = '13a'
-
-#parameters for testing
-beam_size = 4
-lp_alpha = 0.6
-lp_k = 5
\ No newline at end of file
diff --git a/docs/examples/machine_translation/index.rst b/docs/examples/machine_translation/index.rst
deleted file mode 100644
index 061e1197e2..0000000000
--- a/docs/examples/machine_translation/index.rst
+++ /dev/null
@@ -1,28 +0,0 @@
-Machine Translation
-===================
-
-.. container:: cards
-
- .. card::
- :title: Training GNMT on IWSLT 2015 Dataset
- :link: gnmt.html
-
- Learn how to train Google Neural Machine Translation, a sequence-to-sequence model with attention.
-
- .. card::
- :title: Using Pre-trained Transformer
- :link: transformer.html
-
- Learn how to use a pre-trained transformer translation model for English-German translation.
-
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- gnmt.ipynb
- transformer.ipynb
-
-
-
diff --git a/docs/examples/machine_translation/nmt b/docs/examples/machine_translation/nmt
deleted file mode 120000
index aa09220de6..0000000000
--- a/docs/examples/machine_translation/nmt
+++ /dev/null
@@ -1 +0,0 @@
-../../model_zoo/machine_translation
\ No newline at end of file
diff --git a/docs/examples/machine_translation/transformer.md b/docs/examples/machine_translation/transformer.md
deleted file mode 100644
index d8cdb50654..0000000000
--- a/docs/examples/machine_translation/transformer.md
+++ /dev/null
@@ -1,249 +0,0 @@
-# Using Pre-trained Transformer
-
-In this notebook, we will show how to use the Transformer introduced in [1] and evaluate the pre-trained model with GluonNLP. The Transformer model has been shown to be more accurate and easier to parallelize than previous seq2seq-based models such as Google Neural Machine Translation. We will load the state-of-the-art pre-trained Transformer model, evaluate it on newstest2014, and translate a few sentences ourselves with the `BeamSearchTranslator`.
-
-## Setup
-
-We start with some usual preparation such as importing libraries and setting the environment.
-
-
-### Load MXNet and GluonNLP
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-
-import random
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-nlp.utils.check_version('0.7.0')
-```
-
-### Setup the environment
-
-```{.python .input}
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-ctx = mx.gpu(0)
-```
-
-## Using the pre-trained transformer model
-
-Next, we load the Transformer model from the GluonNLP model zoo and evaluate it on the full `newstest2014` segment of the WMT 2014 English-German test dataset.
-
-### Load the transformer
-
-We load the pre-trained Transformer using the model API in GluonNLP, which returns the source and target vocabulary along with the model.
-
-```{.python .input}
-import nmt
-
-wmt_model_name = 'transformer_en_de_512'
-
-wmt_transformer_model, wmt_src_vocab, wmt_tgt_vocab = \
- nlp.model.get_model(wmt_model_name,
- dataset_name='WMT2014',
- pretrained=True,
- ctx=ctx)
-
-# we are using a mixed EN-DE vocabulary, so the source and target vocabularies are the same
-print(len(wmt_src_vocab), len(wmt_tgt_vocab))
-```
-
-The Transformer model architecture is shown below:
-
-![transformer](transformer.png)
-
-### Load and preprocess the dataset
-
-We then load the `newstest2014` segment of the WMT 2014 English-German test dataset for evaluation purposes.
-
-The following shows how to process the dataset and cache the processed dataset
-for future use. The processing steps include:
-
-1) clip the source and target sequences
-2) split the string input to a list of tokens
-3) map the string token into its index in the vocabulary
-4) append EOS token to source sentence and add BOS and EOS tokens to target sentence.
-
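-Before turning to the actual dataset classes, here is a minimal, self-contained sketch of the four steps above on a toy sentence pair. The tiny vocabularies and sentences are invented purely for illustration; the real pipeline below relies on the BPE vocabularies returned by the model API and on the `dataprocessor.TrainValDataTransform` helper.
-
-```{.python .input}
-# Toy illustration only -- these vocabularies are made up for this sketch.
-toy_src_vocab = {'we': 0, 'love': 1, 'language': 2, '<eos>': 3}
-toy_tgt_vocab = {'<bos>': 0, '<eos>': 1, 'wir': 2, 'lieben': 3, 'sprache': 4}
-
-src_raw, tgt_raw = 'we love language', 'wir lieben sprache'
-src_tokens = src_raw.split()[:100]   # 1)-2) tokenize and clip to a maximum length
-tgt_tokens = tgt_raw.split()[:100]
-src_ids = [toy_src_vocab[tok] for tok in src_tokens]   # 3) map tokens to vocabulary indices
-tgt_ids = [toy_tgt_vocab[tok] for tok in tgt_tokens]
-src_ids = src_ids + [toy_src_vocab['<eos>']]           # 4) append EOS to the source
-tgt_ids = [toy_tgt_vocab['<bos>']] + tgt_ids + [toy_tgt_vocab['<eos>']]  # and BOS/EOS to the target
-print(src_ids, tgt_ids)
-```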
-Let's first look at the WMT 2014 corpus. GluonNLP provides [WMT2014BPE](../../api/data.rst#gluonnlp.data.WMT2014BPE)
-and [WMT2014](../../api/data.rst#gluonnlp.data.WMT2014) classes. The former contains the BPE-tokenized dataset, while
-the latter contains the raw text. Here, we use the former for scoring and the latter for
-demonstrating actual translation.
-
-```{.python .input}
-import hyperparameters as hparams
-
-wmt_data_test = nlp.data.WMT2014BPE('newstest2014',
- src_lang=hparams.src_lang,
- tgt_lang=hparams.tgt_lang)
-print('Source language %s, Target language %s' % (hparams.src_lang, hparams.tgt_lang))
-print('Sample BPE tokens: "{}"'.format(wmt_data_test[0]))
-
-wmt_test_text = nlp.data.WMT2014('newstest2014',
- src_lang=hparams.src_lang,
- tgt_lang=hparams.tgt_lang)
-print('Sample raw text: "{}"'.format(wmt_test_text[0]))
-
-wmt_test_tgt_sentences = wmt_test_text.transform(lambda src, tgt: tgt)
-print('Sample target sentence: "{}"'.format(wmt_test_tgt_sentences[0]))
-```
-
-```{.python .input}
-import dataprocessor
-
-print(dataprocessor.TrainValDataTransform.__doc__)
-
-# wmt_transform_fn includes the four preprocessing steps mentioned above.
-wmt_transform_fn = dataprocessor.TrainValDataTransform(wmt_src_vocab, wmt_tgt_vocab)
-wmt_dataset_processed = wmt_data_test.transform(wmt_transform_fn, lazy=False)
-print(*wmt_dataset_processed[0], sep='\n')
-
-def get_length_index_fn():
- global idx
- idx = 0
- def transform(src, tgt):
- global idx
- result = (src, tgt, len(src), len(tgt), idx)
- idx += 1
- return result
- return transform
-
-wmt_data_test_with_len = wmt_dataset_processed.transform(get_length_index_fn(), lazy=False)
-```
-
-### Create the sampler and `DataLoader`
-
-Now, we have obtained the transformed datasets. The next step is to construct the sampler and `DataLoader`. First, we need to construct the batchify function, which pads and stacks sequences to form mini-batches.
-
-```{.python .input}
-wmt_test_batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Stack(dtype='float32'),
- nlp.data.batchify.Stack(dtype='float32'),
- nlp.data.batchify.Stack())
-```
-
-In GluonNLP, all dataset items are tuples. Each item in the preprocessed `wmt_data_test_with_len` is a
-`(src, tgt, len(src), len(tgt), idx)` tuple. To express how we'd like to batchify each element, we use the built-in batchify functions.
-
-* [Tuple](../../api/data.batchify.rst#gluonnlp.data.batchify.Tuple) is the GluonNLP way of applying different batchify functions to each element of a dataset item. In this case, we are applying `Pad` to `src` and `tgt`, `Stack` to `len(src)` and `len(tgt)` with conversion to float32, and simple `Stack` to `idx` without type conversion.
-* [Pad](../../api/data.batchify.rst#gluonnlp.data.batchify.Pad) takes the corresponding elements from all dataset items in a batch, and pads them to the maximum length among them to form a padded matrix/tensor (see the toy example below).
-* [Stack](../../api/data.batchify.rst#gluonnlp.data.batchify.Stack) simply stacks all elements in a batch, and requires all elements to be of the same length.
-
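-As a quick sanity check (a toy example, separate from the evaluation pipeline; it assumes `Pad` and `Stack` can be called directly on plain Python lists), we can apply them to a tiny batch:
-
-```{.python .input}
-# Toy batch of three "token id" sequences with different lengths
-toy_seqs = [[1, 2, 3, 4], [5, 6], [7, 8, 9]]
-toy_lens = [4, 2, 3]
-
-# Pad fills the shorter sequences with pad_val=0 up to the longest one in the batch
-print(nlp.data.batchify.Pad(pad_val=0)(toy_seqs))
-# Stack simply stacks the (same-shape) elements, here the scalar lengths
-print(nlp.data.batchify.Stack(dtype='float32')(toy_lens))
-```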
-
-We can then construct bucketing samplers, which generate batches by grouping sequences with similar lengths. Here, we use [FixedBucketSampler](../../api/data.rst#gluonnlp.data.FixedBucketSampler) with [ExpWidthBucket](../../api/data.rst#gluonnlp.data.ExpWidthBucket). FixedBucketSampler assigns each data sample to a fixed bucket based on its length, and ExpWidthBucket makes the bucket widths (the intervals between maximum bucket lengths) grow approximately exponentially.
-
-```{.python .input}
-wmt_bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
-wmt_test_batch_sampler = nlp.data.FixedBucketSampler(
- lengths=wmt_data_test_with_len.transform(lambda src, tgt, src_len, tgt_len, idx: tgt_len), # target length
- use_average_length=True, # control the element lengths (i.e. number of tokens) to be about the same
- bucket_scheme=wmt_bucket_scheme,
- batch_size=256)
-print(wmt_test_batch_sampler.stats())
-```
-
-Given the sampler, we can use a [DataLoader](https://mxnet.apache.org/versions/master/api/python/gluon/data.html#mxnet.gluon.data.DataLoader) to iterate over the dataset in mini-batches.
-
-```{.python .input}
-wmt_test_data_loader = gluon.data.DataLoader(
- wmt_data_test_with_len,
- batch_sampler=wmt_test_batch_sampler,
- batchify_fn=wmt_test_batchify_fn,
- num_workers=8)
-len(wmt_test_data_loader)
-```
-
-### Evaluating the transformer
-
-Next, we evaluate the performance of the model on the WMT test dataset. We first define the `BeamSearchTranslator` to generate the actual translations.
-
-```{.python .input}
-wmt_translator = nmt.translation.BeamSearchTranslator(
- model=wmt_transformer_model,
- beam_size=hparams.beam_size,
- scorer=nlp.model.BeamSearchScorer(alpha=hparams.lp_alpha, K=hparams.lp_k),
- max_length=200)
-```
-
-Then we calculate the `loss` as well as the `bleu` score on the `newstest2014` WMT 2014 English-German test dataset. This may take a while.
-
-```{.python .input}
-import time
-import utils
-
-eval_start_time = time.time()
-
-wmt_test_loss_function = nlp.loss.MaskedSoftmaxCELoss()
-wmt_test_loss_function.hybridize()
-
-wmt_detokenizer = nlp.data.SacreMosesDetokenizer()
-
-wmt_test_loss, wmt_test_translation_out = utils.evaluate(wmt_transformer_model,
- wmt_test_data_loader,
- wmt_test_loss_function,
- wmt_translator,
- wmt_tgt_vocab,
- wmt_detokenizer,
- ctx)
-
-wmt_test_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([wmt_test_tgt_sentences],
- wmt_test_translation_out,
- tokenized=False,
- tokenizer=hparams.bleu,
- split_compound_word=False,
- bpe=False)
-
-print('WMT14 EN-DE SOTA model test loss: %.2f; test bleu score: %.2f; time cost %.2fs'
- %(wmt_test_loss, wmt_test_bleu_score * 100, (time.time() - eval_start_time)))
-```
-
-```{.python .input}
-print('Sample translations:')
-num_pairs = 3
-
-for i in range(num_pairs):
- print('EN:')
- print(wmt_test_text[i][0])
- print('DE-Candidate:')
- print(wmt_test_translation_out[i])
- print('DE-Reference:')
- print(wmt_test_tgt_sentences[i])
- print('========')
-```
-
-### Translation Inference
-
-We now show an actual English-to-German translation example using the SOTA Transformer model, given a source sentence.
-
-```{.python .input}
-import utils
-
-print('Translate the following English sentence into German:')
-
-sample_src_seq = 'We love language .'
-
-print('[\'' + sample_src_seq + '\']')
-
-sample_tgt_seq = utils.translate(wmt_translator,
- sample_src_seq,
- wmt_src_vocab,
- wmt_tgt_vocab,
- wmt_detokenizer,
- ctx)
-
-print('The German translation is:')
-print(sample_tgt_seq)
-```
-
-If you'd like to train your own transformer models, you may find the training scripts in our
-[model zoo](../../model_zoo/machine_translation/index.rst).
-
-## References
-
-[1] Vaswani, Ashish, et al. "Attention is all you need." Advances in Neural Information Processing Systems. 2017.
diff --git a/docs/examples/machine_translation/transformer.png b/docs/examples/machine_translation/transformer.png
deleted file mode 100644
index 2561c01a11..0000000000
Binary files a/docs/examples/machine_translation/transformer.png and /dev/null differ
diff --git a/docs/examples/machine_translation/utils.py b/docs/examples/machine_translation/utils.py
deleted file mode 100644
index faa16a39d0..0000000000
--- a/docs/examples/machine_translation/utils.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utilities for transformer."""
-
-import numpy as np
-import math
-import mxnet as mx
-import time
-import logging
-import io
-import nmt
-import hyperparameters as hparams
-
-def evaluate(model, data_loader, test_loss_function, translator, tgt_vocab, detokenizer, context):
-    """Evaluate the model on the given data loader.
-
-    Parameters
-    ----------
-    model : Block
-        Model to evaluate.
-    data_loader : DataLoader
-        Yields (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) batches.
-    test_loss_function : Loss
-        Loss used to compute the average test loss.
-    translator : BeamSearchTranslator
-        Translator used to generate candidate translations.
-    tgt_vocab : Vocab
-        Target-side vocabulary.
-    detokenizer : callable
-        Detokenizer applied to the generated BPE tokens.
-    context : Context
-        Device on which to run the evaluation.
-
-    Returns
-    -------
-    avg_loss : float
-        Average loss
-    real_translation_out : list of str
-        The detokenized translation output
-    """
- translation_out = []
- all_inst_ids = []
- avg_loss_denom = 0
- avg_loss = 0.0
- for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \
- in enumerate(data_loader):
- src_seq = src_seq.as_in_context(context)
- tgt_seq = tgt_seq.as_in_context(context)
- src_valid_length = src_valid_length.as_in_context(context)
- tgt_valid_length = tgt_valid_length.as_in_context(context)
- # Calculating Loss
- out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
- loss = test_loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar()
- all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
- avg_loss += loss * (tgt_seq.shape[1] - 1)
- avg_loss_denom += (tgt_seq.shape[1] - 1)
- # Translate
- samples, _, sample_valid_length = \
- translator.translate(src_seq=src_seq, src_valid_length=src_valid_length)
- max_score_sample = samples[:, 0, :].asnumpy()
- sample_valid_length = sample_valid_length[:, 0].asnumpy()
- for i in range(max_score_sample.shape[0]):
- translation_out.append(
- [tgt_vocab.idx_to_token[ele] for ele in
- max_score_sample[i][1:(sample_valid_length[i] - 1)]])
- avg_loss = avg_loss / avg_loss_denom
- real_translation_out = [None for _ in range(len(all_inst_ids))]
- for ind, sentence in zip(all_inst_ids, translation_out):
- real_translation_out[ind] = detokenizer(nmt.bleu._bpe_to_words(sentence),
- return_str=True)
- return avg_loss, real_translation_out
-
-def translate(translator, src_seq, src_vocab, tgt_vocab, detokenizer, ctx):
- src_sentence = src_vocab[src_seq.split()]
- src_sentence.append(src_vocab[src_vocab.eos_token])
- src_npy = np.array(src_sentence, dtype=np.int32)
- src_nd = mx.nd.array(src_npy)
- src_nd = src_nd.reshape((1, -1)).as_in_context(ctx)
- src_valid_length = mx.nd.array([src_nd.shape[1]]).as_in_context(ctx)
- samples, _, sample_valid_length = \
- translator.translate(src_seq=src_nd, src_valid_length=src_valid_length)
- max_score_sample = samples[:, 0, :].asnumpy()
-
- sample_valid_length = sample_valid_length[:, 0].asnumpy()
- translation_out = []
- for i in range(max_score_sample.shape[0]):
- translation_out.append(
- [tgt_vocab.idx_to_token[ele] for ele in
- max_score_sample[i][1:(sample_valid_length[i] - 1)]])
- real_translation_out = [None for _ in range(len(translation_out))]
- for ind, sentence in enumerate(translation_out):
- real_translation_out[ind] = detokenizer(nmt.bleu._bpe_to_words(sentence),
- return_str=True)
- return real_translation_out
-
-def train_one_epoch(epoch_id, model, train_data_loader, trainer, label_smoothing, loss_function,
-                    grad_interval, average_param_dict, update_average_param_dict, step_num, ctx):
-    """Run one training epoch, updating the model parameters and the running parameter average."""
- log_avg_loss = 0
- log_wc = 0
- loss_denom = 0
- step_loss = 0
- log_start_time = time.time()
- for batch_id, seqs in enumerate(train_data_loader):
- if batch_id % grad_interval == 0:
- step_num += 1
- new_lr = hparams.lr / math.sqrt(hparams.num_units) * min(1. / math.sqrt(step_num), step_num * hparams.warmup_steps ** (-1.5))
- trainer.set_learning_rate(new_lr)
- src_wc, tgt_wc, bs = np.sum([(shard[2].sum(), shard[3].sum(), shard[0].shape[0])
- for shard in seqs], axis=0)
- src_wc = src_wc.asscalar()
- tgt_wc = tgt_wc.asscalar()
- loss_denom += tgt_wc - bs
- seqs = [[seq.as_in_context(context) for seq in shard]
- for context, shard in zip([ctx], seqs)]
- Ls = []
- with mx.autograd.record():
- for src_seq, tgt_seq, src_valid_length, tgt_valid_length in seqs:
- out, _ = model(src_seq, tgt_seq[:, :-1],
- src_valid_length, tgt_valid_length - 1)
- smoothed_label = label_smoothing(tgt_seq[:, 1:])
- ls = loss_function(out, smoothed_label, tgt_valid_length - 1).sum()
- Ls.append((ls * (tgt_seq.shape[1] - 1)) / hparams.batch_size / 100.0)
- for L in Ls:
- L.backward()
- if batch_id % grad_interval == grad_interval - 1 or\
- batch_id == len(train_data_loader) - 1:
- if update_average_param_dict:
- for k, v in model.collect_params().items():
- average_param_dict[k] = v.data(ctx).copy()
- update_average_param_dict = False
-
- trainer.step(float(loss_denom) / hparams.batch_size / 100.0)
- param_dict = model.collect_params()
- param_dict.zero_grad()
- if step_num > hparams.average_start:
- alpha = 1. / max(1, step_num - hparams.average_start)
- for name, average_param in average_param_dict.items():
- average_param[:] += alpha * (param_dict[name].data(ctx) - average_param)
- step_loss += sum([L.asscalar() for L in Ls])
- if batch_id % grad_interval == grad_interval - 1 or\
- batch_id == len(train_data_loader) - 1:
- log_avg_loss += step_loss / loss_denom * hparams.batch_size * 100.0
- loss_denom = 0
- step_loss = 0
- log_wc += src_wc + tgt_wc
- if (batch_id + 1) % (hparams.log_interval * grad_interval) == 0:
- wps = log_wc / (time.time() - log_start_time)
- logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, '
- 'throughput={:.2f}K wps, wc={:.2f}K'
- .format(epoch_id, batch_id + 1, len(train_data_loader),
- log_avg_loss / hparams.log_interval,
- np.exp(log_avg_loss / hparams.log_interval),
- wps / 1000, log_wc / 1000))
- log_start_time = time.time()
- log_avg_loss = 0
- log_wc = 0
\ No newline at end of file
diff --git a/docs/examples/notes/data_api.rst b/docs/examples/notes/data_api.rst
deleted file mode 100644
index 8b7cc26fd6..0000000000
--- a/docs/examples/notes/data_api.rst
+++ /dev/null
@@ -1,286 +0,0 @@
-Data Loading API
-----------------
-
-In this tutorial, we show how to load and process the sentiment dataset to form batches that can be processed efficiently,
-using classes from :mod:`gluonnlp.data.sampler` and :mod:`gluonnlp.data.batchify`.
-We use the :class:`~gluonnlp.data.IMDB`
-dataset as an example; it has 50,000 movie reviews labeled as positive or negative, and
-is split into training and testing sets of 25,000 reviews each.
-
-Data Loading
-~~~~~~~~~~~~
-
-Let us see a quick example.
-
-.. code:: python
-
- >>> import mxnet as mx
- >>> from mxnet import gluon, nd
- >>> import gluonnlp as nlp
-
-.. code:: python
-
- >>> train_dataset, test_dataset = [nlp.data.IMDB(root='data/imdb', segment=segment)
- >>> for segment in ('train', 'test')]
-
-.. code:: python
-
- >>> print('#training samples={:d}, #testing samples={:d}'.format(len(train_dataset),
- >>> len(test_dataset)))
-
-    #training samples=25000, #testing samples=25000
-
-.. code:: python
-
- >>> print(train_dataset[0])
-
- ['Bromwell High is a cartoon comedy. It ran at the same time as some other programs
- about school life, such as "Teachers". My 35 years in the teaching profession lead
- me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers".
- The scramble to survive financially, the insightful students who can see right through
- their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of
- the schools I knew and their students. When I saw the episode in which a student repeatedly
- tried to burn down the school, I immediately recalled ......... at .......... High. A
- classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to
- Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched.
- What a pity that it isn\'t!', 9]
-
-In the above example, we load ``train_dataset`` and ``test_dataset``, which are both :class:`~mxnet.gluon.data.SimpleDataset` objects.
-
-A :class:`~mxnet.gluon.data.SimpleDataset` is a wrapper for lists and arrays. Each entry in ``train_dataset`` is a [string, score] pair,
-where the score falls in [1, 2, ..., 10]. Thus, in the example above, a score of 9 indicates positive feedback on the movie.
-
-
-Data Processing
-~~~~~~~~~~~~~~~
-
-The next step is to preprocess the data so that it can be used to train the model. The following code
-shows how to tokenize the string with :class:`~gluonnlp.data.SpacyTokenizer` and then clip
-the list of output tokens by length with :class:`~gluonnlp.data.ClipSequence`.
-
-.. code:: python
-
- >>> tokenizer = nlp.data.SpacyTokenizer('en')
- >>> # We use 50 as maximum length for illustration
- >>> # For actual learning, we may use a large value such as 500
- >>> length_clip = nlp.data.ClipSequence(50)
- >>> seq, score = train_dataset[0]
- >>> print(length_clip(tokenizer(seq)))
-
- ['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at', 'the', 'same',
- 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as',
- '"', 'Teachers', '"', '.', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead',
- 'me', 'to', 'believe', 'that', 'Bromwell', 'High', "'s", 'satire', 'is', 'much', 'closer',
- 'to', 'reality', 'than', 'is']
-
-Now, we are ready to preprocess the whole dataset. The following code shows how to tokenize the dataset in parallel using multiprocessing.
-
-.. code:: python
-
- >>> import time
- >>> import multiprocessing as mp
- >>> length_clip = nlp.data.ClipSequence(500)
-
-.. code:: python
-
- >>> # Dataset preprocessing
- >>> def preprocess(x):
- >>> data, label = x
- >>> # In the labeled train/test sets, a negative review has a score <= 4
- >>> # out of 10, and a positive review has a score >= 7 out of 10. Thus
- >>> # reviews with more neutral ratings are not included in the train/test
- >>> # sets. We labeled a negative review whose score <= 4 as 0, and a
-    >>>     # positive review whose score >= 7 as 1. As the neutral ratings are not
- >>> # included in the datasets, we can simply use 5 as our threshold.
- >>> label = int(label > 5)
- >>> data = length_clip(tokenizer(data))
- >>> return data, label
- >>>
- >>> def get_length(x):
- >>> return float(len(x[0]))
- >>>
- >>> def preprocess_dataset(dataset):
- >>> start = time.time()
- >>> pool = mp.Pool()
- >>> dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
- >>> lengths = gluon.data.SimpleDataset(pool.map(get_length, dataset))
- >>> end = time.time()
- >>> print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
- >>> return dataset, lengths
- >>>
- >>> # Preprocess the dataset
- >>> train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
- >>> test_dataset, test_data_lengths = preprocess_dataset(test_dataset)
-
- Tokenize using spaCy...
-
- Done! Tokenizing Time=12.85s, #Sentences=25000
-
- Done! Tokenizing Time=12.99s, #Sentences=25000
-
-Then, we are going to construct a :class:`vocabulary <gluonnlp.Vocab>` for the training dataset. The vocabulary
-will be used to convert the tokens to numerical indices, which facilitates the creation of word embedding matrices.
-
-.. code:: python
-
- >>> import itertools
- >>> train_seqs = [sample[0] for sample in train_dataset]
- >>> counter = nlp.data.count_tokens(list(itertools.chain.from_iterable(train_seqs)))
- >>> vocab = nlp.Vocab(counter, max_size=10000, padding_token=None,
- >>> bos_token=None, eos_token=None)
- >>> print(vocab)
-
-    Vocab(size=10001, unk="<unk>", reserved="None")
-
-.. code:: python
-
- >>> # Convert string token to its index in the dictionary
- >>> def token_to_idx(x):
- >>> return vocab[x[0]], x[1]
- >>>
- >>> pool = mp.Pool()
- >>> train_dataset = pool.map(token_to_idx, train_dataset)
- >>> test_dataset = pool.map(token_to_idx, test_dataset)
- >>> pool.close()
- >>> print(train_dataset[0][0][:50])
-
- [0, 2012, 8, 4, 1116, 231, 3, 51, 2311, 40, 1, 188, 67, 20, 59, 97, 6190, 49, 422, 133,
- 2, 160, 20, 13, 0, 13, 3, 374, 5063, 174, 9, 1, 5390, 6674, 498, 83, 7, 282, 12, 0, 2012,
- 15, 2042, 8, 88, 2661, 7, 714, 87, 8]
-
-
-Bucketing and Dataloader
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-The next step is to construct a :class:`dataloader <mxnet.gluon.data.DataLoader>` for training.
-As the sequences have variable lengths, we need to pad the sequences so that they have the same
-lengths in the minibatch, which allows fast tensor manipulation on the GPU.
-
-.. code:: python
-
- >>> batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0, pad_val=0),
- >>> nlp.data.batchify.Stack())
-
-:class:`~gluonnlp.data.batchify.Tuple` wraps multiple batchify functions and applies each input function on each input field,
-respectively. In this case, we are applying :class:`~gluonnlp.data.batchify.Pad` on the sequence and :class:`~gluonnlp.data.batchify.Stack`
-on the labels. Given the batchify function, we can construct the dataloaders for both training samples and testing samples.
-
-.. code:: python
-
-    >>> batch_size = 16
-    >>> train_dataloader = gluon.data.DataLoader(dataset=train_dataset,
- >>> batch_size=batch_size,
- >>> shuffle=True,
- >>> batchify_fn=batchify_fn)
- >>> test_dataloader = gluon.data.DataLoader(dataset=test_dataset,
- >>> batch_size=batch_size,
- >>> shuffle=False,
- >>> batchify_fn=batchify_fn)
-
-As :class:`~mxnet.gluon.data.DataLoader` is iterable, we can iterate over the dataset easily using the following code:
-
-.. code:: python
-
-    >>> for data, label in train_dataloader:
-    >>>     print(data.shape, label.shape)
-    >>>     break
-
-In the above example, minibatches are formed using uniform sampling, which can cause a large amount of padding as shown
-in the figure below.
-
-.. image:: ./images/no_bucket_strategy.png
- :height: 200px
- :width: 1000 px
- :alt: alternate text
- :align: center
-
-In light of this, we consider
-constructing a sampler using bucketing, which defines how the samples in a dataset will be iterated in a more economical way.
-
-.. code:: python
-
- >>> batch_sampler = nlp.data.sampler.FixedBucketSampler(train_data_lengths,
- >>> batch_size=16,
- >>> num_buckets=10,
- >>> ratio=0,
- >>> shuffle=True)
- >>> print(batch_sampler.stats())
-
- FixedBucketSampler:
- sample_num=25000, batch_num=1567
- key=[68, 116, 164, 212, 260, 308, 356, 404, 452, 500]
- cnt=[981, 1958, 5686, 4614, 2813, 2000, 1411, 1129, 844, 3564]
- batch_size=[16, 16, 16, 16, 16, 16, 16, 16, 16, 16]
-
-In this example, we use a :class:`~gluonnlp.data.sampler.FixedBucketSampler`, which assigns each data sample to a
-fixed bucket based on its length.
-
-The bucket keys are either given or generated from the input sequence lengths. We construct 10 buckets, where `cnt`
-shows the number of samples belonging to each bucket. A graphic illustration of using :class:`~gluonnlp.data.sampler.FixedBucketSampler`
-can be seen as follows:
-
-.. image:: ./images/fixed_bucket_strategy_ratio0.0.png
- :height: 200px
- :width: 1000 px
- :alt: alternate text
- :align: center
-
-To further improve the throughput, we can consider scaling up the batch size of smaller buckets. This can be achieved
-by using the parameter ``ratio``. Assume the :math:`i`-th key is :math:`K_i`, the default batch size is :math:`B`, the ratio to
-scale the batch size is :math:`\alpha`, and the batch size corresponding to the :math:`i`-th bucket is :math:`B_i`. We have:
-
-.. math::
-
- B_i = \max(\alpha B \times \frac{\max_j sum(K_j)}{sum(K_i)}, B)
-
-.. image:: ./images/fixed_bucket_strategy_ratio0.7.png
- :height: 200px
- :width: 1000 px
- :alt: alternate text
- :align: center
-
-Thus, setting this to a value larger than 0, like 0.5, will scale up the batch size of the
-smaller buckets.
-
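-To make the formula concrete (using the bucket keys printed above, and assuming the scaled value is rounded down): with :math:`B = 16`, :math:`\alpha = 0.5` and a maximum key of 500, the first bucket with key 68 gets :math:`B_0 = \max(0.5 \times 16 \times 500 / 68, 16) \approx 58`, while buckets whose scaled value falls below 16 keep the default batch size of 16. This matches the ``batch_size`` list in the sampler statistics below.
-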
-.. code:: python
-
- >>> batch_sampler = nlp.data.sampler.FixedBucketSampler(train_data_lengths,
- >>> batch_size=16,
- >>> num_buckets=10,
- >>> ratio=0.5,
- >>> shuffle=True)
- >>> print(batch_sampler.stats())
-
- FixedBucketSampler:
- sample_num=25000, batch_num=1306
- key=[68, 116, 164, 212, 260, 308, 356, 404, 452, 500]
- cnt=[981, 1958, 5686, 4614, 2813, 2000, 1411, 1129, 844, 3564]
- batch_size=[58, 34, 24, 18, 16, 16, 16, 16, 16, 16]
-
-Now, we can create a dataloader for the training set using the bucketing sampler.
-
-.. code:: python
-
- >>> train_dataloader = gluon.data.DataLoader(dataset=train_dataset,
- >>> batch_sampler=batch_sampler,
- >>> batchify_fn=batchify_fn)
-
-In our sampler API, we also provide another sampler called :class:`~gluonnlp.data.sampler.SortedBucketSampler`,
-which results in the following padding pattern:
-
-.. image:: ./images/sorted_bucket_strategy.png
- :height: 200px
- :width: 1000 px
- :alt: alternate text
- :align: center
-
-With this strategy, we partition the data into a number of buckets of size ``batch_size * mult``, where ``mult`` is a multiplier
-that determines the bucket size, so each bucket contains ``batch_size * mult`` elements. The samples inside each bucket are sorted
-based on ``sort_key`` and then batched, as sketched below.
-
-.. code:: python
-
- >>> batch_sampler = nlp.data.sampler.SortedBucketSampler(train_data_lengths,
- >>> batch_size=16,
- >>> mult=100,
- >>> shuffle=True)
-
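-As a rough, pure-Python sketch of this idea (ignoring shuffling, and only for intuition rather than reproducing the library's exact implementation): take chunks of ``batch_size * mult`` samples, sort each chunk by length, and cut the sorted chunk into minibatches.
-
-.. code:: python
-
-    >>> toy_lengths = [5, 50, 8, 47, 9, 45, 7, 52]   # toy sequence lengths
-    >>> toy_batch_size, toy_mult = 2, 2
-    >>> bucket = toy_batch_size * toy_mult
-    >>> batches = []
-    >>> for start in range(0, len(toy_lengths), bucket):
-    >>>     chunk = sorted(range(start, min(start + bucket, len(toy_lengths))),
-    >>>                    key=lambda i: toy_lengths[i])
-    >>>     batches += [chunk[i:i + toy_batch_size]
-    >>>                 for i in range(0, len(chunk), toy_batch_size)]
-    >>> print(batches)   # each minibatch groups indices with similar lengths
-
-    [[0, 2], [3, 1], [6, 4], [5, 7]]
-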
-More details about training with a pre-trained language model and bucketing can be found in the
-`sentiment analysis tutorial `_.
diff --git a/docs/examples/notes/images/fixed_bucket_strategy_ratio0.0.png b/docs/examples/notes/images/fixed_bucket_strategy_ratio0.0.png
deleted file mode 100644
index cae9de8c7a..0000000000
Binary files a/docs/examples/notes/images/fixed_bucket_strategy_ratio0.0.png and /dev/null differ
diff --git a/docs/examples/notes/images/fixed_bucket_strategy_ratio0.7.png b/docs/examples/notes/images/fixed_bucket_strategy_ratio0.7.png
deleted file mode 100644
index 685a80d7cf..0000000000
Binary files a/docs/examples/notes/images/fixed_bucket_strategy_ratio0.7.png and /dev/null differ
diff --git a/docs/examples/notes/images/no_bucket_strategy.png b/docs/examples/notes/images/no_bucket_strategy.png
deleted file mode 100644
index dd11d631c3..0000000000
Binary files a/docs/examples/notes/images/no_bucket_strategy.png and /dev/null differ
diff --git a/docs/examples/notes/images/sorted_bucket_strategy.png b/docs/examples/notes/images/sorted_bucket_strategy.png
deleted file mode 100644
index 227e48b8a7..0000000000
Binary files a/docs/examples/notes/images/sorted_bucket_strategy.png and /dev/null differ
diff --git a/docs/examples/notes/index.rst b/docs/examples/notes/index.rst
deleted file mode 100644
index f090dbd4e8..0000000000
--- a/docs/examples/notes/index.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-Data Loading and Vocabularies
-=============================
-
-Here are some notes on the basic usage of our API.
-
-
-.. container:: cards
-
- .. card::
- :title: Data Loading API
- :link: data_api.html
-
- See how to load and process the sentiment dataset to form batches that can be processed efficiently.
-
- .. card::
- :title: Vocabulary and Embedding API
- :link: vocab_emb.html
-
-      See how to write simple code to create indices for tokens.
-
-.. toctree::
- :hidden:
- :maxdepth: 1
-
- data_api
- vocab_emb
diff --git a/docs/examples/notes/vocab_emb.rst b/docs/examples/notes/vocab_emb.rst
deleted file mode 100644
index 04496a6e2d..0000000000
--- a/docs/examples/notes/vocab_emb.rst
+++ /dev/null
@@ -1,99 +0,0 @@
-Vocabulary and Embedding API
-----------------------------
-
-This note illustrates how to write simple code to create indices for tokens to form a
-:class:`vocabulary <gluonnlp.Vocab>`, and utilize pre-trained
-:mod:`word embeddings <gluonnlp.embedding>`.
-
-All the code demonstrated in this document assumes that the following
-modules or packages are imported.
-
-.. code:: python
-
- >>> from mxnet import gluon, nd
- >>> import gluonnlp as nlp
-
-
-Indexing words and using pre-trained word embeddings
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-As a common use case, let us index words, attach pre-trained word
-embeddings for them, and use such embeddings in :mod:`mxnet.gluon` in just a few
-lines of code.
-
-To begin with, suppose that we have a simple text data set in the string
-format. We can count word frequency in the data set.
-
-.. code:: python
-
- >>> text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world']
- >>> counter = nlp.data.count_tokens(text_data)
-
-The obtained :class:`~gluonnlp.data.Counter` has key-value pairs whose keys are words and
-values are word frequencies. This allows us to filter out infrequent
-words. Suppose that we want to build indices for all the keys in :class:`~gluonnlp.data.Counter`.
-We need a :class:`~gluonnlp.Vocab` instance with :class:`~gluonnlp.data.Counter` as its argument.
-
-.. code:: python
-
- >>> my_vocab = nlp.Vocab(counter)
-
-To attach word embeddings to indexed words in ``my_vocab``, let us go on
-to create a :class:`fastText <gluonnlp.embedding.FastText>` word embedding instance by specifying the embedding
-name ``fasttext`` and the pre-trained file name ``wiki.simple``.
-
-.. code:: python
-
- >>> fasttext = nlp.embedding.create('fasttext', source='wiki.simple')
-
-This automatically downloads the corresponding embedding file from a public repository,
-and the file is stored by default in ``~/.mxnet/embedding/``.
-Next, we can attach word embedding ``fasttext`` to indexed words
-``my_vocab``.
-
-.. code:: python
-
- >>> my_vocab.set_embedding(fasttext)
-
-Now we are ready to access the :class:`fastText <gluonnlp.embedding.FastText>` word embedding vectors for
-indexed words, such as 'hello' and 'world'.
-
-.. code:: python
-
- >>> my_vocab.embedding[['hello', 'world']]
-
- [[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01
- ...
- -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02]
- [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01
- ...
- -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]]
-
-
-To demonstrate how to use pre-trained word embeddings with :mod:`mxnet.gluon` models,
-let us first obtain indices of the words ‘hello’ and ‘world’.
-
-.. code:: python
-
- >>> my_vocab[['hello', 'world']]
- [5, 4]
-
-We can obtain the vector representation for the words ‘hello’ and
-‘world’ by specifying their indices (5 and 4) and the weight matrix
-``my_vocab.embedding.idx_to_vec`` in :class:`mxnet.gluon.nn.Embedding`.
-
-.. code:: python
-
- >>> input_dim, output_dim = my_vocab.embedding.idx_to_vec.shape
- >>> layer = gluon.nn.Embedding(input_dim, output_dim)
- >>> layer.initialize()
- >>> layer.weight.set_data(my_vocab.embedding.idx_to_vec)
- >>> layer(nd.array([5, 4]))
-
- [[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01
- ...
- -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02]
- [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01
- ...
- -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]]
-
diff --git a/docs/examples/sentence_embedding/bert b/docs/examples/sentence_embedding/bert
deleted file mode 120000
index 849189304e..0000000000
--- a/docs/examples/sentence_embedding/bert
+++ /dev/null
@@ -1 +0,0 @@
-../../../scripts/bert/
\ No newline at end of file
diff --git a/docs/examples/sentence_embedding/bert-embed.png b/docs/examples/sentence_embedding/bert-embed.png
deleted file mode 100644
index 1100e970cf..0000000000
Binary files a/docs/examples/sentence_embedding/bert-embed.png and /dev/null differ
diff --git a/docs/examples/sentence_embedding/bert-sentence-pair.png b/docs/examples/sentence_embedding/bert-sentence-pair.png
deleted file mode 100644
index 1dc37953f4..0000000000
Binary files a/docs/examples/sentence_embedding/bert-sentence-pair.png and /dev/null differ
diff --git a/docs/examples/sentence_embedding/bert.md b/docs/examples/sentence_embedding/bert.md
deleted file mode 100644
index f26b9b7e57..0000000000
--- a/docs/examples/sentence_embedding/bert.md
+++ /dev/null
@@ -1,421 +0,0 @@
-# Fine-tuning Pre-trained BERT Models
-
-Pre-trained language representations have been shown to improve many downstream NLP tasks such as
-question answering and natural language inference. To apply pre-trained
-representations to these tasks, there are two main strategies:
-
-1. The *feature-based* approach, which uses the pre-trained representations as additional
-features to the downstream task.
-2. The *fine-tuning*-based approach, which trains on the downstream tasks by
-fine-tuning the pre-trained parameters.
-
-While feature-based approaches such as ELMo [3] (introduced in the previous tutorial) are effective
-in improving many downstream tasks, they require task-specific architectures.
-Devlin et al. proposed BERT [1] (Bidirectional Encoder Representations
-from Transformers), which *fine-tunes* deep bi-directional representations on a
-wide range of tasks with minimal task-specific parameters, and obtains
-state-of-the-art results.
-
-In this tutorial, we will focus on fine-tuning with the
-pre-trained BERT model to classify semantically equivalent sentence pairs.
-
-Specifically, we will:
-
-1. Load the state-of-the-art pre-trained BERT model and attach an additional layer for classification
-2. Process and transform sentence-pair data for the task at hand
-3. Fine-tune the BERT model for sentence classification
-
-## Setup
-
-To use this tutorial, please download the required files from the above download link, and install
-GluonNLP.
-
-### Importing necessary modules
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-
-import io
-import random
-import numpy as np
-import mxnet as mx
-import gluonnlp as nlp
-from gluonnlp.calibration import BertLayerCollector
-# this notebook assumes that all required scripts are already
-# downloaded from the corresponding tutorial webpage on http://gluon-nlp.mxnet.io
-from bert import data
-
-nlp.utils.check_version('0.8.1')
-```
-
-### Setting up the environment
-
-Please note the comment in the code if no GPU is available.
-
-```{.python .input}
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-# change `ctx` to `mx.cpu()` if no GPU is available.
-ctx = mx.gpu(0)
-```
-
-## Using the pre-trained BERT model
-
-The list of pre-trained BERT models available
-in GluonNLP can be found
-[here](../../model_zoo/bert/index.rst).
-
-In this tutorial, the BERT model we will use is BERT BASE, trained on an uncased corpus
-of books and the English Wikipedia dataset, from the GluonNLP model zoo.
-
-### Get BERT
-
-Let's first take a look at the BERT model architecture for sentence pair classification below:
-![bert-sentence-pair](bert-sentence-pair.png)
-where the model takes a pair of sequences and pools the representation of the
-first token in the sequence.
-Note that the original BERT model was trained on masked language modeling and
-next-sentence prediction tasks, and therefore includes layers for language model
-decoding and classification. These layers will not be used
-for fine-tuning the sentence pair classification task.
-
-We can load the
-pre-trained BERT fairly easily
-using the model API in GluonNLP, which returns the vocabulary
-along with the
-model. We include the pooler layer of the pre-trained model by setting
-`use_pooler` to `True`.
-
-```{.python .input}
-bert_base, vocabulary = nlp.model.get_model('bert_12_768_12',
- dataset_name='book_corpus_wiki_en_uncased',
- pretrained=True, ctx=ctx, use_pooler=True,
- use_decoder=False, use_classifier=False)
-print(bert_base)
-```
-
-### Transform the model for `SentencePair` classification
-
-Now that we have loaded
-the BERT model, we only need to attach an additional layer for classification.
-The `BERTClassifier` class uses a BERT base model to encode sentence
-representation, followed by a `nn.Dense` layer for classification.
-
-```{.python .input}
-bert_classifier = nlp.model.BERTClassifier(bert_base, num_classes=2, dropout=0.1)
-# only need to initialize the classifier layer.
-bert_classifier.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
-bert_classifier.hybridize(static_alloc=True)
-
-# softmax cross entropy loss for classification
-loss_function = mx.gluon.loss.SoftmaxCELoss()
-loss_function.hybridize(static_alloc=True)
-
-metric = mx.metric.Accuracy()
-```
-
-## Data preprocessing for BERT
-
-For this tutorial, we need to do a bit of preprocessing before feeding our data into
-the BERT model. Here we want to leverage the dataset included in the archive downloaded at the
-beginning of this tutorial.
-
-### Loading the dataset
-
-We use
-the dev set of the
-Microsoft Research Paraphrase Corpus dataset. The file is
-named 'dev.tsv'. Let's take a look at the first few lines of the raw dataset.
-
-```{.python .input}
-tsv_file = io.open('dev.tsv', encoding='utf-8')
-for i in range(5):
- print(tsv_file.readline())
-```
-
-The file contains 5 columns, separated by tabs.
-The header of
-the file explains each of these columns, and an explanation of each is also included
-here:
-0. The label indicating whether the two
-sentences are semantically equivalent
-1. The id of the first sentence in this
-sample
-2. The id of the second sentence in this sample
-3. The content of the
-first sentence
-4. The content of the second sentence
-
-For our task, we are
-interested in the 0th, 3rd and 4th columns.
-To load this dataset, we can use the
-`TSVDataset` API and skip the first line because it's just the schema:
-
-```{.python .input}
-# Skip the first line, which is the schema
-num_discard_samples = 1
-# Split fields by tabs
-field_separator = nlp.data.Splitter('\t')
-# Fields to select from the file
-field_indices = [3, 4, 0]
-data_train_raw = nlp.data.TSVDataset(filename='dev.tsv',
- field_separator=field_separator,
- num_discard_samples=num_discard_samples,
- field_indices=field_indices)
-sample_id = 0
-# Sentence A
-print(data_train_raw[sample_id][0])
-# Sentence B
-print(data_train_raw[sample_id][1])
-# 1 means equivalent, 0 means not equivalent
-print(data_train_raw[sample_id][2])
-```
-
-To use the pre-trained BERT model, we need to pre-process the data in the same
-way it was trained. The following figure shows the input representation in BERT:
-![bert-embed](bert-embed.png)
-
-We will use
-`BERTDatasetTransform` to perform the following transformations:
-- tokenize
-the
-input sequences
-- insert [CLS] at the beginning
-- insert [SEP] between sentence
-A and sentence B, and at the end
-- generate segment ids to indicate whether
-a token belongs to the first sequence or the second sequence.
-- generate valid length
-
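-For intuition, here is a toy, hand-rolled version of this layout (using made-up word-level tokens instead of the real WordPiece tokenization performed by `BERTDatasetTransform` below):
-
-```{.python .input}
-# Toy sketch only -- the real preprocessing is done by BERTDatasetTransform below.
-toy_a = ['he', 'said', 'hello']      # pretend-tokenized sentence A
-toy_b = ['she', 'waved', 'back']     # pretend-tokenized sentence B
-toy_tokens = ['[CLS]'] + toy_a + ['[SEP]'] + toy_b + ['[SEP]']
-toy_segments = [0] * (len(toy_a) + 2) + [1] * (len(toy_b) + 1)
-toy_valid_length = len(toy_tokens)
-print(toy_tokens)
-print(toy_segments)
-print(toy_valid_length)
-```
-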
-```{.python .input}
-# Use the vocabulary from pre-trained model for tokenization
-bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
-
-# The maximum length of an input sequence
-max_len = 128
-
-# The labels for the two classes [(0 = not similar) or (1 = similar)]
-all_labels = ["0", "1"]
-
-# whether to transform the data as sentence pairs.
-# for single sentence classification, set pair=False
-# for regression task, set class_labels=None
-# for inference without label available, set has_label=False
-pair = True
-transform = data.transform.BERTDatasetTransform(bert_tokenizer, max_len,
- class_labels=all_labels,
- has_label=True,
- pad=True,
- pair=pair)
-data_train = data_train_raw.transform(transform)
-
-print('vocabulary used for tokenization = \n%s'%vocabulary)
-print('%s token id = %s'%(vocabulary.padding_token, vocabulary[vocabulary.padding_token]))
-print('%s token id = %s'%(vocabulary.cls_token, vocabulary[vocabulary.cls_token]))
-print('%s token id = %s'%(vocabulary.sep_token, vocabulary[vocabulary.sep_token]))
-print('token ids = \n%s'%data_train[sample_id][0])
-print('segment ids = \n%s'%data_train[sample_id][1])
-print('valid length = \n%s'%data_train[sample_id][2])
-print('label = \n%s'%data_train[sample_id][3])
-```
-
-## Fine-tuning the model
-
-Now we have all the pieces to put together, and we can finally start fine-tuning the
-model with very few epochs. For demonstration, we use a fixed learning rate and
-skip the validation steps. For the optimizer, we leverage the ADAM optimizer which
-performs very well for NLP data and for BERT models in particular.
-
-```{.python .input}
-# The hyperparameters
-batch_size = 32
-lr = 5e-6
-
-# The FixedBucketSampler and the DataLoader for making the mini-batches
-train_sampler = nlp.data.FixedBucketSampler(lengths=[int(item[2]) for item in data_train],
- batch_size=batch_size,
- shuffle=True)
-bert_dataloader = mx.gluon.data.DataLoader(data_train, batch_sampler=train_sampler)
-
-trainer = mx.gluon.Trainer(bert_classifier.collect_params(), 'adam',
- {'learning_rate': lr, 'epsilon': 1e-9})
-
-# Collect all differentiable parameters
-# `grad_req == 'null'` indicates no gradients are calculated (e.g. constant parameters)
-# The gradients for these params are clipped later
-params = [p for p in bert_classifier.collect_params().values() if p.grad_req != 'null']
-grad_clip = 1
-
-# Training the model with only three epochs
-log_interval = 4
-num_epochs = 3
-for epoch_id in range(num_epochs):
- metric.reset()
- step_loss = 0
- for batch_id, (token_ids, segment_ids, valid_length, label) in enumerate(bert_dataloader):
- with mx.autograd.record():
-
- # Load the data to the GPU
- token_ids = token_ids.as_in_context(ctx)
- valid_length = valid_length.as_in_context(ctx)
- segment_ids = segment_ids.as_in_context(ctx)
- label = label.as_in_context(ctx)
-
- # Forward computation
- out = bert_classifier(token_ids, segment_ids, valid_length.astype('float32'))
- ls = loss_function(out, label).mean()
-
- # And backwards computation
- ls.backward()
-
- # Gradient clipping
- trainer.allreduce_grads()
-            nlp.utils.clip_grad_global_norm(params, grad_clip)
- trainer.update(1)
-
- step_loss += ls.asscalar()
- metric.update([label], [out])
-
- # Printing vital information
- if (batch_id + 1) % (log_interval) == 0:
- print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f}'
- .format(epoch_id, batch_id + 1, len(bert_dataloader),
- step_loss / log_interval,
- trainer.learning_rate, metric.get()[1]))
- step_loss = 0
-```
-
-## Quantize the model
-
-GluonNLP also delivers INT8 quantization methods to improve performance and reduce deployment costs for natural language inference tasks. In production, lower precision (INT8) has two main benefits. First, the computation can be accelerated by low-precision instructions, like the Intel Vector Neural Network Instructions (VNNI). Second, a lower-precision data type saves memory bandwidth, allows for better cache locality, and saves power. This feature can deliver up to a 4X performance speedup on the latest [AWS EC2 C5 instances](https://aws.amazon.com/blogs/aws/now-available-new-c5-instance-sizes-and-bare-metal-instances/) with [Intel Deep Learning Boost (VNNI)](https://www.intel.ai/intel-deep-learning-boost/) enabled hardware, with less than a 0.5% accuracy drop.
-
-Now that we have a model fine-tuned on the MRPC training dataset, in this section we will quantize it into the INT8 data type using a subset of the MRPC validation dataset.
-
-```{.python .input}
-# The hyperparameters
-dev_batch_size = 32
-num_calib_batches = 5
-quantized_dtype = 'auto'
-calib_mode = 'customize'
-
-# sampler for evaluation
-pad_val = vocabulary[vocabulary.padding_token]
-batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=pad_val), # input
- nlp.data.batchify.Pad(axis=0, pad_val=0), # segment
- nlp.data.batchify.Stack(), # length
- nlp.data.batchify.Stack('int32')) # label
-dev_dataloader = mx.gluon.data.DataLoader(data_train, batch_size=dev_batch_size, num_workers=4,
- shuffle=False, batchify_fn=batchify_fn)
-
-# Calibration function
-def calibration(net, dev_data, num_calib_batches, quantized_dtype, calib_mode):
- """calibration function on the dev dataset."""
- print('Now we are doing calibration on dev with cpu.')
- collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=None)
- num_calib_examples = dev_batch_size * num_calib_batches
- quantized_net = mx.contrib.quantization.quantize_net_v2(net, quantized_dtype=quantized_dtype,
- exclude_layers=[],
- quantize_mode='smart',
- quantize_granularity='channel-wise',
- calib_data=dev_data,
- calib_mode=calib_mode,
- num_calib_examples=num_calib_examples,
- ctx=mx.cpu(),
- LayerOutputCollector=collector,
- logger=None)
- print('Calibration done with success.')
- return quantized_net
-
-# This fallback can be removed once MXNet 1.7 is released.
-try:
- quantized_net = calibration(bert_classifier,
- dev_dataloader,
- num_calib_batches,
- quantized_dtype,
- calib_mode)
-except AttributeError:
- nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx)
-    warnings.warn('INT8 Quantization for BERT needs mxnet-mkl >= 1.6.0b20200115')
-```
-
-## Deployment
-
-After quantization, we can also export the quantized model for inference deployment.
-
-```{.python .input}
-prefix = './model_bert_squad_quantized'
-
-def deployment(net, prefix, dataloader):
- net.export(prefix, epoch=0)
- print('Saving quantized model at ', prefix)
- print('load symbol file directly as SymbolBlock for model deployment.')
- static_net = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(prefix),
- ['data0', 'data1', 'data2'],
- '{}-0000.params'.format(prefix))
- static_net.hybridize(static_alloc=True, static_shape=True)
-    for batch_id, (token_ids, segment_ids, valid_length, label) in enumerate(dataloader):
- token_ids = token_ids.as_in_context(mx.cpu())
- valid_length = valid_length.as_in_context(mx.cpu())
- segment_ids = segment_ids.as_in_context(mx.cpu())
- label = label.as_in_context(mx.cpu())
- out = static_net(token_ids, segment_ids, valid_length.astype('float32'))
- metric.update([label], [out])
-
- # Printing vital information
- if (batch_id + 1) % (log_interval) == 0:
- print('[Batch {}/{}], acc={:.3f}'
-                  .format(batch_id + 1, len(dataloader),
- metric.get()[1]))
- return metric
-
-# This fallback can be removed once MXNet 1.7 is released.
-try:
- eval_metric = deployment(quantized_net, prefix, dev_dataloader)
-except NameError:
- nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx)
-    warnings.warn('INT8 Quantization for BERT needs mxnet-mkl >= 1.6.0b20200115')
-```
-
-## Conclusion
-
-In this tutorial, we showed how to fine-tune a sentence pair
-classification model with pre-trained BERT parameters. In GluonNLP, this can be
-done in just a few simple steps. All we did was apply a BERT-style data transformation to
-pre-process the data, automatically download the pre-trained model, and feed the
-transformed data into the model, all within 50 lines of code!
-
-For demonstration purposes, we skipped the warmup learning rate
-schedule and validation on the dev dataset used in the original
-implementation. Please visit the
-[BERT model zoo webpage](../../model_zoo/bert/index.rst), or the scripts/bert folder
-in the Github repository for the complete fine-tuning scripts.
-
-## References
-
-[1] Devlin, Jacob, et al. "BERT: Pre-training of deep bidirectional transformers
-for language understanding." arXiv preprint arXiv:1810.04805 (2018).
-
-[2] Dolan, William B., and Chris Brockett. "Automatically constructing a corpus of
-sentential paraphrases." Proceedings of the Third International Workshop on
-Paraphrasing (IWP2005). 2005.
-
-[3] Peters, Matthew E., et al. "Deep contextualized word representations."
-arXiv preprint arXiv:1802.05365 (2018).
diff --git a/docs/examples/sentence_embedding/bert.png b/docs/examples/sentence_embedding/bert.png
deleted file mode 100644
index 74243dae62..0000000000
Binary files a/docs/examples/sentence_embedding/bert.png and /dev/null differ
diff --git a/docs/examples/sentence_embedding/dev.tsv b/docs/examples/sentence_embedding/dev.tsv
deleted file mode 100644
index 435bde0d09..0000000000
--- a/docs/examples/sentence_embedding/dev.tsv
+++ /dev/null
@@ -1,409 +0,0 @@
-Quality #1 ID #2 ID #1 String #2 String
-1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy .
-0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
-0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
-1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
-0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
-1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
-0 1490811 1490840 While dioxin levels in the environment were up last year , they have dropped by 75 percent since the 1970s , said Caswell . The Institute said dioxin levels in the environment have fallen by as much as 76 percent since the 1970s .
-1 426112 426210 This integrates with Rational PurifyPlus and allows developers to work in supported versions of Java , Visual C # and Visual Basic .NET. IBM said the Rational products were also integrated with Rational PurifyPlus , which allows developers to work in Java , Visual C # and VisualBasic .Net.
-1 1439663 1439808 The top rate will go to 4.45 percent for all residents with taxable incomes above $ 500,000 . For residents with incomes above $ 500,000 , the income-tax rate will increase to 4.45 percent .
-1 3147370 3147525 The results appear in the January issue of Cancer , an American Cancer Society journal , being published online today . The results appear in the January issue of Cancer , an American Cancer Society ( news - web sites ) journal , being published online Monday .
-1 3300040 3299992 The delegates said raising and distributing funds has been complicated by the U.S. crackdown on jihadi charitable foundations , bank accounts of terror-related organizations and money transfers . Bin Laden ’ s men pointed out that raising and distributing funds has been complicated by the U.S. crackdown on jihadi charitable foundations , bank accounts of terror-related organizations and money transfers .
-0 524136 524119 " Sanitation is poor ... there could be typhoid and cholera , " he said . " Sanitation is poor , drinking water is generally left behind . . . there could be typhoid and cholera . "
-0 969512 969295 The broader Standard & Poor 's 500 Index .SPX gave up 11.91 points , or 1.19 percent , at 986.60 . The technology-laced Nasdaq Composite Index was down 25.36 points , or 1.53 percent , at 1,628.26 .
-1 1685339 1685429 The only announced Republican to replace Davis is Rep. Darrell Issa of Vista , who has spent $ 1.71 million of his own money to force a recall . So far the only declared major party candidate is Rep. Darrell Issa , a Republican who has spent $ 1.5 million of his own money to fund the recall .
-1 1967578 1967664 The decision to issue new guidance has been prompted by intelligence passed to Britain by the FBI in a secret briefing in late July . Scotland Yard 's decision to issue new guidance has been prompted by new intelligence passed to Britain by the FBI in late July .
-1 2047034 2046820 Unable to find a home for him , a judge told mental health authorities they needed to find supervised housing and treatment for DeVries somewhere in California . The judge had told the state Department of Mental Health to find supervised housing and treatment for DeVries somewhere in California .
-1 2046630 2046644 The decision came a year after Whipple ended federal oversight of the district 's racial balance , facilities , budget , and busing . The decision came a year after Whipple ended federal oversight of school busing as well as the district 's racial balance , facilities and budget .
-0 2221603 2221633 In midafternoon trading , the Nasdaq composite index was up 8.34 , or 0.5 percent , to 1,790.47 . The Nasdaq Composite Index .IXIC dipped 8.59 points , or 0.48 percent , to 1,773.54 .
-1 129995 129864 Morgan Stanley raised its rating on the beverage maker to " overweight " from " equal-weight " saying in part that pricing power with its bottlers should improve in 2004 . Morgan Stanley raised its rating on the company to " overweight " from " equal-weight , " saying the beverage maker 's pricing power with bottlers should improve in 2004 .
-0 919683 919782 The pound also made progress against the dollar , reached fresh three-year highs at $ 1.6789 . The British pound flexed its muscle against the dollar , last up 1 percent at $ 1.6672 .
-0 970740 971209 Friday , Stanford ( 47-15 ) blanked the Gamecocks 8-0 . Stanford ( 46-15 ) has a team full of such players this season .
-1 2745055 2745022 Last month Intel raised its revenue guidance for the quarter to between $ 7.6 billion and $ 7.8 billion . At the end of the second quarter , Intel initially predicted sales of between $ 6.9 billion and $ 7.5 billion .
-0 2199097 2199072 The driver , Eugene Rogers , helped to remove children from the bus , Wood said . At the accident scene , the driver was " covered in blood " but helped to remove children , Wood said .
-1 1609290 1609098 ONG KONG , July 9 Tens of thousands of demonstrators gathered tonight before the legislature building here to call for free elections and the resignation of Hong Kong 's leader . Tens of thousands of demonstrators gathered yesterday evening to stand before this city 's legislature building and call for free elections and the resignation of Hong Kong 's leader .
-1 1597193 1597119 Saddam loyalists have been blamed for sabotaging the nation 's infrastructure , as well as frequent attacks on U.S. soldiers . Hussein loyalists have been blamed for sabotaging the nation 's infrastructure and attacking US soldiers .
-1 2758944 2758975 Its closest living relatives are a family frogs called sooglossidae that are found only in the Seychelles in the Indian Ocean . Its closest relative is found in the Seychelles Archipelago , near Madagascar in the Indian Ocean .
-0 2584416 2584653 Cooley said he expects Muhammad will similarly be called as a witness at a pretrial hearing for Malvo . Lee Boyd Malvo will be called as a witness Wednesday in a pretrial hearing for fellow sniper suspect John Allen Muhammad .
-1 86007 86373 " Instead of pursuing the most imminent and real threats - international terrorists , " Graham said , " this Bush administration chose to settle old scores . " " Instead of pursuing the most imminent and real threats - international terrorists - this Bush administration has chosen to settle old scores , " Graham said .
-1 1602860 1602844 He said they lied on a sworn affidavit that requires them to list prior marriages . Morgenthau said the women , all U.S. citizens , lied on a sworn affidavit that requires them to list prior marriages .
-1 1201306 1201329 The association said 28.2 million DVDs were rented in the week that ended June 15 , compared with 27.3 million VHS cassettes . The Video Software Dealers Association said 28.2 million DVDs were rented out last week , compared to 27.3 million VHS cassettes .
-0 461779 461815 With these assets , Funny Cide has a solid chance to become the first Triple Crown winner since Affirmed in 1978 . Funny Cide is looking to become horse racing 's first Triple Crown winner in a generation .
-1 1438666 1438643 Intel was disappointed and assessing its " options in the event Mr. Hamidi resumes his spamming activity against Intel , " spokesman Chuck Mulloy said . Intel spokesman Chuck Mulloy said the company was disappointed and assessing its " options in the event Mr. Hamidi resumes his spamming activity against Intel . "
-1 3261484 3261306 Mr Annan also warned the US should not use the war on terror as an excuse to suppress " long-cherished freedoms " . Annan warned that the dangers of extremism after September 11 should not be used as an excuse to suppress " long-cherished " freedoms .
-1 1277539 1277527 At community colleges , tuition will jump to $ 2,800 from $ 2,500 . Community college students will see their tuition rise by $ 300 to $ 2,800 or 12 percent .
-1 3035788 3035918 He made a point of saying during Tuesdays debate that the Confederate flag was a racist symbol . Though Dean made a point of saying during the debate that the Confederate flag is a racist symbol .
-0 132553 132725 Bush wanted " to see an aircraft landing the same way that the pilots saw an aircraft landing , " White House press secretary Ari Fleischer said yesterday . On Tuesday , before Byrd 's speech , Fleischer said Bush wanted ' ' to see an aircraft landing the same way that the pilots saw an aircraft landing .
-0 2259788 2259747 On Monday the Palestinian Prime Minister , Mahmoud Abbas , will report to the Palestinian parliament on his Government 's achievements in its first 100 days in office . Palestinian Prime Minister Mahmoud Abbas must defend the record of his first 100 days in office before Parliament today as the death toll in the occupied territories continues to rise .
-0 2307064 2307235 The civilian unemployment rate improved marginally last month -- slipping to 6.1 percent -- even as companies slashed payrolls by 93,000 . The civilian unemployment rate improved marginally last month _ sliding down to 6.1 percent _ as companies slashed payrolls by 93,000 amid continuing mixed signals about the nation 's economic health .
-1 3046488 3046824 Per-user pricing is $ 29 for Workplace Messaging , $ 89 for Team Collaboration and $ 35 for Collaborative Learning . Workplace Messaging is $ 29 , Workplace Team Collaboration is $ 89 , and Collaborative Learning is $ 35 .
-1 86020 86007 " Instead of pursuing the most imminent and real threats – international terrorism – this Bush administration chose to settle old scores , " Mr. Graham said . " Instead of pursuing the most imminent and real threats - international terrorists , " Graham said , " this Bush administration chose to settle old scores . "
-0 1100998 1100441 SARS has killed about 800 people and affected more than 8400 since being detected in China in November . SARS has killed about 800 people and sickened more than 8,400 worldwide , mostly in Asia .
-1 2268396 2268480 Authorities had no evidence to suggest the two incidents were connected . There was no immediate evidence that the two incidents were connected , police said .
-0 1984039 1983986 " Jeremy 's a good guy , " Barber said , adding : " Jeremy is living the dream life of the New York athlete . He also said Shockey is " living the dream life of a New York athlete .
-0 2697659 2697747 Ratliff 's daughters , Margaret and Martha Ratliff , were adopted by Peterson after their mother 's death . Peterson helped raise Ratliff 's two daughters , Margaret and Martha Ratliff , who supported him throughout the trial .
-0 2175939 2176090 After losing as much as 84.56 earlier , the Dow Jones industrial average closed up 22.81 , or 0.2 percent , at 9,340.45 . In midday trading , the Dow Jones industrial average lost 68.84 , or 0.7 percent , to 9,248.80 .
-1 886618 886456 Rumsfeld , who has been feuding for two years with Army leadership , passed over nine active-duty four-star generals . Rumsfeld has been feuding for a long time with Army leadership , and he passed over nine active-duty four-star generals .
-1 588637 588864 Consumers who said jobs are difficult to find jumped from 29.4 to 32.6 , while those claiming work was plentiful slipped from 13 to 12.6 . Consumers who said jobs are difficult to find jumped to 32.6 from 29.4 , while those saying work was plentiful slipped to 12.6 from 13 in April .
-0 2252795 2252970 He has no immediate plans for television advertising , believing it is unnecessary this early . A Lieberman aide said there were no immediate plans for television advertising .
-1 1756329 1756394 " I think it happened very quickly , " Houston Police Department homicide investigator Phil Yochum said of the crime . " I think it happened very quickly , " said Investigator Phil Yochum of the Houston Police Department 's homicide division .
-1 1673112 1673068 United issued a statement saying it will " work professionally and cooperatively with all its unions . " Senior vice president Sara Fields said the airline " will work professionally and cooperatively with all our unions . "
-1 2357324 2357271 " But they never climb out of the pot of beer again . " It 's just that they never climb out of the beer again . "
-1 780408 780363 Chief financial officer Andy Bryant has said that hike had a greater affect volume than officials expected . Bryant has said that hike had a greater effect on demand than officials expected .
-1 821523 821385 Robert Liscouski , the Assistant Secretary of Homeland Security for Infrastructure Protection , will oversee NCSD . NCSD 's chief will be Robert Liscouski , the assistant secretary of Homeland Security for Infrastructure Protection .
-1 2304696 2304863 HP 's shipments increased 48 percent year-over-year , compared to an increase of 31 percent for Dell . HPs shipments increased 48 per cent year-on-year , compared to an increase of 31 per cent for Dell .
-1 2531749 2531607 Chirac , who can pardon a law-breaker , refused Humbert 's request last year but kept in close touch with the family . Chirac , who has the authority to pardon law-breakers , refused Humbert 's request to be allowed to die last year but kept in close touch with the family .
-1 3180014 3179967 The charges allege that he was part of the conspiracy to kill and kidnap persons in a foreign country . The government now charges that Sattar conspired with Rahman to kill and kidnap individuals in foreign countries .
-1 726966 726945 In the 2002 study , the margin of error ranged from 1.8 to 4.4 percentage points . It has a margin of error of plus or minus three to four percentage points .
-1 2638861 2638982 Mr. Clinton 's national security adviser , Sandy Berger , said that the White House wasn 't informed of the FBI activities . Clinton ’ s national security adviser , Sandy Berger , said in an interview that the White House was not informed of the FBI activities .
-1 2495223 2495307 " This decision is clearly incorrect , " FTC Chairman Timothy Muris said in a written statement . The decision is " clearly incorrect , " FTC Chairman Tim Muris said .
-1 55187 54831 Prosecutors allege that Nichols and co-conspirator Timothy McVeigh worked together to prepare a bomb that destroyed the Alfred P. Murrah Federal Building . Prosecutors allege that Nichols and coconspirator Timothy McVeigh worked together to prepare a 4,000-pound fuel-and-fertilizer bomb that destroyed the Murrah building .
-0 2763381 2763517 Terri Schiavo , 39 , is expected to die sometime in the next two weeks in the Tampa-area hospice where she has spent the past several years . Terri Schiavo , 39 , underwent the procedure at the Tampa Bay area hospice where she has been living for several years , said her father , Bob Schindler .
-1 1990975 1991132 Secretary of State Colin Powell designated the Chechen leader believed responsible for last year 's hostage standoff in a Moscow theater as a threat to U.S. security Friday . U.S. Secretary of State Colin Powell on Friday designated Chechen rebel leader Shamil Basayev a threat to the security of the United States and to U.S. citizens .
-1 2204353 2204418 " Today , we are trying to convey this problem to Russian President Vladimir Putin and US President George W Bush . " " Today , we are trying to convey this problem to Russian President Vladimir Putin ( news - web sites ) and President Bush ( news - web sites ) . "
-1 60122 60445 That would be a potential setback to Chief Executive Phil Condit 's strategy of bolstering defense-related sales during a slump in jetliner deliveries . The inquiry may hinder Chief Executive Phil Condit 's strategy of bolstering defense-related sales during a slump in jetliner deliveries .
-1 961836 962243 PeopleSoft also said its board had officially rejected Oracle 's offer . Thursday morning , PeopleSoft 's board rejected the Oracle takeover offer .
-0 3140260 3140288 The Dow Jones industrial average ended the day down 10.89 at 9,837.94 , after advancing 111.04 Wednesday . The Dow Jones industrial average fell 10.89 points , or 0.11 percent , to 9,837.94 .
-1 1720166 1720115 Cortisol levels in the saliva of day care children were highest and rose most steeply in those judged by day care center personnel to be the shyest . Cortisol levels in the saliva of day-care children were highest and rose most steeply in those whom day-care centre staffed judged to be the shyest .
-1 2573262 2573319 " The idea that Tony Abbott is in some way a one-dimensional political head-kicker couldn 't be more wrong , " Mr Howard said . " The idea that Tony Abbott is in some way a one-dimensional political head kicker couldn 't be more wrong . "
-0 1353356 1353174 " Biotech products , if anything , may be safer than conventional products because of all the testing , " Fraley said , adding that 18 countries have adopted biotechnology . " Biotech products , if anything , may be safer than conventional products because of all the testing , " said Robert Fraley , Monsanto 's executive vice president .
-1 2738677 2738741 The rate of skin cancer has tripled since the 1950s in Norway and Sweden , according to the study . The study also found that skin cancer nearly tripled in Norway and Sweden since the 1950s .
-1 1638813 1639087 We acted because we saw the existing evidence in a new light , through the prism of our experience on 11 September , " Rumsfeld said . Rather , the US acted because the administration saw " existing evidence in a new light , through the prism of our experience on September 11 " .
-1 1605350 1605425 Trans fat makes up only 1 percent to 3 percent of the total fat Americans consume , compared with 14 percent for saturated fat . Trans fat accounts for 2.5 percent of Americans ' daily calories , compared to 11 percent to 12 percent for saturated fat .
-1 2494149 2494073 However , a recent slide in prices and OPEC 's expectations of a surge in oil inventories have compounded its fears about a further softening of the market . A 14 percent slide in crude prices this month and expectations of a build up in oil inventories compounded OPEC 's fears of a further softening of the market .
-1 3023029 3023229 Peterson , 31 , is now charged with murder in the deaths of his 27-year-old wife and their unborn son . Peterson , 31 , is charged with two counts of first-degree murder in the slayings of his wife , Laci , and their unborn son , Conner .
-1 1351550 1351155 Carlson on Tuesday said he would not recuse himself from the case . Service officials said Carlson refused to recuse himself from the case .
-1 981185 981234 The program will grow to include ports in Dubai , Turkey and Malaysia , among others . The program will be expanded to include areas of the Middle East such as Dubai , Turkey and Malaysia , Mr. Ridge said .
-0 2111629 2111786 McCabe said he was considered a witness , not a suspect . " He is not considered a suspect , " McCabe said .
-1 655498 655391 The woman was exposed to the SARS virus while in the hospital but was not a health care worker , said Dr. Colin D ’ Cunha , Ontario ’ s commissioner of public health . The woman was exposed to the SARS virus while in the hospital but was not a health-care worker , said Dr Colin D 'Cunha , Ontario 's commissioner of public health .
-1 533823 533909 He added that those " are not solely American principles , nor are they exclusively Western . " " These are not solely American principles nor are they exclusively Western , " Rumsfeld said .
-1 581592 581570 " If we don 't march into Tehran , I think we will be in pretty good shape , " he said . " As long as we don 't march on Tehran , I think we are going to be in pretty good shape , " he said .
-0 1010655 1010430 On Saturday , a 149mph serve against Agassi equalled Rusedski 's world record . On Saturday , Roddick equalled the world record with a 149 m.p.h. serve in beating Andre Agassi .
-1 2241925 2242066 Chad Kolton , emergency management spokesman with the Department of Homeland Security , said the government is open to new technologies and methods to communicate more quickly and efficiently . Chad Kolton , emergency management spokesman with the Department of Homeland Security , said the government is open to new ways to communicate .
-1 2796978 2797024 " APEC leaders are painfully aware that security and prosperity are inseparable , " Thai Prime Minister Thaksin Shinawatra told business leaders . " APEC leaders are painfully aware that security and prosperity are inseparable , " Thaksin said .
-0 101746 101775 Danbury prosecutor Warren Murray could not be reached for comment Monday . Prosecutors could not be reached for comment after the legal papers were obtained late Monday afternoon .
-1 327839 327748 Wittig resigned last year after being indicted on federal bank fraud charges involving a real estate loan unrelated to Westar business . Wittig resigned in late November about two weeks after being indicted on bank fraud charges in a real estate case unrelated to the company .
-0 2988297 2988555 Shattered Glass , " starring Hayden Christensen as Stephen Glass , debuted well with $ 80,000 in eight theaters . " Shattered Glass " _ starring Hayden Christensen as Stephen Glass , The New Republic journalist fired for fabricating stories _ debuted well with $ 80,000 in eight theaters .
-1 2217613 2217659 He was arrested Friday night at an Alpharetta seafood restaurant while dining with his wife , singer Whitney Houston . He was arrested again Friday night at an Alpharetta restaurant where he was having dinner with his wife .
-0 2128530 2128455 However , EPA officials would not confirm the 20 percent figure . Only in the past few weeks have officials settled on the 20 percent figure .
-1 2208376 2208198 University of Michigan President Mary Sue Coleman said in a statement on the university 's Web site , " Our fundamental values haven 't changed . " Our fundamental values haven 't changed , " Mary Sue Coleman , president of the university , said in a statement in Ann Arbor .
-1 1980654 1980641 The first products are likely to be dongles costing between US $ 100 and US $ 150 that will establish connections between consumer electronics devices and PCs . The first products will likely be dongles costing $ 100 to $ 150 that will establish connections between consumer electronics devices and PCs .
-0 589579 589557 However , Lapidus expects foreign brands ' sales to be up 4 percent , driven by strong truck sales at Honda Motor Co . Lapidus expects Ford to be down 5 percent , Chrysler down 10 percent and foreign brands up 4 percent driven by strong truck sales at Honda .
-1 1636060 1635946 Michel , who remains in the government , denied that US pressure had provoked the government 's move . Michel , who has stayed in the new government , denied that it was U.S. pressure which had provoked the government 's move .
-1 1630585 1630657 Some of the computers also are used to send spam e-mail messages to drum up traffic to the sites . Some are also used to send spam e-mail messages to boost traffic to the sites .
-0 447728 447699 Indonesia 's army has often been accused of human rights abuses during GAM 's battle for independence , charges it has generally denied while accusing the separatists of committing rights violations . Indonesia 's army has been accused of human rights abuses during its earlier battles with GAM , charges it has generally denied .
-1 1606495 1606619 Bush also hoped to polish his anti-AIDS credentials in Uganda , which has been hailed as an African pioneer in fighting the killer disease . President Bush flies to Uganda Friday hoping to polish his anti- AIDS credentials in a country hailed as an African pioneer in fighting the epidemic .
-1 1550897 1550977 Later this year , the command will send trainers with soldiers from four North African nations on patrolling and intelligence gathering missions . This fall the command will send trainers to work with soldiers from four North African nations on patrolling and gathering intelligence .
-0 490376 490490 The reports helped overcome investor jitters after the euro briefly hit an all-time high against the dollar Tuesday . Stocks slipped at the open after the euro hit record highs against the dollar .
-1 3084554 3084612 Sales for the quarter beat expectations , rising 37 percent year-on-year to 1.76 billion euros . Sales rose 37 per cent year-on-year to 1.76bn , beating expectations .
-1 315647 315778 If the MTA 's appeal to a higher court is successful , the $ 2 bus and subway base fare won 't be rolled back . If the MTA 's appeal is successful , the $ 2 bus and subway base fare won 't change .
-1 3428298 3428362 Robert Walsh , 40 , remained in critical but stable condition Friday at Staten Island University Hospital 's north campus . Walsh , also 40 , was in critical but stable condition at Staten Island University Hospital last night .
-1 2523564 2523358 The Guru microcontroller serves four functions : hardware monitoring , overclocking management , BIOS ( Basic Input Output System ) update and a troubleshooting-assistance feature called Black Box . The µGuru microcontroller serves four functions : hardware monitoring , overclocking management , BIOS update and a troubleshooting-assistance feature called Black Box .
-1 2079200 2079131 U.S. corporate bond yield spreads tightened in spotty trading on Friday as Wall Street labored to get back on its feet after the largest power outage ever in North America . U.S. stocks rose slightly on feather-light volume on Friday , as Wall Street regrouped after the biggest-ever power outage in North America .
-1 818091 817811 The company said it would issue revised guidance for the full fiscal year next month when it releases its Q2 results . The company said it would renew its guidance for 2003 when it announces its second quarter results in mid-July .
-1 1580638 1580663 " I stand 100 percent by it , and I think our intelligence services gave us the correct information at the time . " I stand 100 percent by it , and I think that our intelligence services gave us the correct intelligence and information at the time , " Blair said .
-0 1919740 1919926 " I don 't know if the person I 'm talking to now may end up being someone else at another time that may not follow the rules , " Parrish said . " I don 't know whether the person I 'm talking to now may end up being someone else , " Parrish said .
-1 2748287 2748550 " I think it 's going to be a close vote , but I think the grant proposal is going to win , " McConnell said . " I think it 's going to be a close vote , but I think the grant proposal 's going to win , " said Sen. Mitch McConnell , assistant majority leader .
-1 3394891 3394775 Twenty-eight people were believed to have been spending Christmas Day with the caretaker of the St Sophia 's camp , when the mudslide smashed into two cabins . Twenty-seven people were believed to have been spending Christmas Day with the caretaker of Saint Sophia Camp , a Greek Orthodox facility , when the mudslide roared through .
-0 2963943 2963880 One , Capt. Doug McDonald , remained hospitalized in critical condition on Thursday . Her 20-year-old sister , Allyson , was severely burned and remained hospitalized in critical condition .
-0 1865364 1865251 The United States finally relented during President Bush 's visit to Africa earlier this month . During President Bush 's trip to Africa earlier this month , however , Washington said it would support the increase .
-1 263690 263819 " There is no conscious policy of the United States , I can assure you of this , to move the dollar at all , " he said . He also said there is no conscious policy by the United States to move the value of the dollar .
-1 283751 283290 It 's the first such drill since the September 11 terrorist attacks on New York and Washington . It is the nation 's first large-scale counterterrorism exercise since the Sept . 11 terrorist attacks .
-1 2517014 2516995 Myanmar 's pro-democracy leader Aung San Suu Kyi will return home late Friday but will remain in detention after recovering from surgery at a Yangon hospital , her personal physician said . Myanmar 's pro-democracy leader Aung San Suu Kyi will be kept under house arrest following her release from a hospital where she underwent surgery , her personal physician said Friday .
-1 1330643 1330622 According to the Merchant Marine Ministry , the 37-year-old ship is registered to Alpha Shipping Inc. based in the Pacific Ocean nation of Marshall Islands . The Baltic Sky is a 37-year-old ship registered to Alpha Shipping Inc. based in the Pacific Ocean nation of Marshall Islands .
-1 3111452 3111428 In an unusual move , the U.S. Patent and Trademark Office is reconsidering a patent affecting Internet pages that critics contend could disrupt millions of Web sites . In an unusual move that critics contend could disrupt millions of Web sites , the U.S. Patent and Trademark Office is reconsidering a patent affecting Internet pages .
-0 1167835 1167651 Kansas Department of Health and Environment records show there were 88 abortions performed on girls age 14 and younger last year . Statistics from the Kansas Department of Health and Environment show that 11,844 abortions were performed in the state last year .
-0 1423836 1423708 A European Union spokesman said the Commission was consulting EU member states " with a view to taking appropriate action if necessary " on the matter . Laos 's second most important export destination - said it was consulting EU member states ' ' with a view to taking appropriate action if necessary ' ' on the matter .
-1 2090911 2091154 Waiting crowds filling the streets on both sides overwhelmed the peacekeepers soon after daylight , sweeping past the barbed wire barricades . But waiting crowds filling the streets rushed the bridges soon after daylight , overrunning razor-wire barricades .
-1 2265271 2265152 Barry Callebaut will be able to use Brach 's retail network to sell products made from its German subsidiary Stollwerck , which makes chocolate products not sold in the United States . Barry Callebaut will be able to use Brach 's retail network to sell products made from its German subsidiary Stollwerck , which makes chocolate products unknown to the American market .
-1 3062202 3062308 By skirting the FDA 's oversight , Eagan said , the quality of the imported drugs is " less predictable " than for those obtained in the United States . By skirting the FDA 's oversight , Eagan said the quality of the imported drugs is " less predictable " than U.S. drugs .
-1 2155514 2155377 He said : " For the first time there is an easy and affordable way of making this treasure trove of BBC content available to all . " " For the first time , there is an easy and affordable way of making this treasure trove of BBC content available to all , " Dyke said .
-1 1552068 1551928 Three such vigilante-style attacks forced the hacker organizer , who identified himself only as " Eleonora [ 67 ] , " to extend the contest until 7 p.m. EST Sunday . Three such vigilante-style attacks forced the hacker organiser , who identified himself only as " Eleonora67 ] , " to extend the contest until 8am ( AEST ) today .
-1 936978 937500 Eric Gagne pitched a perfect ninth for his 23rd save in as many opportunities . Gagne struck out two in a perfect ninth inning for his 23rd save .
-0 985015 984975 One way or another , Harry Potter And The Order Of The Phoenix will be in your hands by Saturday . Just about everything about " Harry Potter and the Order of the Phoenix " will set records .
-1 1430357 1430425 " Allison just proves you don 't need to wait until August or September to have a disaster , " said Josh Lichter , a meteorologist with the Houston-Galveston weather office . " Allison just proves you don 't need to wait until August or September to have a disaster , " Lichter said .
-1 3039310 3039413 Today , analysts say , UN members can no longer ignore the shifts since the September 11 2001 attacks . On Wednesday , analysts say , UN members can no longer ignore the shifts since the attacks in the US of September 11 2001 .
-1 34513 34742 Police say CIBA was involved in the importation of qat , a narcotic substance legal in Britain but banned in the United States . Mr McKinlay said that CIBA was involved in the importation of qat , a narcotic substance legal in Britain but banned in the US .
-1 368067 368018 Chiron already has nearly 20 percent acceptances from PowderJect 's shareholders . Chiron has acceptances from holders of nearly 20 percent of PowderJect shares .
-0 611663 611716 Ernst & Young has denied any wrongdoing and plans to fight the allegations . Ernst & Young has denied the SEC 's claims , and called its recommendations " irresponsible " .
-1 98432 98657 The attack followed several days of disturbances in the city where American soldiers exchanged fire with an unknown number of attackers as civilians carried out demonstrations against the American presence . The attack came after several days of disturbance in the city in which U.S. soldiers exchanged fire with an unknown number of attackers as civilians protested the American presence .
-1 3039007 3038845 No company employee has received an individual target letter at this time . She said no company official had received " an individual target letter at this time . "
-1 1708040 1708062 Second-quarter results reflected a gain of 10 cents per diluted share , while the 2002 results included a loss of 19 cents per diluted share . The second-quarter results had a non-operating gain of 10 cents a share while the 2002 second-quarter performance had a net non-operating loss of 19 cents a share .
-0 1757264 1757375 He allegedly told his ex-wife in an angry phone call that he had no intention of following their new custody agreement . The two had battled over custody and he allegedly told her in an angry phone call that he had no intention of following their new custody agreement .
-1 383417 383558 Worldwide , more than 50 million people have seen " Les Miz , " with gross receipts of $ 1.8 billion . Worldwide , Les Misérables has been seen by over 50 million people , with a total gross of over $ 2 billion .
-0 2766112 2766084 In fiction : Edward P. Jones ( " The Known World " ) and Scott Spencer ( " A Ship Made of Paper " ) . The fifth nominee for fiction is Scott Spencer , for A Ship Made of Paper .
-1 1261116 1261234 " Overwhelmingly the Windows brand really resonated with them . " " Windows was the part of the experience that really resonated with people . "
-1 3028143 3028234 The Centers for Medicare and Medicaid Services , the federal agency that runs Medicare , last year began a similar effort for nursing homes . The Centers for Medicare and Medicaid launched a similar consumer tool for nursing homes last year .
-0 249699 249623 Vivace was founded in 1999 and has raised over $ 118 million in three rounds of venture financing . During difficult times for technology venture capital , Vivace raised over $ 118 million in three rounds of venture financing .
-0 3448488 3448449 The Dow Jones industrial average < .DJI > added 28 points , or 0.27 percent , at 10,557 , hitting its highest level in 21 months . The Dow Jones industrial average < .DJI > rose 49 points , or 0.47 percent , to 10,578 .
-1 2749322 2749663 The Democratic candidates also began announcing their fund-raising totals before Wednesday 's deadline to file quarterly reports with the Federal Election Commission . The Democratic candidates also began announcing their fund-raising totals in advance of the deadline today to file quarterly reports with the Federal Election Commission .
-0 2204592 2204588 Sun Microsystems Inc. on Thursday said it had added 100 new third-party systems and 100 new components to its Hardware Compatibility List for the Solaris x86 operating system Platform Edition . The vendor has added 100 new third-party systems and 100 new components to the operating system 's Hardware Compatibility List ( HCL ) .
-1 2889005 2888954 Prosecutors said PW Marketing violated the state 's 1998 anti-spam law by sending unsolicited e-mail without a toll-free number for recipients to call to stop additional mailings . Prosecutors said PW Marketing violated the 1998 anti-spam law because these unsolicited e-mails were sent without a free call number for recipients to phone to stop additional mailings .
-0 1657632 1657619 The Neighbours star and singer spent yesterday resting at her family home in Sydney and will have more tests today . Goodrem spent yesterday resting in her family home in Sydney and will have more tests today to determine her exact treatment .
-0 555617 555528 The 3 rd Armored Cavalry Regiment is 5,200 strong and the largest combat unit at Fort Carson . Broomhead , 34 , was assigned to the 2nd Squadron , 3rd Armored Cavalry Regiment .
-1 2396937 2396818 " The risk of inflation becoming undesirably low remains the predominant concern for the foreseeable future , " the Fed said in a statement accompanying the unanimous decision . " The risk of inflation becoming undesirably low remains the predominant concern for the foreseeable future , " the policy-setting Federal Open Market Committee said .
-0 2339738 2339771 " It is bad for Symbian , " said Per Lindberg , analyst at Dresdner Kleinwort Wasserstein . " Motorola has displayed clear disloyalty " to Symbian , said Per Lindberg , an analyst at Dresdner Kleinwort Wasserstein in London .
-0 1616174 1616206 Bob Richter , a spokesman for House Speaker Tom Craddick , had no comment about the ruling . Bob Richter , spokesman for Craddick , R-Midland , said the speaker had not seen the ruling and could not comment .
-1 635783 635802 But Ms Ward said the headroom under its financial covenants was " tight " and that there could be another downgrade if Southcorp breached any of its banking covenants . But Ms Ward said the headroom under its financial covenants was " tight " and that there could be a rating downgrade if Southcorp did breach any banking covenants .
-1 3444633 3444733 He added : ``I 've never heard of more reprehensiblebehaviour by a doctor . The Harrisons ’ lawyer Paul LiCalsi said : “ I ’ ve never heard of more reprehensible behaviour by a doctor .
-1 555553 555528 Broomhead was assigned to 2nd Squadron , 3rd Armor Cavalry Regiment , based at Fort Carson . Broomhead , 34 , was assigned to the 2nd Squadron , 3rd Armored Cavalry Regiment .
-1 1112021 1111925 Other staff members , however , defended the document , saying it would still help policy-makers and the agency improve efforts to address the climate issue . Some E.P.A. staff members defended the document , saying that although pared down it would still help policy makers and the agency address the climate issue .
-0 2749410 2749625 President Bush raised a record-breaking $ 49.5 million for his re-election campaign over the last three months , with contributions from 262,000 Americans , the president 's campaign chairman said Tuesday . President Bush has raised $ 83.9 million since beginning his re-election campaign in May , and has $ 70 million of that left to spend , his campaign said Tuesday .
-1 1629064 1629043 An episode is declared when the ozone reaches .20 parts per million parts of air for one hour . A Stage 1 episode is declared when ozone levels reach 0.20 parts per million .
-1 789691 789665 " He may not have been there , " the defence official said on Thursday . " He may not have been there , " said a defence official speaking on condition of anonymity .
-1 844421 844679 The U.N. troops are in Congo to protect U.N. installations and personnel , and they can only fire in self defense and have been unable to stem the violence . The troops - whose mandate is to protect U.N. installations and personnel - can only fire in self-defense and have been unable to stem the violence .
-1 58540 58567 North American markets grabbed early gains Monday morning , as earnings season begins to slow and economic indicators take the spotlight . North American futures pointed to a strong start to the first trading session of the week Monday , as earnings season slows and economic indicators take the spotlight .
-1 781439 781461 Xerox itself paid a $ 10 million fine last year to settle similar SEC charges . Xerox itself previously paid a $ 10-million penalty to settle the SEC accusations .
-1 1909579 1909408 " This deal makes sense for both companies , " said National Chief Executive Brian Halla . " This deal makes sense for both companies , " Halla said in a prepared statement .
-0 787432 787464 The blasts killed two people and injured more than 150 others . The Atlanta Olympic Games attack killed one woman and injured more than 100 other people .
-0 52758 52343 Morrill 's wife , Ellie , sobbed and hugged Bondeson 's sister-in-law during the service . At the service Morrill 's widow , Ellie , sobbed and hugged Bondeson 's sister-in-law as people consoled her .
-1 1675025 1675047 Spansion products are to be available from both AMD and Fujitsu , AMD said . Spansion Flash memory solutions are available worldwide from AMD and Fujitsu .
-1 2131318 2131372 About 1,500 police will be deployed for the visit . Around 1,500 police are to be deployed at Niigata for the ferry 's visit .
-1 325763 325928 Gamarekian told The News she remembers only the woman 's first name - and refused to reveal it . She told the New York Daily News she remembers only the intern 's first name , which she refused to reveal .
-1 2638975 2638855 One of the FBI ’ s key operatives , who had a falling out with the bureau , provided an account of the operation at a friend ’ s closed immigration court proceeding . One of the FBI 's key operatives , who has had a falling-out with the bureau , provided an account of the operation at a friend 's closed immigration court proceeding .
-1 2198694 2198937 A nationally board certified teacher with a master 's degree , Kelley makes a salary of $ 65,000 in his 30th year . A nationally board certified teacher with a master 's degree , Kelley , in his 30th year teaching , makes $ 65,000 .
-1 1825432 1825301 A man arrested for allegedly threatening to shoot and kill a city councilman from Queens was ordered held on $ 100,000 bail during an early morning court appearance Saturday . The Queens man arrested for allegedly threatening to shoot City Councilman Hiram Monserrate was held on $ 100,000 bail Saturday , a spokesman for the Queens district attorney said .
-1 2906104 2906322 They were being held Sunday in the Camden County Jail on $ 100,000 bail . They remained in Camden County Jail on Sunday on $ 100,000 bail .
-1 722278 722383 Ms Stewart , the chief executive , was not expected to attend . Ms Stewart , 61 , its chief executive officer and chairwoman , did not attend .
-0 101747 101777 Christina 's aunt , Shelley Riling , said the defense 's claims were preposterous . Christina 's aunt , Shelley Riling , said she will address the court .
-1 2224884 2224819 The Justice Department Aug. 19 gave pre-clearance for the Oct. 7 date for the election to recall Gov. Gray Davis , saying it would not affect minority voting rights . The Justice Department on Aug. 19 sanctioned the Oct. 7 date for recall election , saying it would not affect voting rights .
-0 977938 978162 Lord Falconer hailed the changes as " a new beginning as far as the courts , Crown Prosecution Service and police are concerned " . " It 's a new beginning as far as the courts , Crown Prosecution Service and police are concerned , making the criminal justice system work better . "
-0 1015010 1014963 GE stock closed at $ 30.65 a share , down about 42 cents , on the New York Stock Exchange . GE 's shares closed at $ 30.65 on Friday on the New York Stock Exchange .
-1 1513190 1513246 At least 27 US troops have been killed in hostile fire since Bush 's statement . At least 26 American troops have been killed in hostile fire since major combat was officially declared over on May 1 .
-1 2385348 2385394 A recent poll showed Edwards with a narrow lead in South Carolina , and he plans a rally there later on Tuesday . A recent poll showed Edwards in a virtual four-way tie at the top in South Carolina , and he plans a rally there later on Tuesday .
-1 2317018 2317252 November 17 's last victim was British defence attache Stephen Saunders , who was shot on an Athens road in June 2000 . November 17 's last victim was British defense attache Stephen Saunders , who was shot and killed at point-blank range on a busy Athens road in June 2000 .
-0 1831696 1831660 The agency charged that one WD Energy worker discussed false reporting with traders at two other energy companies . The agency found further that a WD Energy employee discussed false reporting with traders at two other energy companies , which the CFTC didn 't identify .
-1 1528383 1528083 Zulifquar Ali , a worshipper slightly wounded by shrapnel , said the assailants first targeted the mosque 's security guards . Witness Zulfiqar Ali , who was slightly wounded by shrapnel , said the attackers had focused on the mosque 's guards .
-1 917965 918315 For the second year in a row , rises in hospital costs accounted for much of the inflation , accounting for 51 percent of the overall cost increase . For the second year in a row , rises in hospital costs dominated the increase , accounting for 51 percent of the overall cost spiral .
-0 3218713 3218830 Q : Can I buy coverage for prescription drugs right away ? Congress has added a new benefit - an option to buy insurance coverage for prescription drugs .
-1 221079 221003 The airline also said it has the option to buy 380 more airplanes , orders that would be split evenly between the two manufacturers . The airline has the option to buy 380 more , split evenly between the two manufacturers .
-1 2546175 2546198 Dr Mark McClean , Jonathan 's family doctor , said if the drug had been administered earlier Jonathan would have retained more of his brain functions . Dr Mark McClean , the family 's GP , said had the drug been administered to Jonathan earlier , he would have retained more of his brain function .
-0 799346 799268 The chain operates more than 3,400 stores , and has annual revenue of about $ 15.8 billion . The chain , which has been under new management since late 1999 , has more than 3,400 stores and $ 15.8 billion in annual revenue .
-0 2673104 2673130 All patients developed some or all of the symptoms of E. coli food poisoning : bloody diarrhea , vomiting , abdominal cramping and nausea . Symptoms of the E. coli infection include bloody diarrhea , nausea , vomiting and abdominal cramping .
-1 1354501 1354476 Federal regulators have turned from sour to sweet on a proposed $ 2.8 billion merger of ice cream giants Nestle Holdings Inc. and Dreyer 's Grand Ice Cream Inc . Federal regulators have changed their minds on a proposed $ 2.8 billion merger of ice cream giants Nestle Holdings and Dreyer 's Grand Ice Cream .
-1 3070979 3070949 Environmental campaigners are using this weekend ’ s lunar eclipse to highlight the huge increase in light pollution across the UK . Environmental campaigners used the eclipse to highlight the surge in light pollution across Britain .
-0 1264509 1264471 Available July 7 , the software supports the Solaris , IBM AIX , Red Hat Linux and Windows operating systems . The OpForce product currently works with Solaris , AIX , Red Hat Linux and Windows servers .
-1 103280 103431 Justice Minister Martin Cauchon and Prime Minister Jean Chrétien have both said the Liberal government will introduce legislation soon to decriminalize possession of small amounts of pot for personal use . Justice Minister Martin Cauchon and Prime Minister Jean Chretien both have said the government will introduce legislation to decriminalize possession of small amounts of pot .
-0 110731 110648 But Chauncey Billups demonstrated he 's also capable of big games , scoring 77 points over the final two games against the Magic . Billups scored 77 points in the final two games of the first-round series against the Magic .
-1 2274844 2274714 Kelly killed himself after being exposed as the source for a BBC report which claimed the government had embellished evidence of Iraq 's banned weapons to justify the war . He killed himself after being exposed as the source for a BBC report which claimed the government exaggerated the case for war against Iraq .
-0 1050307 1050144 And it 's going to be a wild ride , " said Allan Hoffenblum , a Republican consultant . Now the rest is just mechanical , " said Allan Hoffenblum , a Republican consultant .
-1 2810634 2810670 While the Ibrahims had one separation operation , Goodrich and Dr. David Staffenberg plan about three for the Aguirres , with several weeks between each . Instead of one long operation to separate the twins , Goodrich and Dr. David Staffenberg plan about three , with several weeks between each .
-1 3073773 3073779 Lay had contended that turning over the documents would violate his Fifth Amendment right against self-incrimination . Lay had refused to turn over the papers , asserting his Fifth Amendment right against self-incrimination .
-0 261202 260995 The WHO experts didn 't say how many cases in Hebei were in rural areas . Hebei has reported 191 cases and eight deaths , though the WHO experts did not say how many were in rural areas .
-1 1824224 1824209 Nearly 300 mutinous troops who seized a Manila shopping and apartment complex demanding the government resign gave up and retreated peacefully after some 19 hours . Mutinous troops who seized a Manila shopping and apartment complex demanding the government resign ended a 19-hour standoff late Sunday and returned to barracks without a shot fired .
-1 548867 548785 In three years , Lend Lease has slipped from a top-five stock , when its share price was around $ 24 , to 37th . In the space of three years , Lend Lease has slipped from a top-five 5 stock when its share price hovered around $ 24 to 37th on the list .
-0 2796658 2796682 About two hours later , his body , wrapped in a blanket , was found dumped a few blocks away . Then his body was dumped a few blocks away , found in a driveway on Argyle Road .
-1 1808166 1808434 Columbia broke up over Texas upon re-entry on Feb. 1 . Columbia broke apart in the skies above Texas on Feb. 1 .
-1 853475 853342 A year or two later , 259 , or 10 per cent , of the youths reported that they had started to smoke , or had taken just a few puffs . Within two years , 259 , or 10 percent , of the youths reported they had started to smoke or had at least taken a few puffs .
-0 977772 977804 The Lord Chancellor was guardian of the Great Seal , used to stamp all official documents from the sovereign . Falconer will hold on , for now , to the Lord Chancellor 's Great Seal , used to sign off instructions from the sovereign .
-1 577854 578500 Cindy Yeast , a 50-year-old Washington-area publicist , says she began taking supplements two years ago in part to avoid mild dementia that affects her elderly parents . She started taking supplements two years ago - partly to stave off mild dementia that affects her elderly parents .
-1 2829194 2829229 The two are not related , but have referred to each other as father and son . He 's not related to Malvo , but the two have referred to each other as father and son .
-1 2074182 2074668 Gibson said last month in a press statement that " neither I nor my film are anti-Semitic . Gibson said in a June statement that he and his film are not anti-Semitic .
-0 2758265 2758282 The world 's largest software company said it recognized the difficulty the multiple patches posed for companies , and set out to make it easier for them to apply the updates . The world 's largest software company said it recognized the difficulty the multiple patches posed for companies trying to apply them .
-1 1958079 1958143 The Dow Jones industrial average .DJI ended up 64.64 points , or 0.71 percent , at 9,191.09 , according to the latest available data . The blue-chip Dow Jones industrial average .DJI added 38 points , or 0.42 percent , to 9,165 .
-1 544217 544325 The vote came just two days after Kurds swept City Council elections , taking the largest single block of votes on the 30-seat council . The vote for mayor followed City Council elections that gave Kurds the largest block of votes on the 30-seat council .
-1 2385288 2385256 Large swells and dangerous surf already were being felt along sections of the coast . Already large swells and dangerous surf have arrived along the mid-Atlantic .
-0 2324708 2325028 Based on a separate survey of households , the unemployment rate fell in August to 6.1 percent from 6.2 percent . Labor Department analysts discounted a slight improvement in the national unemployment rate , which fell in August to 6.1 percent from 6.2 percent .
-1 2139506 2139427 " We will work with the board to ensure a smooth transition . " He said federal regulators would work with the corporation to ensure a " smooth transition . "
-1 2965576 2965701 Gasps could be heard in the courtroom when the photo was displayed . Gasps could be heard as the photo was projected onto the screen .
-1 2931098 2931144 Gilead had earnings of $ 73.1 million , or 33 cents a share , compared with $ 20.8 million , or 10 cents , in the year-ago quarter . Quarterly profit climbed to $ 73.1 million , or 33 cents a share , from $ 20.8 million , or 10 cents , a year earlier , the company said .
-0 644788 644816 " I had one bad stretch of holes that put me out of contention to win , " Woods said . " I had one bad stretch of holes that put me out of contention , " Woods said , referring to his 42 on the front nine Saturday .
-0 2551891 2551563 The poll had a margin of error of plus or minus 2 percentage points . It had a margin of sampling error of plus or minus four percentage points and was conducted Thursday through Saturday .
-1 1089053 1089297 Sen. Patrick Leahy of Vermont , the committee 's senior Democrat , later said the problem is serious but called Hatch 's suggestion too drastic . Sen. Patrick Leahy , the committee 's senior Democrat , later said the problem is serious but called Hatch 's idea too drastic a remedy to be considered .
-1 3435735 3435717 The broad Standard & Poor 's 500 < .SPX > eased 0.37 of a point , or 0.03 percent , at 1,121 . The Standard & Poor 's 500 Index < .SPX > slipped 0.26 point , or 0.02 percent , to 1,121.96 .
-0 1954 2142 Watertown , Saugus and Framingham also are going smoke-free Monday , joining a growing number of cities around the country . Along with Boston , Watertown , Saugus and Framingham also are going smoke-free Monday .
-1 3400796 3400822 That is evident from their failure , three times in a row , to get a big enough turnout to elect a president . Three times in a row , they failed to get a big _ enough turnout to elect a president .
-1 1220668 1220801 We firmly believe we have an absolute right to use the common word ' spike ' as the name of our network . " We firmly believe that we have an absolute right to use the common word ' spike ' to name our network .
-1 1889954 1889847 Sources who knew of the bidding said last week that cable TV company Comcast Corp. was also looking at VUE . Late last week , sources told Reuters cable TV company Comcast Corp. CMCSA.O also was looking at buying VUE assets .
-1 315785 315653 But MTA officials appropriated the money to the 2003 and 2004 budgets without notifying riders or even the MTA board members considering the 50-cent hike , Hevesi found . MTA officials appropriated the surplus money to later years ' budgets without notifying riders or the MTA board members when the 50-cent hike was being considered , he said .
-0 1521034 1520582 White , who had suffered kidney failure from years of high blood pressure , died at Cedars-Sinai Medical Center around 9 : 30 a.m. , said manager Ned Shankman . White , who had kidney failure from years of high blood pressure , had been undergoing dialysis and had been hospitalized since a September stroke .
-1 2083598 2083810 About 10 percent of high school and 16 percent of elementary students must be proficient at math . In math , 16 percent of elementary and middle school students and 9.6 percent of high school students must be proficient .
-1 1910610 1910455 The legal ruling follows three days of intense speculation Hewlett-Packard Co. may be bidding for the company . The legal ruling follows three days of wild volatility in RIM 's stock over speculation that PC giant Hewlett-Packard Co. may be bidding for the company .
-1 3113791 3113782 The European Commission , the EU 's antitrust enforcer , is expected to issue its decision next spring — unless a settlement is reached . The European Commission is expected to issue its decision in the case next spring — unless a settlement is reached .
-1 3214517 3214483 " So Sebastian did his best to convincingly confess to a crime that he didn 't commit in order to survive , " she told jurors . " Sebastian did his best to confess convincingly to a crime he didn 't do in order to survive , " Ms. Richardson declared .
-0 2083612 2083810 Twenty percent of Latino students and 23 percent of black students performed at proficient or higher . In math , 16 percent of elementary and middle school students and 9.6 percent of high school students must be proficient .
-1 661390 661218 He is charged in three bombings in Atlanta including a blast at the 1996 Olympics and one in Alabama . He is charged in three bombings in Atlanta - including a blast at the 1996 Olympics - along with the bombing in Alabama .
-1 1269572 1269682 The men were remanded in custody and are due to appear again before court on July 8 . They were remanded in custody and will appear in court again on July 8 .
-1 1095780 1095652 " No matter who becomes the sponsor for stock-car racing 's top series , NASCAR will need an all-star event , " Wheeler said in a statement . No matter who becomes the sponsor for stock-car racings top series , NASCAR will need an all-star event , Wheeler said Tuesday .
-1 116294 116332 The Phillies were upset that Counsell had stolen second in the sixth inning with Arizona leading 7-1 . The Phillies were apparently upset when Counsell stole during the sixth with the Diamondbacks up 7-1 .
-1 941617 941673 He said his hatred for such people grew from these discussions and had helped convince him violence was the answer . His hatred for these people had germinated from these discussions and helped cement his belief that violence was the panacea .
-1 2640607 2640576 " There is no need for one deadline for all to create the ASEAN Economic Community , " Thaksin said . Thus , he said , there did not have to one deadline to create the economic community .
-1 3310210 3310286 The announcement was made during the recording of a Christmas concert attended by top Vatican cardinals , bishops , and many elite from Italian society , witnesses said . The broadside came during the recording on Saturday night of a Christmas concert attended by top Vatican cardinals , bishops and many elite of Italian society , witnesses said .
-1 3376093 3376101 The additional contribution brings total U.S. food aid to North Korea this year to 100,000 tonnes . The donation of 60,000 tons brings the total of U.S. contributions for the year to 100,000 .
-1 1549586 1549609 Leon Williams ' body was found inside his third-floor apartment at 196 Bay St. , in Tompkinsville . The dead man , Leon Williams , was found in his third-floor apartment .
-1 460211 460445 The player 's eyes were bloodshot and a blood-alcohol test produced a reading of 0.18 - well above Tennessee 's level of presumed intoxication of 0.10 , the report said . He failed a field sobriety test and a blood-alcohol test produced a reading of 0.18 – well above Tennessee 's level of presumed intoxication of 0.10 , the report said .
-1 1196962 1197061 But Virgin wants to operate Concorde on routes to New York , Barbados and Dubai . Branson said that his preference would be to operate a fully commercial service on routes to New York , Barbados and Dubai .
-0 862804 862715 He tried to fight off officers and was taken to a hospital after a police dog bit him but was later released . Cruz tried to fight off officers and was hospitalized after a police dog bit him , Sgt. Steve Dixon said .
-1 1726935 1726879 The announcement , which economists said was not a surprise , may be bittersweet for the millions of Americans without jobs . Economists said the announcement was not a surprise , and politicians said it offered little comfort to the millions of Americans without jobs .
-0 331980 332110 Asked if the delegates could leave on Friday , police intelligence chief in Aceh , Surya Dharma , told reporters they could not because they did not have proper permission . Asked if the delegates could leave on Friday , police intelligence chief Surya Dharma told reporters : " Of course they may not go .
-1 173879 173832 Dealers said the dollar also drew some downside support as Japanese investors are expected to keep snapping up foreign bonds amid the yen 's rise against the dollar . Dealers said the dollar also drew some downside support as Japanese investors are expected to keep snapping up foreign bonds amid ever-falling domestic interest rates .
-0 2834988 2835026 Iran has until the end of the month to satisfy the agency it has no plans for nuclear weapons . The Iranians have until the end of the month to answer all the agency 's questions about their past nuclear activities .
-1 2587300 2587243 Her father , Florin Cioaba , the king of Transylvania 's Gypsies , had her brought back and she was married against her will . Her father , Roma King Florin Cioaba , had her brought back and she was promptly married against her will .
-0 554905 554627 Claire had advanced to the third round of the 76th annual Scripps Howard National Spelling Bee . One by one they strolled to the microphone , all 251 youngsters in the 76th Scripps Howard National Spelling Bee .
-1 1912524 1912648 Citigroup Inc . C.N , the world 's largest financial services company , on Wednesday promoted Marjorie Magner to chairman and chief executive of its global consumer group . Citigroup ( C ) on Wednesday named Marjorie Magner chairman and chief executive of its colossal global consumer business .
-1 3255597 3255668 " They 've been in the stores for over six weeks , " says Carney . The quarterlies usually stay in stores for between six to eight weeks , " Carney added .
-1 629316 629289 Let me just say this : the evidence that we have of weapons of mass destruction was evidence drawn up and accepted by the joint intelligence community . " The evidence that we had of weapons of mass destruction was drawn up and accepted by the Joint Intelligence Committee , " he said .
-1 54181 53570 Ridge said no actual explosives or other harmful substances will be used . Ridge said no real explosives or harmful devices will be used in the exercise .
-1 723557 724115 Thus far , Stewart 's company appears ready to stand behind her . For now , the company 's management appears to be standing behind Stewart .
-0 2607718 2607708 But late Thursday night , the campaign issued a statement saying there would be no news conference and no big announcement . But late yesterday , the campaign and the state Democratic Party said there would be no news conference .
-1 753858 753890 There 's also a flaw that results because IE does not implement an appropriate block on a file download dialog box . The second vulnerability is a result of IE not implementing a block on a file download dialog box .
-1 587009 586969 Another $ 100-million in savings will come from management layoffs and pay cuts . The airline expects to save another $ 100-million a year through management layoffs and pay cuts .
-1 308567 308525 He called on Prime Minister John Howard to establish a royal commission on child sex abuse . The Senate motion also called on Prime Minister John Howard to hold a royal commission into child sex abuse .
-0 665419 665612 " We think that the United States of America should support the free speech of all groups , " Mr. White said , objecting to Mr. Olson 's recommendation . We think that the United States of America should support the free speech of all groups , he said .
-1 2763517 2763576 Terri Schiavo , 39 , underwent the procedure at the Tampa Bay area hospice where she has been living for several years , said her father , Bob Schindler . The tube was removed Wednesday from Terri Schiavo , 39 , at the Tampa Bay-area hospice where she has lived for several years .
-0 3107118 3107136 After 18 months , Nissen found that Lipitor stopped plaque buildup in the patients ' arteries . After 18 months , the atorvastatin patients had no change in the plaque in their arteries .
-1 780604 780466 Toll , Australia 's second-largest transport company , last week offered NZ75 a share for Tranz Rail . Toll last week offered to buy the company for NZ75c a share , or $ NZ158 million .
-0 1989213 1989116 " This child was literally neglected to death , " Armstrong County District Attorney Scott Andreassi said . Armstrong County District Attorney Scott Andreassi said the many family photos in the home did not include Kristen .
-1 1462409 1462504 Wal-Mart , the nation 's largest private employer , has expanded its antidiscrimination policy to protect gay and lesbian employees , company officials said Tuesday . Wal-Mart Stores Inc . , the nation 's largest private employer , will now include gays and lesbians in its anti-discrimination policy , company officials said Wednesday .
-1 260952 260924 Metro , bus and local rail services in France 's four largest towns -- Paris , Lyon , Lille and Marseille -- were severely disrupted , Europe 1 radio reported . Subway , bus and suburban rail services in France 's four largest cities -- Paris , Lyon , Lille and Marseille -- were severely disrupted , transport authorities said .
-1 1224743 1225510 In the undergraduate case , Rehnquist said the use of race was not " narrowly tailored " to achieve the university 's asserted interest in diversity . Rehnquist wrote that the system was not narrowly tailored to achieve the interest in educational diversity .
-0 3329379 3329416 SP2 is basically about security enhancements to Windows , such as the improved Internet Connection Firewall ( ICF ) . The firewall in the current Windows XP was known as the Internet Connection Firewall ( ICF ) .
-1 2362761 2362698 A landslide in central Chungchong province derailed a Seoul-bound train and 28 passengers were injured , television said . In central Chungchong province , a landslide caused a Seoul-bound Saemaeul Express train to derail , injuring 28 people , local television said .
-0 1465073 1464854 They will help draft a plan to attack obesity that Kraft will implement over three to four years . The team will help draft a plan by the end of the year to attack obesity .
-1 195728 196099 But that amount would probably be impossible to pass in the Senate , where Republican moderates have refused to go above $ 350 billion . Such an amount would probably be unable to summon a majority of the Senate , where Republican moderates have refused to go above $ 350 billion .
-1 2587767 2587673 In the clash with police , Lt. Mothana Ali said about 1,000 demonstrators had gone to the station demanding jobs . In Baghdad , police Lieut . Mothana Ali said about 1,000 demonstrators arrived at the station demanding jobs .
-0 1490044 1489975 Corixa shares rose 54 cents to $ 7.74 yesterday on the Nasdaq Stock Market . Shares of Corixa rose 54 cents , or about 8 percent , to close at $ 7.74 .
-1 958161 957782 Committee approval , expected today , would set the stage for debate on the Senate floor beginning Monday . That would clear the way for debate in the full Senate beginning on Monday .
-1 1033204 1033365 O 'Brien was charged with leaving the scene of a fatal accident , a felony . Bishop Thomas O 'Brien , 67 , was booked on a charge of leaving the scene of a fatal accident .
-0 2996241 2996734 Tom Hamilton said his daughter was conscious and alert and in stable condition after the attack Friday morning . Bethany , who remained in stable condition after the attack Friday morning , talked of the attack Saturday .
-0 2015389 2015410 The Calgary woman , who is in her twenties , donated blood on Aug. 7 . The woman -- who has no symptoms of illness -- donated blood Aug. 7 .
-1 221515 221509 Quattrone lawyer John W. Keker said his client is innocent . In a statement Monday , his lawyer John Keker said ``Frank Quattrone is innocent .
-0 2283737 2283794 In the weeks leading up to the execution , several Florida officials received anonymous threatening letters . Several Florida officials connected to the case have received threatening letters , accompanied by rifle bullets .
-1 2826681 2826474 The disagreement over online music sales was disclosed in documents filed last week with the judge and made available by the court yesterday . The fight over online music sales was disclosed in documents made available Monday by the court .
-1 2249237 2249305 Parson was charged with intentionally causing and attempting to cause damage to protected computers . Parson is charged with one count of intentionally causing damage to a protected computer .
-1 389239 389299 " The court and the public need to know much more of the details of the defendant 's seemingly massive fraud , " the judge said . " The court and the public need to know more of the defendants ' seemingly massive fraud , " he said .
-1 2652187 2652218 The U.S. Supreme Court will hear arguments on Wednesday on whether companies can be sued under the Americans with Disabilities Act for refusing to rehire rehabilitated drug users . The high court will hear arguments today on whether companies can be sued under the ADA for refusing to rehire rehabilitated drug users .
-1 2945693 2945847 The IRS said taxpayers can avoid undelivered checks by having refunds deposited directly into their checking or savings accounts . The IRS said taxpayers can avoid problems with lost or stolen refunds by having refunds deposited directly into personal checking or savings accounts .
-1 2065523 2065836 " More than 70,000 men and women from bases in Southern California were deployed in Iraq . In all , more than 70,000 troops based in Southern California were deployed to Iraq .
-1 2222998 2223097 BP shares slipped 0.8 percent to 433.50 pence ( $ 6.85 ) each in afternoon trading on the London Stock Exchange . BP shares slipped 48 cents to $ 41.72 Friday in trading on the New York Stock Exchange .
-1 2561999 2561941 Because of the accounting charge , the company now says it lost $ 1.04 billion , or 32 cents a share , in the quarter ended June 30 . Including the charge , the Santa Clara , Calif.-based company said Monday it lost $ 1.04 billion , or 32 cents per share , in the period ending June 30 .
-0 2324704 2325023 Friday 's report raised new worries that a weak job market could shackle the budding economic recovery despite a slight improvement in the overall unemployment rate . U.S. companies slashed payrolls for a seventh straight month in August , raising new worries that a weak jobs market could shackle the budding economic recovery .
-1 2336453 2336545 Federal Emergency Management Administration designated $ 20 million to establish the registry . The registry was launched with $ 20 million from the Federal Emergency Management Agency .
-1 720572 720486 BREAST cancer cases in the UK have hit an all-time high with more than 40,000 women diagnosed with the disease each year , Cancer Re-search UK revealed yesterday . Cases of breast cancer in Britain have reached a record high , with the number of women diagnosed with the disease passing the 40,000 mark for the first time .
-1 1605818 1605806 " It was never our intention to sell the product , " said Health Minister Anne McClellan , a skeptic of medical marijuana use . " It was never the intention of us to sell product , " federal Health Minister Anne McLellan said yesterday in Edmonton .
-0 2440680 2440474 GM , the world 's largest automaker , has 115,000 active UAW workers and another 340,000 retirees and spouses . They cover more than 300,000 UAW workers and 500,000 retirees and spouses .
-0 726399 726078 Rosenthal is hereby sentenced to custody of the Federal Bureau of prisons for one day with credit for time served , " Breyer said to tumultuous cheers in the courtroom . " Rosenthal is hereby sentenced to custody of the Federal Bureau of Prisons for one day with credit for time served . "
-1 533903 533818 " We are committed to helping the Iraqi people get on the path to a free society , " Rumsfeld said in a speech to the Council on Foreign Relations . " We are committed to helping the Iraqi people get on the path to a free society , " he said .
-1 1166473 1166857 Mr. Young said he was disappointed that the government didn 't see the severe acute respiratory syndrome crisis as worthy of federal disaster-relief money . Young said he was disappointed the government didn 't see the SARS crisis as worthy of federal disaster relief money .
-1 144089 143697 The 12-nation currency has risen by 33 percent against the dollar over the past 15 months . The euro is up 9 percent against the dollar in the past six weeks .
-1 3439854 3439874 In February 2000 , the officers — Kenneth Boss , Sean Carroll , Edward McMellon and Richard Murphy — were acquitted of all charges in the killing . The officers -- Kenneth Boss , Sean Carroll , Edward McMellon and Richard Murphy -- were acquitted in 2000 of state murder charges .
-1 3464314 3464302 I was surprised it turned out me talking and the president just listening . " I was surprised it turned out me talking and the president just listening . . . It was mostly a monologue . "
-1 2008984 2009175 The state 's House delegation currently consists of 17 Democrats and 15 Republicans . Democrats hold a 17-15 edge in the state 's U.S. House delegation .
-0 816867 816831 Freddie also said Leland C. Brendsel will retire as chairman and chief executive and resign from the board . He replaces Leland Brendsel , 61 , who retired as chairman and chief executive .
-1 192285 192327 We 'll be listening carefully to the [ IAEA ] director general 's report at the next board meeting . " We 'll be listening carefully to the ( IAEA ) director-general 's report at the next board meeting . "
-1 2688145 2688162 In that position , Elias will report to Joe Tucci , president and CEO of EMC . As executive vice president of new ventures , Elias will report to Joe Tucci , EMC 's president and chief executive .
-1 3294207 3294290 But with the PM due to leave tomorrow afternoon for personal reasons there was a risk he might not be present when the final decision was made . But with the Prime Minister due to leave tomorrow , a day early , he may not be present when the final decision is made .
-0 205100 205145 A pro-independence radical , Miodrag Zivkovic , of the Liberal Alliance , came in second with 31 percent of the vote . Miodrag Zivkovic , of the Liberal Alliance of Montenegro , won 31 percent of the vote while the independent Dragan Hajdukovic got four percent .
-0 3242051 3241897 Mr. Kerkorian tried unsuccessfully to take over Chrysler in 1995 , but did win representation on its board . Kerkorian and Tracinda had also tried to take over Chrysler in 1995 .
-0 1076861 1077018 Glover spoke at a news conference that included about 20 relatives of the victims . About 20 family members of the victims were invited to the news conference .
-1 2095803 2095786 Drax faced a financial crisis late last year after it lost its most lucrative sales contract , held with insolvent utility TXU Europe . Drax ’ s troubles began late last year when it lost its most lucrative sales contract , with the insolvent utility TXU Europe .
-1 2112330 2112376 But I would rather be talking about high standards than low standards . " " I would rather be talking about positive numbers rather than negative .
-1 3389318 3389271 It was not immediately known how many people were on flight UTA 141 , which could carry 141 passengers and crew . It was still not known exactly how many people were on the plane , which could carry 141 passengers and crew .
-1 698948 698933 The market remains pinned in a narrow range after a powerful rally drove the broad Standard & Poor 's 500 index .SPX up more than 20 percent since mid-March . The market remains pinned in a narrow range after a powerful rally pushed the broad S & P 500 index up more than 20 percent since mid-March .
-1 539585 539355 Witnesses said they believed the man planned to crash the Launceston-bound Qantas flight 1737 , which was carrying 47 passengers and six crew . Witnesses believe he wanted to crash Flight 1737 , which had 47 passengers and six crew .
-1 684848 684557 As Samudra sat down to hear the indictment , he looked over to his nine lawyers and shouted ``God is Great ' ' three times . As he sat down to hear the indictment , Samudra looked over to his nine lawyers and shouted " Takbir ! " , or " Proclaim ! " , a religious rallying cry .
-1 347017 347002 In hardest-hit Taipei , traffic has disappeared from once bustling streets , ubiquitous department stores stand mostly empty and restaurants are eerily quiet . In hardest-hit Taipei , traffic has disappeared from once-bustling streets and department stores and restaurants are virtually empty .
-1 1592037 1592076 In a statement , Lee said he " no longer believes that Viacom deliberately intended to trade on my name when naming Spike TV . " Spike Lee no longer believes that Viacom deliberately intended to trade on his name by calling its own venture " Spike TV , " according to a statement read in court Tuesday .
-0 3013483 3013540 Singapore Prime Minister Goh Chok Tong says China plays an important role in the integration of Asia , including managing the stresses and strains both within and between countries . HAINAN PROVINCE , China : Singapore Prime Minister Goh Chok Tong said China plays an important role in the integration of Asia .
-1 2020252 2020081 The worm attacks Windows computers via a hole in the operating system , an issue Microsoft on July 16 had warned about . The worm attacks Windows computers via a hole in the operating system , which Microsoft warned of 16 July .
-0 2614947 2614904 The premium edition adds OfficeFront Page 2003 , Acceleration Server 2000 , and SQL Server 2000 . The premium edition adds ISA Server , SQL Server and a specialized edition of BizTalk 2004 .
-0 1744257 1744378 In the year-ago quarter , the steelmaker recorded a profit of $ 16.2 million , or 15 cents per share , on sales of $ 1.14 billion . In the second quarter last year , AK Steel reported a profit of $ 16.2 million , or 15 cents a share .
-0 1119721 1119714 Sony claimed that the reader 's capacitance sensing technology cannot be fooled by paper copies and does not require cleaning . Its capacitance sensing technology electronically reads a fingerprint ; Sony says it can 't be fooled by paper copies and doesn 't require cleaning .
-1 1186754 1187056 Amazon.com shipped out more than a million copies of the new book , making Saturday the largest distribution day of a single item in e-commerce history . Amazon.com shipped more than a million copies by Saturday afternoon , making Saturday the largest distribution day of a single item in e-commerce history .
-1 2842562 2842582 The show 's closure affected third-quarter earnings per share by a penny . The company said this impacted earnings by a penny a share .
-0 431076 431242 After the two-hour meeting on May 14 , publisher Arthur O. Sulzberger Jr . , executive editor Howell Raines and managing editor Gerald Boyd pledged quick remedies to staff grievances . The committee will make recommendations to Publisher Arthur Sulzberger , Executive Editor Howell Raines and Managing Editor Gerald Boyd .
-1 1393764 1393984 It 's been a busy couple of days for security gurus assigned to keep their companies safe and sound . It 's been a busy couple of days for enterprise security gurus tasked with the job of keeping their companies safe and sound .
-0 2916199 2916164 Lu reclined in a soft chair wearing a woolly coat near the blackened capsule . " It 's great to be back home , " said Lu , dressed in a woolly coat near the blackened capsule .
-1 2530671 2530542 Gov. Bob Riley proposed the budget cuts after Alabama voters rejected his $ 1.2 billion tax plan Sept . 9 . After Alabama voters rejected his $ 1.2 billion tax plan Sept . 9 , Riley forecast significant cuts in state programs .
-1 219064 218969 " It is probably not the easiest time to come in and take over the shuttle program , but then again , I look forward to the challenge , " he said . " It 's probably not the easiest time to come in and take over the shuttle program , but I look forward to the challenge , " Parsons told reporters at NASA headquarters .
-0 2377289 2377259 Estonia 's place in the European mainstream and safeguard its independence regained in 1991 . Estonia was forcibly incorporated in the Soviet Union in 1940 and regained its independence only in 1991 .
-0 2110220 2110199 Franklin County Judge-Executive Teresa Barton said a firefighter was struck by lightning and was taken to the Frankfort Regional Medical Center . A county firefighter , was struck by lightning and was in stable condition at Frankfort Regional Medical Center .
-0 1864253 1863810 Police suspected that Shaichat , 20 , had been abducted either by Palestinians or by Israeli Arabs . Nobody claimed responsibility for Schaichat 's death , but police suspect that the 20-year-old soldier was abducted either by Palestinians or Israeli Arabs .
-0 3150803 3150839 During this year 's August to October quarter , Lowe 's opened 38 new stores , including two relocations . During the third quarter , Lowe 's opened 38 new stores and now has 932 stores in 45 states .
-0 969381 969512 The technology-laced Nasdaq Composite Index < .IXIC > declined 25.78 points , or 1.56 percent , to 1,627.84 . The broader Standard & Poor 's 500 Index .SPX gave up 11.91 points , or 1.19 percent , at 986.60 .
-1 271891 271839 Sony said the PSP would also feature a 4.5-inch LCD screen , Memory Stick expansion slots . It also features a 4.5 in back-lit LCD screen and memory expansion facilities .
-0 2829648 2829613 Clinton did not mention that two Democratic senators , Charles Robb of Virginia and Wendell Ford of Kentucky , voted to shelve the McCain bill . Two Democrats , Sen. Charles Robb of Virginia and Wendell Ford of Kentucky , voted with the 40 Republicans .
-1 886904 887158 Some of the company 's software developers will join Microsoft , but details haven 't been finalized , said Mike Nash , corporate vice president of Microsoft 's security business unit . Some of the companys software developers will join Microsoft , but details havent been finalized , said Mike Nash , corporate vice president of Microsofts security business unit .
-0 2632692 2632767 Wal-Mart has said it plans to open at least 40 Supercenters in the state in the coming years ; analysts expect four or more to be in San Diego County . At least 40 of the outlets will be in California , and analysts expect four or more to be in San Diego County .
-1 2240399 2240149 Cintas is battling efforts to unionize 17,000 of its workers and to let unions organize the workers by signing cards , rather than by a lengthy election process . Cintas is battling efforts to unionize 17,000 of its workers and labor 's demands to let its workers organize by signing cards , rather than by a lengthy election process .
-1 805457 805985 The opposition would resort to rolling mass action " at strategic times of our choice and without warning to the dictatorship , " he said . " From now onwards we will embark on rolling mass action at strategic times of our choice and without any warning to the dictatorship , " he said .
-1 2896308 2896334 Federal Agriculture Minister Warren Truss said the Government still did not know the real reason the sheep were rejected at the Saudi port of Jeddah on August 21 . He said the Government still did not know the real reason the original Saudi buyer pulled out on August 21 .
-1 2110775 2110924 Tom Kraynak , manager of operations and resources for the Canton , Ohio-based East Central Area Reliability Council , said that scenario is one among many that investigators are considering . Tom Kraynak , manager of operations and resources for the Canton , Ohio-based East Central Area Reliability Council , said investigators are considering the scenario .
-1 1762569 1762526 Hester said Sanmina was the best fit among several purchase offers the company received from electronics manufacturers and computer makers . Hester said Sanmina 's offer was the best among several Newisys received from electronics manufacturers and computer makers .
-0 2706154 2706185 The other inmate fell but Selenski shimmed down the makeshift rope to a second-story roof and used the mattress to scale a razor-wire fence , Fischi said . After the other inmate fell , Selenski used the mattress to scale a 10-foot , razor-wire fence , Fischi said .
-1 1057995 1057778 The hearing , expected to last a week , will determine whether Akbar faces a court-martial . The purpose of the hearing is to determine whether Akbar should be court-martialled .
-1 1386884 1386857 He said he has begun a court action to seize Beacon Hill 's assets and has frozen more than $ 13 million Beacon Hill had when it closed . He said he has initiated a forfeiture action in court and frozen more than $ 13 million Beacon Hill had when it closed .
-1 3093023 3092996 Speaking for the first time yesterday , Brigitte 's maternal aunt said his family was unaware he had was in prison or that he had remarried . Brigitte 's maternal aunt said his family was unaware he had been sent to prison , or that he had remarried in Sydney .
-1 1661381 1661317 " Close co-operation between our law enforcement agencies , close co-operation between our intelligence services lie at the heart of the ongoing fight against terrorism . " Close cooperation between regional law enforcement agencies and intelligence services was at the heart of the fight against terrorism , he said .
-0 2926039 2925982 The mother of a Briton held by Colombian guerrillasspoke of her relief yesterday after hearing that he might be freed in the next few weeks . The parents of a Briton being held hostage by Colombian rebels spoke yesterday of their optimism that he would be freed in time for his birthday next month .
-0 637168 637447 We strongly disagree with Novell 's position and view it as a desperate measure to curry favor with the Linux community . McBride characterized Novell 's move as " a desperate measure to curry favor with the Linux community . "
-1 696677 696932 After more than two years ' detention under the State Security Bureau , the four were found guilty of subversion in Beijing 's No. 1 Intermediate Court last Wednesday . After more than two years in detention by the State Security Bureau , the four were found guilty last Wednesday of subversion .
-1 3122429 3122305 Mr Russell , 46 , a coal miner from Brisbane , said : " They are obviously hurting , so we are basically going over there to help them . " " They are obviously hurting so we are basically going over there to help them , " Russell , 46 , said .
-1 1348909 1348954 The New York Democrat and former first lady has said she will not run for the White House in 2004 , but has not ruled out a race in later years . The former first lady has said she will not run for the White House in 2004 but has not ruled out a race later on .
-0 162203 162101 It does not affect the current Windows Media Player 9.0 Series . Windows Media Player has had security problems before .
-0 71501 71627 The seizure took place at 4 a.m. on March 18 , just hours before the first American air assault . The time was about 4 a.m. on March 18 , just hours before the first pinpoint missiles rained down on the capital .
-1 2907762 2907649 Donations stemming from the Sept . 11 attacks helped push up contributions to human service organizations and large branches of the United Way by 15 percent and 28.6 percent , respectively . Donations stemming from the Sept . 11 attacks helped push up contributions to human service organizations by 15 percent and to large branches of the United Way by 28.6 percent .
-1 2167771 2167744 In May , Mr. Hatfill said he was struck by a vehicle being driven by an FBI employee who was tailing him in Georgetown . Last May , Hatfill was struck by a vehicle being driven by an FBI employee who was tailing him in Washington 's Georgetown neighborhood .
-1 3320577 3320553 " I will support a constitutional amendment which would honor marriage between a man and a woman , codify that , " he said . " If necessary , I will support a constitutional amendment which would honour marriage between a man and a woman , codify that . "
-1 849291 849442 IBM of the US and Infineon Technologies of Germany will today announce a technological development that could threaten multi-billion dollar memory chip markets . IBMof the US andInfineon Technologies of Germany willon Tuesdayannounce a technological development that could threaten multi-billion dollar memory chip markets .
-0 763948 763991 Costa 's semifinal opponent is Spaniard Juan Carlos Ferrero , whom he beat in last year 's final . Costa will play Juan Carlos Ferrero next in a rematch of last year 's final .
-1 1908763 1908744 A former employee of a local power company pleaded guilty Wednesday to setting off a bomb that knocked out a power substation during the Winter Olympics last year . A former Utah Power meter reader pleaded guilty Wednesday to bombing a power substation during the 2002 Winter Olympics .
-0 1876120 1876059 Thyroid hormones are known to help in weight loss by stimulating metabolism - and cutting cholesterol - but come with the unwanted side effect of speeding up the heartbeat . Thyroid hormones are known to help in weight loss by stimulating metabolism , and they can help cut cholesterol too .
-1 518089 518133 Judge Craig Doran said it wasn 't his role to determine if Hovan was " an evil man " but maintained that " he has committed an evil act . " Judge Craig Doran said he couldn 't determine if Hovan was " an evil man " but said he " has committed an evil act . "
-0 224932 224868 The Hartford shares rose $ 2.88 , or 6.6 percent , to close Monday at $ 46.50 on the New York Stock Exchange . Shares of Hartford rose $ 2.88 to $ 46.50 in New York Stock Exchange composite trading .
-1 1771131 1771091 It also offers a built-in NAND flash boot loader so that high-density NAND flash memory can be used without having to install an additional support chip . The S3C2440 has a built-in NAND flash boot loader , for example , so that high-density NAND flash memory can be installed without an additional support chip .
-0 2728425 2728251 It decided instead to issue them before the stock market opened Monday after the downgrade of its debt late Friday by Moody 's , the credit rating agency . It decided instead to issue them before the stock market opened Monday to counteract the downgrade of its debt late Friday by Moody 's to one step above junk status .
-0 953733 953537 Altria shares fell 2.5 percent or $ 1.11 to $ 42.57 and were the Dow 's biggest percentage loser . Its shares fell $ 9.61 to $ 50.26 , ranking as the NYSE 's most-active issue and its biggest percentage loser .
-1 349215 349241 It will be followed in November by a third movie , " The Matrix Revolutions . " The film is the second of a trilogy , which will wrap up in November with " The Matrix Revolutions . "
-1 2919853 2919804 Massachusetts regulators and the Securities and Exchange Commission on Tuesday pressed securities fraud charges against Putnam Investments and two of its former portfolio managers for alleged improper mutual fund trading . State and federal securities regulators filed civil charges against Putnam Investments and two portfolio managers in the ever-expanding mutual fund trading scandal .
-1 954526 954607 He is blocking them until the Air Force assigns four additional C-130 cargo planes to Gowen Field , an Idaho Air National Guard base in Boise . He is holding them up until the Air Force agrees to assign four additional C-130 cargo planes to the Idaho Air National Guard .
-1 69773 69792 Cisco pared spending to compensate for sluggish sales . In response to sluggish sales , Cisco pared spending .
-0 2823575 2823513 The study , published Monday in the journal Molecular Brain Research , is likely to also apply to humans , its authors said . The study , conducted on the brains of developing mice , was being published today in the journal Molecular Brain Research .
-1 2455942 2455978 My decision today is not based on any one event . " Governor Rowland said his decision was " not based on any one event . "
-1 131979 131957 Nelson , 27 , is being retried on civil-rights charges stemming from the disturbance which led to Rosenbaum 's death . Nelson , 27 , is being retried on civil rights charges stemming from the disturbance that led to Rosenbaum 's death .
-0 2010705 2010779 " The government elements who have been causing trouble are still in place . The government elements who have been causing trouble are still in place , they are attacking us . "
-1 54142 53641 Next Monday at about 2 p.m. ( CST ) , hospital officials in and near Chicago will notice a sudden increase in people complaining of flu-like symptoms . Around the same time , hospital officials in and near Chicago will notice a sudden increase in people complaining of flu-like symptoms .
-1 1015249 1015204 Wal-Mart Stores Inc . , Kohl 's Corp. , Family Dollar Stores Inc. and Big Lots Inc. were among the merchants posting May sales that fell below Wall Street 's modest expectations . Wal- Mart , Kohl 's Corp. , Family Dollar Stores Inc . , and Big Lots Inc. posted May sales that fell below Wall Street 's modest expectations .
-0 753928 753890 The patch also fixes a vulnerability that results because IE does not implement an appropriate block on a file download dialog box . The second vulnerability is a result of IE not implementing a block on a file download dialog box .
-1 3022833 3023029 Peterson , a former fertilizer salesman , is charged with murder in the deaths of his 27-year-old wife and the baby boy she was carrying . Peterson , 31 , is now charged with murder in the deaths of his 27-year-old wife and their unborn son .
-0 751520 751373 SPOT products run a Microsoft operating system and the company 's DirectBand radio technology developed with SCA Data Systems . The DirectBand network was developed with the assistance of SCA Data Systems .
-0 218848 218851 He replaces Ron Dittemore , who announced his resignation in April . Dittemore announced his plans to resign on April 23 .
-1 3181118 3181443 Detectives told Deasean 's father , Stelly Chisolm , a college student , and mother , Kimberly Hill , of the arrest shortly after Perry was apprehended . Shortly after his arrest , detectives told Deasean 's father , Stelly Chisolm , a college student , and mother , Kimberly Hill , a medical assistant , about the development .
-1 515581 515752 They were among about 40 people attending the traditional Jewish ceremony colored by some non-traditional touches . He said about 40 people attended the traditional Jewish ceremony colored by some nontraditional touches .
-1 347022 347003 Taiwan had been relatively free of the viral infection until a fiasco at a Taipei hospital in late April caused the number of infections to skyrocket . Taiwan had been relatively free of the viral infection until a severe outbreak at a Taipei hospital in late April .
-1 3311600 3311633 Mr. Rowland attended a party in South Windsor for the families of Connecticut National Guard soldiers called to active duty . Rowland was making an appearance at a holiday party for families of Connecticut National Guard soldiers assigned to duty in Iraq and Afghanistan .
-0 3439114 3439084 Ross Garber , Rowland 's lawyer , said Tuesday he would attend the meeting and would ask to speak on the issue . Ross Garber , Rowland 's legal counsel , said the governor would have no comment on the condo deal .
-0 487951 488007 The euro was at 1.5281 versus the Swiss franc EURCHF = , up 0.2 percent on the session , after hitting its highest since mid-2001 around 1.5292 earlier in the session . The euro was steady versus the Swiss franc after hitting its highest since mid-2001 of 1.5261 earlier in the session .
-0 314997 315030 On the stand Wednesday , she said she was referring only to the kissing . On the stand Wednesday , she testified that she was referring to the kissing before the alleged rape .
-0 4733 4557 Garner said the group would probably be expanded to include , for example , a Christian and perhaps another Sunni leader . The group has already met several times and Gen. Garner said it probably will be expanded to include a Christian and perhaps another Sunni Muslim leader .
-1 2820371 2820525 Blair 's Foreign Secretary Jack Straw was to take his place on Monday to give a statement to parliament on the European Union . Blair 's office said his Foreign Secretary Jack Straw would take his place on Monday to give a statement to parliament on the EU meeting the prime minister attended last week .
-1 801552 801516 " There were more people surrounding the clubhouse than the Unabomber 's house up in the hills , " Baker said . " There are more people surrounding the clubhouse than surrounded the Unabomber 's home in the hills .
-1 1704987 1705268 Charles O. Prince , 53 , was named as Mr. Weill 's successor . Mr. Weill 's longtime confidant , Charles O. Prince , 53 , was named as his successor .
-1 396041 396188 Officials are also meeting with the International Organization for Epizootics ( OIE ) , which establishes animal-health standards for the world . Canadian officials were also expected to meet yesterday with the International Organization for Epizootics ( OIE ) , which establishes animal-health standards for the world .
-0 1014983 1014963 GE stock closed Friday at $ 30.65 a share , down about 42 cents , on the New York Stock Exchange . GE 's shares closed at $ 30.65 on Friday on the New York Stock Exchange .
-1 2320654 2320666 The Midwestern research center will focus on the development of diagnostic , therapeutic and vaccine products for anthrax , botulism , tularemia , hemorrhagic fever viruses and plague . The Midwestern center will focus on diagnosis , treatment and vaccines for anthrax , botulism , tularemia , hemorrhagic fever viruses and plague .
-1 1057876 1057778 The hearing is to determine whether there is enough evidence to order Akbar to a general court-martial proceeding . The purpose of the hearing is to determine whether Akbar should be court-martialled .
-0 2116843 2116883 In the United States , heart attacks kill about 460,000 year , in Canada about 80,000 . In the United States , heart attacks kill about 460,000 yearly , according to the National Institutes of Health .
-1 1461629 1461781 Ninety-five percent of international cargo to the United States is carried by ship . Ships carry 95 percent of international cargo to the United States .
-0 374015 374162 " It 's a major victory for Maine , and it 's a major victory for other states . The Maine program could be a model for other states .
-1 2493369 2493428 News that oil producers were lowering their output starting in November exacerbated a sell-off that was already under way on Wall Street . News that the Organization of Petroleum Exporting Countries was lowering output starting in November exacerbated a stock sell-off already under way yesterday .
-1 490355 490378 They note that after several weeks of rallies on upbeat earnings , investors are looking for stronger evidence of a recovery before sending stocks higher . After several weeks of market rallies on upbeat earnings , many investors are looking for more concrete signs of an economic recovery .
-1 2691044 2691264 Most economists had expected a more dire report , with many anticipating the fifth month of job losses in six months . Most economists had been expecting a far more dire report , with many expecting to see the fifth month of job losses in six months in September .
-1 1831453 1831491 But software license revenues , a measure financial analysts watch closely , decreased 21 percent to $ 107.6 million . License sales , a key measure of demand , fell 21 percent to $ 107.6 million .
-1 2380695 2380822 King , brand-name writer , master of the horror story and e-book pioneer , is receiving this year 's medal for Distinguished Contributions to American Letters . Stephen King , master of the horror story and e-book pioneer , is receiving this year 's medal for Distinguished Contributions to American Letters from the National Book Foundation .
-1 2577517 2577531 The Denver-based natural gas producer and marketer said the inaccurate reporting was discovered after it received a subpoena from the U.S. Commodity Futures Trading Commission . The natural gas producer and marketer said the inaccurate reporting was discovered in response to a subpoena from the U.S. Commodity Futures Trading Commission , or CFTC .
-1 3267026 3266930 The steel tariffs , which the U.S. president imposed in March 2002 , will officially end at midnight , instead of March 2005 as initially planned . The U.S. steel tariffs , which Bush imposed in March 2002 , were to officially end at midnight Thursday ( 0500 GMT ) , instead of March 2005 as initially planned .
-1 360875 360943 Business Week 's online edition reported on Friday that WorldCom and the SEC could announce a settlement as early as Monday . BusinessWeek Online has learned that the settlement could come as early as Monday , May 19 .
-1 162632 162653 Only one of the five buildings in the Baghdad compound of the United Nations Development Program escaped being burned , the UN said on its Web site . Only one of the five buildings in the compound in Baghdad run by the UN Development Program , escaped being burned , the UN said on its Web site .
-1 1128884 1128865 Shares of Salix have rocketed 64 percent since Axcan made its first offer on April 10 . Since the initial takeover offer , Salix shares have risen about 35 percent .
-1 3264732 3264648 The jury verdict , reached Wednesday after less than four hours of deliberation , followed a 2 week trial , during which Waagner represented himself . The quick conviction followed a 2 1 / 2 week trial , during which the Venango County man represented himself .
-1 1721433 1721267 It 's happened five times in the last 11 years : A disaster puts this Southwestern town in the headlines during the summer tourist season . It 's happened five times in the last decade : A disaster puts this tourist town in the headlines during summer , its busiest season .
-0 146112 146127 The broader Standard & Poor 's 500 Index .SPX edged down 9 points , or 0.98 percent , to 921 . The technology-laced Nasdaq Composite Index < .IXIC > shed 15 points , or 0.98 percent , to 1,492 .
-1 389117 389052 The company emphasized that McDonald 's USA does not import any raw beef or hamburger patties from Canada for McDonald 's use in the United States . McDonald 's said in a statement that it does not import any raw beef or hamburger patties from Canada for use in the United States .
-1 872784 872834 Gregory Parseghian , a former investment banker , was appointed chief executive . Greg Parseghian was appointed the new chief executive .
-0 2977500 2977547 Their contract will expire at 12 : 01 a.m. Wednesday instead of 12 : 01 a.m. Sunday , said Rian Wathen , organizing director for United Food and Commercial Workers Local 700 . " It has outraged the membership , " said Rian Wathen , organizing director of United Food and Commercial Workers Local 700 .
-1 3107137 3107119 But plaque volume increased by 2.7 percent in pravastatin patients . The volume of plaque in Pravachol patients ' arteries rose by 3 % .
-1 1619244 1619274 Today in the US , the book - kept under wraps by its publishers , G. P. Putnam 's Sons , since its inception - will appear in bookstores . Tomorrow the book , kept under wraps by G. P. Putnam 's Sons since its inception , will appear in bookstores .
-0 3061836 3062031 The S & P / TSX composite rose 87.74 points on the week , while the TSX Venture Exchange composite gained 44.49 points . On the week , the Dow Jones industrial average rose 11.56 points , while the Nasdaq Stock Market gained 39.42 points .
-1 485999 486011 Ex-KGB agent Putin added that the Beatles were considered ' propaganda of an alien ideology ' . In Soviet times the Beatles ' music " was considered propaganda of an alien ideology .
diff --git a/docs/examples/sentence_embedding/elmo_sentence_representation.md b/docs/examples/sentence_embedding/elmo_sentence_representation.md
deleted file mode 100644
index 84c8309bdc..0000000000
--- a/docs/examples/sentence_embedding/elmo_sentence_representation.md
+++ /dev/null
@@ -1,165 +0,0 @@
-# Extracting Sentence Features with Pre-trained ELMo
-
-While word embeddings have been shown to capture syntactic and semantic information of words and have become a standard component in many state-of-the-art NLP architectures, their context-free nature limits their ability to represent context-dependent information.
-Peters et al. proposed a deep contextualized word representation method, called Embeddings from Language Models, or ELMo for short [1].
-This model is pre-trained with a self-supervised task called bidirectional language modeling; the authors show that the resulting representations are powerful and improve the state-of-the-art performance on many tasks such as question answering, natural language inference, semantic role labeling, coreference resolution, named-entity recognition, and sentiment analysis.
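-
-Concretely, the bidirectional language model objective maximizes the log-likelihood of each token given both its left and right context; in the notation of [1]:
-
-$$\sum_{k=1}^{N}\Big(\log p(t_k \mid t_1,\ldots,t_{k-1};\,\Theta_x,\overrightarrow{\Theta}_{LSTM},\Theta_s)+\log p(t_k \mid t_{k+1},\ldots,t_N;\,\Theta_x,\overleftarrow{\Theta}_{LSTM},\Theta_s)\Big)$$
-
-where $\Theta_x$ denotes the token representation parameters, $\overrightarrow{\Theta}_{LSTM}$ and $\overleftarrow{\Theta}_{LSTM}$ the forward and backward LSTM parameters, and $\Theta_s$ the softmax parameters shared between the two directions.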
-
-In this notebook, we will show how to leverage the model API in GluonNLP to automatically download the pre-trained ELMo model and generate sentence representations with it.
-
-We will focus on:
-
-1) how to process and transform data so that it can be used with the pre-trained ELMo model, and
-2) how to load the pre-trained ELMo model, and use it to extract representations from the preprocessed data.
-
-## Preparation
-
-We start with the usual preparation like importing libraries and setting up the environment.
-
-### Load MXNet and GluonNLP
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-import numpy as np
-import io
-
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-nlp.utils.check_version('0.7.0')
-```
-
-## Preprocess the data
-
-The goal of pre-processing the data is to numericalize the text using pre-processing steps consistent with how the ELMo model was trained.
-
-The exact same vocabulary needs to be used so that the indices in the model's embedding match the pre-trained model.
-In this section, we will proceed with the following steps:
-
-1) Loading a custom dataset
-2) Tokenizing the dataset in the same way as during ELMo training
-3) Numericalizing the tokens at both the word and character level using the provided `vocab`
-
-### Loading the dataset
-
-The first step is to create a dataset from existing data.
-Here, we use a paragraph from [1] as our dataset, using the built-in [TextLineDataset](../../api/data.rst#gluonnlp.data.TextLineDataset) class.
-It's a dataset of 7 samples, each of which is a sentence.
-
-```{.python .input}
-elmo_intro = """
-Extensive experiments demonstrate that ELMo representations work extremely well in practice.
-We first show that they can be easily added to existing models for six diverse and challenging language understanding problems, including textual entailment, question answering and sentiment analysis.
-The addition of ELMo representations alone significantly improves the state of the art in every case, including up to 20% relative error reductions.
-For tasks where direct comparisons are possible, ELMo outperforms CoVe (McCann et al., 2017), which computes contextualized representations using a neural machine translation encoder.
-Finally, an analysis of both ELMo and CoVe reveals that deep representations outperform those derived from just the top layer of an LSTM.
-Our trained models and code are publicly available, and we expect that ELMo will provide similar gains for many other NLP problems.
-"""
-
-elmo_intro_file = 'elmo_intro.txt'
-with io.open(elmo_intro_file, 'w', encoding='utf8') as f:
- f.write(elmo_intro)
-
-dataset = nlp.data.TextLineDataset(elmo_intro_file, 'utf8')
-print(len(dataset))
-print(dataset[2]) # print an example sentence from the input data
-```
-
-### Transforming the dataset
-
-Once we have a dataset consisting of sentences in raw text form, the next step is to transform
-the dataset into the format that the ELMo model expects and on which it was trained.
-
-In our case, transforming the dataset consists of tokenization and numericalization.
-
-#### Tokenization
-
-The ELMo pre-trained models are trained on the Google 1-Billion Words dataset, which was tokenized with the Moses Tokenizer.
-In GluonNLP, using [SacreMosesTokenizer](../../api/data.rst#gluonnlp.data.SacreMosesTokenizer) accomplishes this.
-Once tokenized, we can add special markers for the beginning and end of each sentence: BOS (beginning of sentence) and EOS (end of sentence).
-
-```{.python .input}
-tokenizer = nlp.data.SacreMosesTokenizer()
-dataset = dataset.transform(tokenizer)
-dataset = dataset.transform(lambda x: ['<bos>'] + x + ['<eos>'])
-print(dataset[2]) # print the same tokenized sentence as above
-```
-
-
-#### Using Vocab from pre-trained ELMo
-
-Numericalizing the dataset is as straightforward as applying the ELMo-specific character-level
-vocabulary as a transformation. For details on ELMo's vocabulary, see
-[ELMoCharVocab](../../api/vocab.rst#gluonnlp.vocab.ELMoCharVocab).
-We also calculate the length of each sentence in preparation for batching.
-
-```{.python .input}
-vocab = nlp.vocab.ELMoCharVocab()
-dataset = dataset.transform(lambda x: (vocab[x], len(x)), lazy=False)
-```
-
-#### Creating the `DataLoader`
-
-Now that the dataset is ready, loading it with the `DataLoader` is straightforward.
-Here, we pad the first field to the maximum length in the batch, and stack the actual length of each sentence to form
-batches.
-The lengths will later be used to construct a mask.
-For more advanced usage examples of the DataLoader object, check out the
-[Sentiment Analysis tutorial](../sentiment_analysis/sentiment_analysis.ipynb).
-
-```{.python .input}
-batch_size = 2
-dataset_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Stack())
-data_loader = gluon.data.DataLoader(dataset,
- batch_size=batch_size,
- batchify_fn=dataset_batchify_fn)
-```
-
-## Loading the pre-trained ELMo model
-
-Using the model API in GluonNLP, you can automatically download the pre-trained models simply by
-calling `get_model`. The available options are:
-
-1. elmo_2x1024_128_2048cnn_1xhighway
-2. elmo_2x2048_256_2048cnn_1xhighway
-3. elmo_2x4096_512_2048cnn_2xhighway
-
-Note that the second field in `get_model`'s return value is ELMo's vocabulary.
-Since we already created an instance of it above, here we simply ignore this field.
-
-```{.python .input}
-elmo_bilm, _ = nlp.model.get_model('elmo_2x1024_128_2048cnn_1xhighway',
- dataset_name='gbw',
- pretrained=True,
- ctx=mx.cpu())
-print(elmo_bilm)
-```
-
-## Putting everything together
-
-Finally, we feed the prepared data batch into the [ELMoBiLM](../../api/model.rst#gluonnlp.model.ELMoBiLM) model.
-
-```{.python .input}
-def get_features(data, valid_lengths):
-    length = data.shape[1]
-    # Initialize the biLM's hidden states for this batch
-    hidden_state = elmo_bilm.begin_state(mx.nd.zeros, batch_size=batch_size)
-    # Build a (batch_size, length) mask: 1 for real tokens, 0 for padding positions
-    mask = mx.nd.arange(length).expand_dims(0).broadcast_axes(axis=(0,), size=(batch_size,))
-    mask = mask < valid_lengths.expand_dims(1).astype('float32')
-    # Run the ELMo biLM; the output is a list with one representation per layer
-    output, hidden_state = elmo_bilm(data, hidden_state, mask)
- return output
-
-batch = next(iter(data_loader))
-features = get_features(*batch)
-print([x.shape for x in features])
-```
-
-## Conclusion and summary
-
-In this tutorial, we showed how to generate sentence representations from the ELMo model.
-In GluonNLP, this can be done in just a few simple steps: reusing ELMo's data transformations to preprocess the data, automatically downloading the pre-trained model, and feeding the transformed data into the model.
-To see how to plug pre-trained models into your own model architecture and use fine-tuning to improve downstream tasks, check out our [Sentiment Analysis tutorial](../sentiment_analysis/sentiment_analysis.ipynb).
-
-## References
-
-[1] Peters, Matthew E., et al. "Deep contextualized word representations." NAACL (2018).
diff --git a/docs/examples/sentence_embedding/sentences.json b/docs/examples/sentence_embedding/sentences.json
deleted file mode 100644
index 1369580dfa..0000000000
--- a/docs/examples/sentence_embedding/sentences.json
+++ /dev/null
@@ -1,38 +0,0 @@
-[
- [
- "The U.S. Centers for Disease Control and Prevention initially advised school systems to close if outbreaks occurred , then reversed itself , saying the apparent mildness of the virus meant most schools and day care centers should stay open , even if they had confirmed cases of swine flu .",
- "When Ms. Winfrey invited Suzanne Somers to share her controversial views about bio-identical hormone treatment on her syndicated show in 2009 , it won Ms. Winfrey a rare dollop of unflattering press , including a Newsweek cover story titled \" Crazy Talk : Oprah , Wacky Cures & You . \"",
- "Elk calling -- a skill that hunters perfected long ago to lure game with the promise of a little romance -- is now its own sport .",
- "Don 't !",
- "Fish , ranked 98th in the world , fired 22 aces en route to a 6-3 , 6-7 ( 5 \/ 7 ) , 7-6 ( 7 \/ 4 ) win over seventh-seeded Argentinian David Nalbandian .",
- "Why does everything have to become such a big issue ?",
- "AMMAN ( Reuters ) - King Abdullah of Jordan will meet U.S. President Barack Obama in Washington on April 21 to lobby on behalf of Arab states for a stronger U.S. role in Middle East peacemaking , palace officials said on Sunday .",
- "To help keep traffic flowing the Congestion Charge will remain in operation through-out the strike and TfL will be suspending road works on major London roads wherever possible .",
- "If no candidate wins an absolute majority , there will be a runoff between the top two contenders , most likely in mid-October .",
- "Authorities previously served search warrants at Murray 's Las Vegas home and his businesses in Las Vegas and Houston ."
- ],
- [
- "Brent North Sea crude for November delivery rose 84 cents to 68.88 dollars a barrel .",
- "That seems to have been their model up til now .",
- "Gordon will join Luol Deng on the GB team ; their respective NBA teams , the Detroit Pistons and the Chicago Bulls , play tonight .",
- "Nikam maintains the attacks were masterminded by the Muslim militant group Lashkar-e-Taiba .",
- "Last year , Williams was unseeded , ranked 81st and coming off one of her worst losses on tour -- in a Tier 4 event at Hobart -- yet she beat six seeded players en route to the title at Melbourne Park .",
- "It said that two officers involved in the case had been disciplined .",
- "\" There is more intelligence now being gathered , \" the official said , adding that such efforts would continue for some time .",
- "The majority will be of the standard 6X6 configuration for carrying personnel .",
- "\" Consequently , necessary actions may not be taken to reduce the risks to children of sexual exploitation and drug or alcohol misuse , \" the report said . \u2022 Almost two-thirds of inspected schools were good or outstanding , but the number of underperforming secondaries remained \" stubborn and persistent . \"",
- "What a World Cup ."
- ],
- [
- "But , there have also been many cases of individuals and small groups of people protesting , as in the case of Rongye Adak , a nomad who called for the return of the Dalai Lama and for the freedom of Tibet during the Lithang Horse Racing Festival , in eastern Tibet .",
- "James Duncan , head of transportation at Bournemouth Borough Council , said : \" Our legal team is reviewing the entitlement of taxis to drop and pick up passengers at bus stops , only for as long as is absolutely necessary to fulfil that function and for no other reason .",
- "To Mo concerning the food log you kept -- Dr. Buchholz recommends the same thing .",
- "The CBO estimates that only 23 percent of that would be spent in 2009 and 2010 .",
- "Even so , Democrats slammed Bush as out of touch .",
- "An information campaign will be launched later to raise awareness of employment rights and how to enforce them .",
- "At the gallery the concept is less vague , as Ms. Piper cites specific instances of racial violence , political assassinations and the devastation of Hurricane Katrina .",
- "There have been some exceptions -- such as Medicare in 1965 .",
- "The government guidance will be reviewed early next year after a period of public comment .",
- "It wasn 't the most seaworthy of prizes ."
- ]
-]
diff --git a/docs/examples/sentiment_analysis/Bi-LSTM-Rep.png b/docs/examples/sentiment_analysis/Bi-LSTM-Rep.png
deleted file mode 100644
index 36fd1e7eb8..0000000000
Binary files a/docs/examples/sentiment_analysis/Bi-LSTM-Rep.png and /dev/null differ
diff --git a/docs/examples/sentiment_analysis/attention-nlp.png b/docs/examples/sentiment_analysis/attention-nlp.png
deleted file mode 100644
index debaab2a68..0000000000
Binary files a/docs/examples/sentiment_analysis/attention-nlp.png and /dev/null differ
diff --git a/docs/examples/sentiment_analysis/index.rst b/docs/examples/sentiment_analysis/index.rst
deleted file mode 100644
index 0e97ace35d..0000000000
--- a/docs/examples/sentiment_analysis/index.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Sentiment Analysis
-==================
-
-.. container:: cards
-
- .. card::
- :title: Fine-tuning LSTM-based Language Model
- :link: sentiment_analysis.html
-
- See how to fine-tune a pre-trained language model to perform sentiment analysis on movie reviews.
-
- .. card::
- :title: Training Structured Self-attentive Sentence Embedding
- :link: self_attentive_sentence_embedding.html
-
- See how to use GluonNLP to build more advanced model structure for extracting sentence
- embeddings to predict Yelp review rating.
-
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- sentiment_analysis.ipynb
- self_attentive_sentence_embedding.ipynb
-
diff --git a/docs/examples/sentiment_analysis/samodel-v3.png b/docs/examples/sentiment_analysis/samodel-v3.png
deleted file mode 100644
index abe56d72dc..0000000000
Binary files a/docs/examples/sentiment_analysis/samodel-v3.png and /dev/null differ
diff --git a/docs/examples/sentiment_analysis/self_attentive_sentence_embedding.md b/docs/examples/sentiment_analysis/self_attentive_sentence_embedding.md
deleted file mode 100644
index ac60e0cea7..0000000000
--- a/docs/examples/sentiment_analysis/self_attentive_sentence_embedding.md
+++ /dev/null
@@ -1,559 +0,0 @@
-# Training Structured Self-attentive Sentence Embedding
-
-Beyond the initial novelty of word embeddings as numerical representations of words, natural language processing (NLP) has continued to improve in many ways. Alongside the widespread use of embedding techniques, many other methods have been developed to express the semantics of a sentence from its words:
-
-1. A vector representation of multiple words in a sentence can be concatenated or weighted to obtain a vector to represent the entirety of a sentence.
-
-2. Applying convolution (CNN) and maximum pooling (MaxPooling) to the matrix of all the word vectors of the sentence, and using the result to represent the sentence as a whole.
-
-3. Unrolling the sentence according to the time step of the word, inputting the vector representation of each word into a recurrent neural network (RNN), and using the output of the last time step of the RNN as the representation of the sentence.
-
-The above methods address sentence meaning, but only to a certain extent. With concatenation in method one, if the sentence is long and the word vectors have even a moderately large dimension, the resulting sentence vector becomes particularly large, and the interactions between the words of the sentence cannot be taken into account. Weighted averaging is not accurate either and does not adequately express the impact of each word on sentence semantics.
-
-In the second method, many useful word meanings may be lost using CNNs and MaxPooling.
-
-In the third method, the representation selected is only the output of the last step. If a sentence is too long, the output of the last step does not accurately express the entirety of the sentence's semantics.
-
-Building on these methods, Zhouhan Lin, Minwei Feng et al. published the paper [A Structured Self-attentive Sentence Embedding](https://arxiv.org/pdf/1703.03130.pdf)[1] in 2017, proposing a novel self-attention-based structure for sentence embedding, with applications to users' review classification, textual entailment, and other NLP tasks, and obtaining better results than the previous methods.
-
-In this tutorial, we will use [GluonNLP](https://gluon-nlp.mxnet.io/index.html) to reproduce the model structure in "A Structured Self-attentive Sentence Embedding" and apply it to [Yelp Data's review star rating data set](https://www.yelp.com/dataset/challenge) for classification.
-
-## Importing necessary packages
-
-The first step, as in every one of these tutorials, is to import the necessary packages.
-
-```{.python .input}
-import os
-import json
-import zipfile
-import time
-import itertools
-
-import numpy as np
-import mxnet as mx
-import multiprocessing as mp
-import gluonnlp as nlp
-
-from mxnet import autograd, gluon, init, nd
-from mxnet.gluon import nn, rnn
-
-# Use sklearn's metric functions to evaluate the results of the experiment
-from sklearn.metrics import accuracy_score, f1_score
-
-# Fix the random number seeds for reproducibility
-np.random.seed(2018)
-mx.random.seed(2018)
-
-def try_gpu():
- """If GPU is available, return mx.gpu(0); else return mx.cpu()."""
- try:
- ctx = mx.gpu()
- _ = nd.array([0], ctx=ctx)
- except:
- ctx = mx.cpu()
- return ctx
-```
-
-## Data pipeline
-
-The next step is to load and format the data according to the requirements of our model. The dataset used in this tutorial is the Yelp users' review dataset.
-
-### Loading the dataset
-
-The [Yelp users' review dataset](https://www.kaggle.com/yelp-dataset/yelp-dataset) is formatted as JSON. The original paper selected 500,000 documents as the training set, 2,000 as the validation set, and 2,000 as the test set. For easier reproducibility of the experiment, we subsampled 198,000 documents from this dataset as the training set and 2,000 documents as the validation set.
-
-Each sample in the data consists of a user's comment, in English, rated one through five stars, with each rating representing the sentiment the user expressed. Here we download, unzip, and reformat the dataset for ease of use further on.
-
-
-```{.python .input}
-# Download the data from the server
-data_url = 'http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/yelp_review_subset-167bb781.zip'
-zip_path = mx.gluon.utils.download(data_url)
-
-# Unzip the zip file
-zip_file = zipfile.ZipFile(zip_path)
-json_path = zip_file.extract(zip_file.namelist()[0])
-
-## Load the JSON data
-with open(json_path, 'r', encoding='utf-8') as fr:
- data = json.load(fr)
-
-# Create a list of (review, label) pairs
-dataset = [[text, int(label)] for text, label in zip(data['texts'], data['labels'])]
-
-# Randomly split off one percent of the training set as a validation set
-train_dataset, valid_dataset = nlp.data.train_valid_split(dataset, 0.01)
-len(train_dataset), len(valid_dataset)
-```
-
-### Preliminary processing of the data
-
-The purpose of the following code is to process the raw data so that the pre-processed data can be used for model training and prediction. We will use the `SpacyTokenizer` to split each document into tokens, `ClipSequence` to crop the comments to the specified length, and then build a vocabulary based on the word frequencies in the training data. Next, we attach the pre-trained [GloVe](https://nlp.stanford.edu/pubs/glove.pdf) [2] word vectors to the vocabulary and convert each token into the corresponding word index in the vocabulary.
-Finally, we get the standardized training and validation data sets. Here we also define a few helper functions for later, and take advantage of `mp.Pool()` to spread the pre-processing over multiple cores.
-
-
-```{.python .input}
-# The tokenizer takes as input a string and outputs a list of tokens.
-tokenizer = nlp.data.SpacyTokenizer('en')
-
-# `length_clip` takes as input a list and outputs a list with maximum length 100.
-length_clip = nlp.data.ClipSequence(100)
-
-def preprocess(x):
-
- # Convert the number of stars 1, 2, 3, 4, 5 to zero-based index, 0, 1, 2, 3, 4
- data, label = x[0], x[1]-1
-
- # Clip the length of review words
- data = length_clip(tokenizer(data))
- return data, label
-
-def get_length(x):
- return float(len(x[0]))
-
-def preprocess_dataset(dataset):
- start = time.time()
-
- with mp.Pool() as pool:
- # Each sample is processed in an asynchronous manner.
- dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
- lengths = gluon.data.SimpleDataset(pool.map(get_length, dataset))
- end = time.time()
-
- print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
- return dataset, lengths
-
-# Preprocess the dataset
-train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
-valid_dataset, valid_data_lengths = preprocess_dataset(valid_dataset)
-```
-
-This section creates the `vocab` object, attaches the pre-trained GloVe embeddings to it, and converts the dataset's tokens to their vocabulary indices.
-
-```{.python .input}
-# Create the vocab
-train_seqs = [sample[0] for sample in train_dataset]
-counter = nlp.data.count_tokens(list(itertools.chain.from_iterable(train_seqs)))
-
-vocab = nlp.Vocab(counter, max_size=10000)
-
-# Load the pre-trained embedding, in this case the 300-dimensional GloVe embedding
-embedding_weights = nlp.embedding.GloVe(source='glove.6B.300d')
-vocab.set_embedding(embedding_weights)
-print(vocab)
-
-def token_to_idx(x):
- return vocab[x[0]], x[1]
-
-# A token index or a list of token indices is returned according to the vocabulary.
-with mp.Pool() as pool:
- train_dataset = pool.map(token_to_idx, train_dataset)
- valid_dataset = pool.map(token_to_idx, valid_dataset)
-
-```
-
-## Bucketing, mini-batches, and the `DataLoader`
-Since each sentence may have a different length, we need to use `Pad` to fill the sentences in a mini-batch to equal lengths so that the data can be packed into a single tensor and processed quickly on the GPU. At the same time, we need to use `Stack` to stack the labels of a batch of data. For convenience, we use `Tuple` to combine `Pad` and `Stack`, as illustrated in the short sketch below.
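-
-As a minimal, illustrative sketch (not part of the actual pipeline), here is what such a batchify function produces on a toy batch of made-up token indices, assuming the same `nlp.data.batchify` API used below:
-
-```{.python .input}
-import gluonnlp as nlp
-
-toy_batchify_fn = nlp.data.batchify.Tuple(
-    nlp.data.batchify.Pad(axis=0, pad_val=0),  # pad token ids up to the longest sequence
-    nlp.data.batchify.Stack())                 # stack the integer labels
-
-toy_samples = [([1, 2, 3], 0), ([4, 5], 1)]    # two (token_ids, label) samples
-toy_tokens, toy_labels = toy_batchify_fn(toy_samples)
-print(toy_tokens)  # shape (2, 3); the shorter sequence is padded with 0
-print(toy_labels)  # the two labels stacked into a single array
-```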
-
-In order to keep the amount of padding in each mini-batch as small as possible, we should group sentences of similar length into the same batch. In light of this, we construct a sampler using `FixedBucketSampler`, which defines how the samples in a dataset will be iterated over in a more economical way.
-
-Finally, we use `DataLoader` to build data loaders for the training and validation datasets. The training dataset uses the `FixedBucketSampler`, while the validation dataset does not need a sampler.
-
-Here we define the helper functions to do all of the above as well as define the hyperparameters for this section:
-
-```{.python .input}
-batch_size = 64
-bucket_num = 10
-bucket_ratio = 0.5
-
-
-def get_dataloader():
-
-    # Construct the batchify function: pad the data and stack the labels
- batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=0),
- nlp.data.batchify.Stack())
-
- # In this example, we use a FixedBucketSampler,
- # which assigns each data sample to a fixed bucket based on its length.
- batch_sampler = nlp.data.sampler.FixedBucketSampler(
- train_data_lengths,
- batch_size=batch_size,
- num_buckets=bucket_num,
- ratio=bucket_ratio,
- shuffle=True)
- print(batch_sampler.stats())
-
- # Training set DataLoader
- train_dataloader = gluon.data.DataLoader(
- dataset=train_dataset,
- batch_sampler=batch_sampler,
- batchify_fn=batchify_fn)
- # Validation set DataLoader
- valid_dataloader = gluon.data.DataLoader(
- dataset=valid_dataset,
- batch_size=batch_size,
- shuffle=False,
- batchify_fn=batchify_fn)
- return train_dataloader, valid_dataloader
-
-train_dataloader, valid_dataloader = get_dataloader()
-```
-
-## Constructing the model and outlining the model's structure
-
-In the original paper, the representation of the sentence is broken into the following steps:
-
-Firstly, the sentence is split into a list of its words (tokens).
-Then the words are unrolled in order, and the word vector of each word is fed as the input of each step of the [bidirectional LSTM neural network layer](https://www.bioinf.jku.at/publications/older/2604.pdf) [3].
-Taking the output of each step of the bidirectional LSTM layer, a matrix H is obtained. Suppose the hidden dimension of the bidirectional LSTM is `U` and the number of words in the sentence is `N`; then the shape of H is `N x 2U`. For example, the sentence "This movie is amazing!" would be represented as:
-![](Bi-LSTM-Rep.png)
-
-Attention works much like human vision: when we look at a scene, we give different importance (or weights) to the things in our field of view. A brief quote from skymind.ai summarizes what attention means in our daily lives as well as in neural networks in a few clear words:
-
-> The word describes the mind’s ability to allocate consideration unevenly across a field of sensation, thought and proprioception, to focus and bring certain inputs to the fore, while ignoring or diminishing the importance of others. So for neural networks, we’re basically talking about credit assignment. [4]
-
-For example, when we are communicating with people, our eyes pay more attention to the speaker's face than to the type of trousers they are wearing or their toenail polish. Likewise, when representing a sentence with this model, we can pay different amounts of attention to the output H of the bidirectional LSTM layer.
-![](attention-nlp.png)
-$$
-A = Softmax(W_{s2}tanh(W_{s1}H^T))
-$$
-
-Here, `Ws1` is a weight matrix of shape `da`-by-`2U`, where `da` is a hyperparameter (the `att_unit` in the code below).
-`Ws2` is a weight matrix of shape `r`-by-`da`, where `r` is the number of attention hops you want to use (the `att_hops` below).
-
-When the attention matrix `A` and the output `H` of the LSTM are obtained, the final representation is obtained by $$M = AH$$.
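-
-For intuition, here is a minimal NumPy sketch of the shapes involved; the sizes are made up for illustration, and the actual model below implements the same equations with `Dense` layers:
-
-```{.python .input}
-import numpy as np
-
-N, U, d_a, r = 6, 4, 5, 2          # sentence length, LSTM hidden size, attention units, attention hops
-H = np.random.randn(N, 2 * U)      # bi-LSTM outputs, one row per word
-W_s1 = np.random.randn(d_a, 2 * U)
-W_s2 = np.random.randn(r, d_a)
-
-scores = W_s2 @ np.tanh(W_s1 @ H.T)                             # shape (r, N)
-A = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)  # row-wise softmax
-M = A @ H                                                       # shape (r, 2U): one summary vector per hop
-print(A.shape, M.shape)                                         # (2, 6) (2, 8)
-```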
-
-We can first customize a layer of attention, specify the number of hidden nodes (`att_unit`) and the number of attention channels (`att_hops`).
-
-
-```{.python .input}
-# A custom attention layer
-class SelfAttention(nn.HybridBlock):
- def __init__(self, att_unit, att_hops, **kwargs):
- super(SelfAttention, self).__init__(**kwargs)
- with self.name_scope():
- self.ut_dense = nn.Dense(att_unit, activation='tanh', flatten=False)
- self.et_dense = nn.Dense(att_hops, activation=None, flatten=False)
-
- def hybrid_forward(self, F, x):
- # x shape: [batch_size, seq_len, embedding_width]
- # ut shape: [batch_size, seq_len, att_unit]
- ut = self.ut_dense(x)
- # et shape: [batch_size, seq_len, att_hops]
- et = self.et_dense(ut)
-
- # att shape: [batch_size, att_hops, seq_len]
- att = F.softmax(F.transpose(et, axes=(0, 2, 1)), axis=-1)
- # output shape [batch_size, att_hops, embedding_width]
- output = F.batch_dot(att, x)
-
- return output, att
-```
-
-When the number of samples per label is very unbalanced, applying different weights to different labels may improve the performance of the model significantly.
-
-```{.python .input}
-
-class WeightedSoftmaxCE(nn.Block):
- def __init__(self, sparse_label=True, from_logits=False, **kwargs):
- super(WeightedSoftmaxCE, self).__init__(**kwargs)
- with self.name_scope():
- self.sparse_label = sparse_label
- self.from_logits = from_logits
-
- def forward(self, pred, label, class_weight, depth=None):
- if self.sparse_label:
- label = nd.reshape(label, shape=(-1, ))
- label = nd.one_hot(label, depth)
- if not self.from_logits:
- pred = nd.log_softmax(pred, -1)
-
- weight_label = nd.broadcast_mul(label, class_weight)
- loss = -nd.sum(pred * weight_label, axis=-1)
-
- # return nd.mean(loss, axis=0, exclude=True)
- return loss
-
-```
-
-We now define the basic model characteristics in a self-attentive bi-LSTM model, and configure the layers and dropout, as well as how the model feeds forward.
-
-```{.python .input}
-class SelfAttentiveBiLSTM(nn.HybridBlock):
- def __init__(self, vocab_len, embsize, nhidden, nlayers, natt_unit, natt_hops, nfc, nclass,
- drop_prob, pool_way, prune_p=None, prune_q=None, **kwargs):
- super(SelfAttentiveBiLSTM, self).__init__(**kwargs)
- with self.name_scope():
- self.embedding_layer = nn.Embedding(vocab_len, embsize)
- self.bilstm = rnn.LSTM(nhidden, num_layers=nlayers, dropout=drop_prob, bidirectional=True)
- self.att_encoder = SelfAttention(natt_unit, natt_hops)
- self.dense = nn.Dense(nfc, activation='tanh')
- self.output_layer = nn.Dense(nclass)
-
- self.dense_p, self.dense_q = None, None
- if all([prune_p, prune_q]):
- self.dense_p = nn.Dense(prune_p, activation='tanh', flatten=False)
- self.dense_q = nn.Dense(prune_q, activation='tanh', flatten=False)
-
- self.drop_prob = drop_prob
- self.pool_way = pool_way
-
- def hybrid_forward(self, F, inp):
- # input_embed: [batch, len, emsize]
- inp_embed = self.embedding_layer(inp)
- h_output = self.bilstm(F.transpose(inp_embed, axes=(1, 0, 2)))
- # att_output: [batch, att_hops, emsize]
- att_output, att = self.att_encoder(F.transpose(h_output, axes=(1, 0, 2)))
-
- dense_input = None
- if self.pool_way == 'flatten':
- dense_input = F.Dropout(F.flatten(att_output), self.drop_prob)
- elif self.pool_way == 'mean':
- dense_input = F.Dropout(F.mean(att_output, axis=1), self.drop_prob)
- elif self.pool_way == 'prune' and all([self.dense_p, self.dense_q]):
- # p_section: [batch, att_hops, prune_p]
- p_section = self.dense_p(att_output)
- # q_section: [batch, emsize, prune_q]
- q_section = self.dense_q(F.transpose(att_output, axes=(0, 2, 1)))
- dense_input = F.Dropout(F.concat(F.flatten(p_section), F.flatten(q_section), dim=-1), self.drop_prob)
-
- dense_out = self.dense(dense_input)
- output = self.output_layer(F.Dropout(dense_out, self.drop_prob))
-
- return output, att
-```
-
-## Configuring the parameters and assembling the model
-
-The resulting `M` is a matrix, which can be turned into a single feature vector for classification by `flatten`-ing, `mean`-ing, or `prune`-ing it. Pruning is an effective way of trimming parameters that was proposed in the original paper, and it has been implemented for our example.
-
-
-```{.python .input}
-vocab_len = len(vocab)
-emsize = 300 # word embedding size
-nhidden = 300 # lstm hidden_dim
-nlayers = 2 # lstm layers
-natt_unit = 300 # the hidden_units of attention layer
-natt_hops = 2 # the channels of attention
-nfc = 512
-nclass = 5
-
-drop_prob = 0.5
-pool_way = 'flatten' # The way to handle M
-prune_p = None
-prune_q = None
-
-ctx = try_gpu()
-
-model = SelfAttentiveBiLSTM(vocab_len, emsize, nhidden, nlayers,
- natt_unit, natt_hops, nfc, nclass,
- drop_prob, pool_way, prune_p, prune_q)
-
-model.initialize(init=init.Xavier(), ctx=ctx)
-model.hybridize()
-
-# Attach a pre-trained glove word vector to the embedding layer
-model.embedding_layer.weight.set_data(vocab.embedding.idx_to_vec)
-# Freeze the embedding layer so its weights are not updated during training
-model.embedding_layer.collect_params().setattr('grad_req', 'null')
-```
-
-Using r attention hops can improve the representation of sentences by capturing different aspects of their semantics, but if the rows of the attention matrix `A` (of shape r-by-n) are very close to one another, there is effectively no difference between the hops, and in $$M = AH$$ the resulting `M` will contain a lot of redundant information.
-So in order to solve this problem, we should encourage the rows of `A` to differ noticeably from one another, that is, to satisfy the diversity of attention. A penalty term can be used to achieve this goal.
-
-$$ P = ||(AA^T-I)||_F^2 $$
-
-
-It can be seen from the above formula that the more similar the rows of `A` are to one another, the larger `P` becomes, and the more the rows differ, the smaller `P` is. In other words, the greater the diversity across the r attention hops of `A`, the smaller the penalty. By including this penalty term in the loss of the model, we encourage the diversity of `A`.
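-
-For intuition, here is a small NumPy sketch with made-up attention weights, separate from the training code below, showing that a redundant `A` yields a larger penalty than a diverse one:
-
-```{.python .input}
-import numpy as np
-
-def frobenius_penalty(A):
-    # ||A A^T - I||_F^2 for an r-by-n attention matrix A
-    r = A.shape[0]
-    return np.linalg.norm(A @ A.T - np.eye(r)) ** 2
-
-# Two attention hops over four tokens; each row sums to one, as after a softmax
-A_redundant = np.array([[0.4, 0.3, 0.2, 0.1],
-                        [0.4, 0.3, 0.2, 0.1]])    # both hops attend identically
-A_diverse = np.array([[0.90, 0.05, 0.03, 0.02],
-                      [0.02, 0.03, 0.05, 0.90]])  # hops attend to different words
-
-print(frobenius_penalty(A_redundant))  # ~1.16, larger penalty
-print(frobenius_penalty(A_diverse))    # ~0.07, smaller penalty
-```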
-
-We incorporate these findings in the code below, adding the penalty term, scaled by a penalty coefficient, to the standard loss function.
-
-
-```{.python .input}
-def calculate_loss(x, y, model, loss, class_weight, penal_coeff):
- pred, att = model(x)
- if loss_name == 'sce':
- l = loss(pred, y)
- elif loss_name == 'wsce':
- l = loss(pred, y, class_weight, class_weight.shape[0])
-
- # penalty
- diversity_penalty = nd.batch_dot(att, nd.transpose(att, axes=(0, 2, 1))
- ) - nd.eye(att.shape[1], ctx=att.context)
- l = l + penal_coeff * diversity_penalty.norm(axis=(1, 2))
-
- return pred, l
-```
-
-We then define what one epoch of training would be for the model, for easier use later. In addition, we calculate the loss, the F1 score, and the accuracy for each epoch and print them for easier understanding. We also decay the learning rate as the number of epochs increases, and include an `is_train` boolean so that we know whether we should be updating the model's parameters or just reporting the loss.
-
-```{.python .input}
-def one_epoch(data_iter, model, loss, trainer, ctx, is_train, epoch,
- penal_coeff=0.0, clip=None, class_weight=None, loss_name='wsce'):
-
- loss_val = 0.
- total_pred = []
- total_true = []
- n_batch = 0
-
- for batch_x, batch_y in data_iter:
- batch_x = batch_x.as_in_context(ctx)
- batch_y = batch_y.as_in_context(ctx)
-
- if is_train:
- with autograd.record():
- batch_pred, l = calculate_loss(batch_x, batch_y, model, loss, class_weight, penal_coeff)
-
-            # backward pass
- l.backward()
-
- # clip gradient
- clip_params = [p.data() for p in model.collect_params().values()]
- if clip is not None:
- norm = nd.array([0.0], ctx)
- for param in clip_params:
- if param.grad is not None:
- norm += (param.grad ** 2).sum()
- norm = norm.sqrt().asscalar()
- if norm > clip:
- for param in clip_params:
- if param.grad is not None:
- param.grad[:] *= clip / norm
-
-            # update params
- trainer.step(batch_x.shape[0])
-
- else:
- batch_pred, l = calculate_loss(batch_x, batch_y, model, loss, class_weight, penal_coeff)
-
- # keep result for metric
- batch_pred = nd.argmax(nd.softmax(batch_pred, axis=1), axis=1).asnumpy()
- batch_true = np.reshape(batch_y.asnumpy(), (-1, ))
- total_pred.extend(batch_pred.tolist())
- total_true.extend(batch_true.tolist())
-
- batch_loss = l.mean().asscalar()
-
- n_batch += 1
- loss_val += batch_loss
-
-        # check the result of the training phase
- if is_train and n_batch % 400 == 0:
- print('epoch %d, batch %d, batch_train_loss %.4f, batch_train_acc %.3f' %
- (epoch, n_batch, batch_loss, accuracy_score(batch_true, batch_pred)))
-
- # metric
- F1 = f1_score(np.array(total_true), np.array(total_pred), average='weighted')
- acc = accuracy_score(np.array(total_true), np.array(total_pred))
- loss_val /= n_batch
-
- if is_train:
- print('epoch %d, learning_rate %.5f \n\t train_loss %.4f, acc_train %.3f, F1_train %.3f, ' %
- (epoch, trainer.learning_rate, loss_val, acc, F1))
-        # decay lr
- if epoch % 2 == 0:
- trainer.set_learning_rate(trainer.learning_rate * 0.9)
- else:
- print('\t valid_loss %.4f, acc_valid %.3f, F1_valid %.3f, ' % (loss_val, acc, F1))
-
-```
-
-In addition, we include a helper method `train_valid`, which runs one epoch over the training data and then one over the validation data, using the `is_train` boolean to swap between the two modes discussed above.
-
-```{.python .input}
-def train_valid(data_iter_train, data_iter_valid, model, loss, trainer, ctx, nepochs,
- penal_coeff=0.0, clip=None, class_weight=None, loss_name='wsce'):
-
- for epoch in range(1, nepochs+1):
- start = time.time()
- # train
- is_train = True
- one_epoch(data_iter_train, model, loss, trainer, ctx, is_train,
- epoch, penal_coeff, clip, class_weight, loss_name)
-
- # valid
- is_train = False
- one_epoch(data_iter_valid, model, loss, trainer, ctx, is_train,
- epoch, penal_coeff, clip, class_weight, loss_name)
- end = time.time()
- print('time %.2f sec' % (end-start))
- print("*"*100)
-
-```
-
-## Training the model
-
-Now that we are ready to train the model, we use `WeightedSoftmaxCE` to alleviate the problem of class imbalance in the data. We perform statistical analysis on the data in advance to obtain a set of `class_weight`s.
-
-
-```{.python .input}
-class_weight = None
-loss_name = 'wsce'
-optim = 'adam'
-lr = 0.001
-penal_coeff = 0.1
-clip = 0.5
-nepochs = 4
-
-trainer = gluon.Trainer(model.collect_params(), optim, {'learning_rate': lr})
-
-if loss_name == 'sce':
- loss = gluon.loss.SoftmaxCrossEntropyLoss()
-elif loss_name == 'wsce':
- loss = WeightedSoftmaxCE()
- # the value of class_weight is obtained by counting data in advance. It can be seen as a hyperparameter.
- class_weight = nd.array([3.0, 5.3, 4.0, 2.0, 1.0], ctx=ctx)
-```
-
-Because we created the necessary helper methods earlier, training is as simple as the line of code below.
-
-```{.python .input}
-# train and valid
-train_valid(train_dataloader, valid_dataloader, model, loss, trainer, ctx, nepochs,
- penal_coeff=penal_coeff, clip=clip, class_weight=class_weight, loss_name=loss_name)
-```
-
-## Predictions and sampling using our model
-
-Now that the model has been trained, we can feed an arbitrary sentence into it and predict its sentiment label. The labels range from one through five, corresponding to degrees of sentiment from negative to positive.
-
-```{.python .input}
-input_ar = nd.array(vocab[['This', 'movie', 'is', 'amazing']], ctx=ctx).reshape((1, -1))
-pred, att = model(input_ar)
-
-label = np.argmax(nd.softmax(pred, axis=1).asnumpy(), axis=1) + 1
-print(label)
-print(att)
-```
-
-In order to intuitively understand the role of the attention mechanism, we visualize the output of the model's attention on the predicted samples using the `matplotlib` and `seaborn` modules.
-
-```{.python .input}
-# Visualizing the attention layer
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-%matplotlib inline
-
-np.squeeze(att.asnumpy(), 0).shape
-plt.figure(figsize=(8,1))
-cmap = sns.diverging_palette(220, 10, as_cmap=True)
-sns.heatmap(np.squeeze(att.asnumpy(), 0), cmap=cmap, annot=True,
- xticklabels=['This', 'movie', 'is', 'amazing'], yticklabels=['att0', 'att1'])
-plt.show()
-```
-
-## Conclusions
-
-Word embedding can effectively represent the semantic similarity between words, which allows for many breakthroughs in complex natural language processing tasks. Attention mechanisms can intuitively grasp the important semantic features in the sentence. The LSTM captures the word-order relationship between words in a sentence. Through a combination of these three, word embeddings, LSTMs, and attention mechanisms, we can effectively represent the semantics of a sentence and apply it to many practical tasks.
-
-GluonNLP provides us with an efficient and convenient toolbox to help us experiment quickly. This greatly simplifies the tedious work of many natural language processing tasks.
-
-## References
-
-1. [A Structured Self-Attentive Sentence Embedding](https://arxiv.org/pdf/1703.03130.pdf)
-2. [Glove: Global vectors for word representation. In Proceedings of the 2014 conference on empirical methods in natural language processing](https://nlp.stanford.edu/pubs/glove.pdf)
-3. [Long short-term memory](https://www.bioinf.jku.at/publications/older/2604.pdf)
-4. [Skymind.AI A Beginner's Guide to Attention Mechanisms and Memory Networks](https://skymind.ai/wiki/attention-mechanism-memory-network)
diff --git a/docs/examples/sentiment_analysis/sentiment_analysis.md b/docs/examples/sentiment_analysis/sentiment_analysis.md
deleted file mode 100644
index 9559bff86f..0000000000
--- a/docs/examples/sentiment_analysis/sentiment_analysis.md
+++ /dev/null
@@ -1,354 +0,0 @@
-# Fine-tuning LSTM-based Language Model
-
-Now that we've covered some advanced topics using advanced models, let's return to the basics and show how these techniques can help us even when addressing the comparatively simple problem of classification. In particular, we'll look at the classic problem of sentiment analysis: taking an input consisting of a string of text and classifying its sentiment as positive or negative.
-
-In this notebook, we are going to use GluonNLP to build a sentiment analysis model whose weights are initialized based on a pre-trained language model. Using pre-trained language model weights is a common approach for semi-supervised learning in NLP. In order to do a good job of language modeling on a large corpus of text, our model must learn representations that contain information about the structure of natural language. Intuitively, by starting with these good features rather than with random features, we're able to converge faster towards a superior model for our downstream task.
-
-With GluonNLP, we can quickly prototype the model, and it's easy to customize. The building process consists of just three simple steps. For this demonstration we'll focus on movie reviews from the Large Movie Review Dataset, also known as the IMDB dataset. Given a movie review, our model will output a prediction of its sentiment, which can be positive or negative.
-
-
-## Setup
-
-Firstly, we must load the required modules. Please remember to download the archive from the top of this tutorial
-if you'd like to follow along. We set the random seed so the outcome can be relatively consistent.
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-
-import random
-import time
-import multiprocessing as mp
-import numpy as np
-
-import mxnet as mx
-from mxnet import nd, gluon, autograd
-
-import gluonnlp as nlp
-nlp.utils.check_version('0.7.0')
-
-random.seed(123)
-np.random.seed(123)
-mx.random.seed(123)
-```
-
-## Sentiment analysis model with pre-trained language model encoder
-
-So that we can easily transplant the pre-trained weights, we'll base our model architecture on the pre-trained language model (LM). Following the LSTM layer, we have one representation vector for each word in the sentence. Because we plan to make a single prediction (as opposed to one per word), we'll first pool our predictions across time steps before feeding them through a dense last layer to produce our final prediction (a single sigmoid output node).
-
-![sa-model](samodel-v3.png)
-
-Specifically, our model represents input words by their embeddings. Following the embedding layer, our model consists of a two-layer LSTM, followed by an average pooling layer, followed by a sigmoid output layer (all illustrated in the figure above).
-
-Thus, given an input sequence, the memory cells in the LSTM layer will produce a representation sequence. This representation sequence is then averaged over all time steps resulting in a fixed-length sentence representation $h$. Finally, we apply a sigmoid output layer on top of $h$. We’re using the sigmoid activation function because we’re trying to predict if this text has positive or negative sentiment. A sigmoid activation function squashes the output values to the range [0,1], allowing us to interpret this output as a probability, making our lives relatively simpler.
-
-Below we define our `MeanPoolingLayer` and basic sentiment analysis network's (`SentimentNet`) structure.
-
-```{.python .input}
-class MeanPoolingLayer(gluon.HybridBlock):
- """A block for mean pooling of encoder features"""
- def __init__(self, prefix=None, params=None):
- super(MeanPoolingLayer, self).__init__(prefix=prefix, params=params)
-
- def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
- """Forward logic"""
- # Data will have shape (T, N, C)
- masked_encoded = F.SequenceMask(data,
- sequence_length=valid_length,
- use_sequence_length=True)
- agg_state = F.broadcast_div(F.sum(masked_encoded, axis=0),
- F.expand_dims(valid_length, axis=1))
- return agg_state
-
-
-class SentimentNet(gluon.HybridBlock):
- """Network for sentiment analysis."""
- def __init__(self, dropout, prefix=None, params=None):
- super(SentimentNet, self).__init__(prefix=prefix, params=params)
- with self.name_scope():
- self.embedding = None # will set with lm embedding later
- self.encoder = None # will set with lm encoder later
- self.agg_layer = MeanPoolingLayer()
- self.output = gluon.nn.HybridSequential()
- with self.output.name_scope():
- self.output.add(gluon.nn.Dropout(dropout))
- self.output.add(gluon.nn.Dense(1, flatten=False))
-
- def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
- encoded = self.encoder(self.embedding(data)) # Shape(T, N, C)
- agg_state = self.agg_layer(encoded, valid_length)
- out = self.output(agg_state)
- return out
-```
-
-## Defining the hyperparameters and initializing the model
-
-### Hyperparameters
-
-Our model is based on a standard LSTM model. We use a hidden layer size of 200. We use bucketing for speeding up the processing of variable-length sequences. We don't configure dropout for this model as it could be deleterious to the results.
-
-```{.python .input}
-dropout = 0
-language_model_name = 'standard_lstm_lm_200'
-pretrained = True
-learning_rate, batch_size = 0.005, 32
-bucket_num, bucket_ratio = 10, 0.2
-epochs = 1
-grad_clip = None
-log_interval = 100
-```
-
-If your environment supports GPUs, keep the context value the same. If it doesn't, swap the `mx.gpu(0)` to `mx.cpu()`.
-
-```{.python .input}
-context = mx.gpu(0)
-```
-
-### Loading the pre-trained model
-
-The loading of the pre-trained model, like in previous tutorials, is as simple as one line.
-
-```{.python .input}
-lm_model, vocab = nlp.model.get_model(name=language_model_name,
- dataset_name='wikitext-2',
- pretrained=pretrained,
- ctx=context,
- dropout=dropout)
-```
-
-### Creating the sentiment analysis model from the loaded pre-trained model
-
-In the code above, we have already acquired a pre-trained model on the Wikitext-2 dataset using `nlp.model.get_model`. Below, we construct a `SentimentNet` object, which takes as input the embedding layer and encoder of the pre-trained model.
-
-As we employ the pre-trained embedding layer and encoder, *we only need to initialize the output layer* using `net.output.initialize(mx.init.Xavier(), ctx=context)`.
-
-```{.python .input}
-net = SentimentNet(dropout=dropout)
-net.embedding = lm_model.embedding
-net.encoder = lm_model.encoder
-net.hybridize()
-net.output.initialize(mx.init.Xavier(), ctx=context)
-print(net)
-```
-
-## The data pipeline
-
-In this section, we describe in detail the data pipeline, from initialization to modifying it for use in our model.
-
-### Loading the sentiment analysis dataset (IMDB reviews)
-
-In the labeled train/test sets, out of a maximum score of 10, a negative review has a score of no more than 4, and a positive review has a score of no less than 7. Thus reviews with more neutral ratings are not included in the train/test sets. We label a negative review whose score <= 4 as 0, and a
-positive review whose score >= 7 as 1. As the neutral ratings are not
-included in the datasets, we can use 5 as our threshold.
-
-```{.python .input}
-# The tokenizer takes as input a string and outputs a list of tokens.
-tokenizer = nlp.data.SpacyTokenizer('en')
-
-# `length_clip` takes as input a list and outputs a list with maximum length 500.
-length_clip = nlp.data.ClipSequence(500)
-
-# Helper function to preprocess a single data point
-def preprocess(x):
- data, label = x
- label = int(label > 5)
- # A token index or a list of token indices is
- # returned according to the vocabulary.
- data = vocab[length_clip(tokenizer(data))]
- return data, label
-
-# Helper function for getting the length
-def get_length(x):
- return float(len(x[0]))
-
-# Loading the dataset
-train_dataset, test_dataset = [nlp.data.IMDB(root='data/imdb', segment=segment)
- for segment in ('train', 'test')]
-print('Tokenize using spaCy...')
-
-```
-
-Here we use the helper functions defined above to make pre-processing the dataset relatively stress-free and concise. As in a previous tutorial, `mp.Pool()` is leveraged to divide the work of preprocessing to multiple cores/machines.
-
-```{.python .input}
-def preprocess_dataset(dataset):
- start = time.time()
- with mp.Pool() as pool:
- # Each sample is processed in an asynchronous manner.
- dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
- lengths = gluon.data.SimpleDataset(pool.map(get_length, dataset))
- end = time.time()
- print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
- return dataset, lengths
-
-# Doing the actual pre-processing of the dataset
-train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
-test_dataset, test_data_lengths = preprocess_dataset(test_dataset)
-```
-
-In the following code, we use FixedBucketSampler, which assigns each data sample to a fixed bucket based on its length. The bucket keys are either given or generated from the input sequence lengths and the number of buckets.
-
-```{.python .input}
-# Construct the DataLoader
-
-def get_dataloader():
-
- # Pad data, stack label and lengths
- batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=0, ret_length=True),
- nlp.data.batchify.Stack(dtype='float32'))
- batch_sampler = nlp.data.sampler.FixedBucketSampler(
- train_data_lengths,
- batch_size=batch_size,
- num_buckets=bucket_num,
- ratio=bucket_ratio,
- shuffle=True)
- print(batch_sampler.stats())
-
- # Construct a DataLoader object for both the training and test data
- train_dataloader = gluon.data.DataLoader(
- dataset=train_dataset,
- batch_sampler=batch_sampler,
- batchify_fn=batchify_fn)
- test_dataloader = gluon.data.DataLoader(
- dataset=test_dataset,
- batch_size=batch_size,
- shuffle=False,
- batchify_fn=batchify_fn)
- return train_dataloader, test_dataloader
-
-# Use the pre-defined function to make the retrieval of the DataLoader objects simple
-train_dataloader, test_dataloader = get_dataloader()
-```
-
-## Training the model
-
-Now that all the data has been pre-processed and the model architecture has been loosely defined, we can define the helper functions for evaluation and training of the model.
-
-### Evaluation using loss and accuracy
-
-Here, we define a function `evaluate(net, dataloader, context)` to determine the loss and accuracy of our model in a concise way. The code is very similar to evaluation of other models in the previous tutorials. For more information and explanation of this code, please refer to the previous tutorial on [LSTM-based Language Models](https://gluon-nlp.mxnet.io/master/examples/language_model/language_model.html).
-
-```{.python .input}
-def evaluate(net, dataloader, context):
- loss = gluon.loss.SigmoidBCELoss()
- total_L = 0.0
- total_sample_num = 0
- total_correct_num = 0
- start_log_interval_time = time.time()
-
- print('Begin Testing...')
- for i, ((data, valid_length), label) in enumerate(dataloader):
- data = mx.nd.transpose(data.as_in_context(context))
- valid_length = valid_length.as_in_context(context).astype(np.float32)
- label = label.as_in_context(context)
- output = net(data, valid_length)
-
- L = loss(output, label)
- pred = (output > 0.5).reshape(-1)
- total_L += L.sum().asscalar()
- total_sample_num += label.shape[0]
- total_correct_num += (pred == label).sum().asscalar()
-
- if (i + 1) % log_interval == 0:
- print('[Batch {}/{}] elapsed {:.2f} s'.format(
- i + 1, len(dataloader),
- time.time() - start_log_interval_time))
- start_log_interval_time = time.time()
-
- avg_L = total_L / float(total_sample_num)
- acc = total_correct_num / float(total_sample_num)
-
- return avg_L, acc
-```
-
-In the following code, we define the training function `train`. For each epoch it iterates over the training mini-batches, optionally clips the global gradient norm, logs the loss and throughput at regular intervals, and evaluates the model on the test set at the end of the epoch.
-
-```{.python .input}
-def train(net, context, epochs):
- trainer = gluon.Trainer(net.collect_params(), 'ftml',
- {'learning_rate': learning_rate})
- loss = gluon.loss.SigmoidBCELoss()
-
- parameters = net.collect_params().values()
-
- # Training/Testing
- for epoch in range(epochs):
- # Epoch training stats
- start_epoch_time = time.time()
- epoch_L = 0.0
- epoch_sent_num = 0
- epoch_wc = 0
- # Log interval training stats
- start_log_interval_time = time.time()
- log_interval_wc = 0
- log_interval_sent_num = 0
- log_interval_L = 0.0
-
- for i, ((data, length), label) in enumerate(train_dataloader):
- L = 0
- wc = length.sum().asscalar()
- log_interval_wc += wc
- epoch_wc += wc
- log_interval_sent_num += data.shape[1]
- epoch_sent_num += data.shape[1]
- with autograd.record():
- output = net(data.as_in_context(context).T,
- length.as_in_context(context)
- .astype(np.float32))
- L = L + loss(output, label.as_in_context(context)).mean()
- L.backward()
- # Clip gradient
- if grad_clip:
- gluon.utils.clip_global_norm(
- [p.grad(context) for p in parameters],
- grad_clip)
- # Update parameter
- trainer.step(1)
- log_interval_L += L.asscalar()
- epoch_L += L.asscalar()
- if (i + 1) % log_interval == 0:
- print(
- '[Epoch {} Batch {}/{}] elapsed {:.2f} s, '
- 'avg loss {:.6f}, throughput {:.2f}K wps'.format(
- epoch, i + 1, len(train_dataloader),
- time.time() - start_log_interval_time,
- log_interval_L / log_interval_sent_num, log_interval_wc
- / 1000 / (time.time() - start_log_interval_time)))
- # Clear log interval training stats
- start_log_interval_time = time.time()
- log_interval_wc = 0
- log_interval_sent_num = 0
- log_interval_L = 0
- end_epoch_time = time.time()
- test_avg_L, test_acc = evaluate(net, test_dataloader, context)
- print('[Epoch {}] train avg loss {:.6f}, test acc {:.2f}, '
- 'test avg loss {:.6f}, throughput {:.2f}K wps'.format(
- epoch, epoch_L / epoch_sent_num, test_acc, test_avg_L,
- epoch_wc / 1000 / (end_epoch_time - start_epoch_time)))
-```
-
-And finally, because of all the helper functions we've defined, training our model becomes simply one line of code!
-
-```{.python .input}
-train(net, context, epochs)
-```
-
-And testing it becomes as simple as feeding in a sample sentence, as below:
-
-```{.python .input}
-net(
- mx.nd.reshape(
- mx.nd.array(vocab[['This', 'movie', 'is', 'amazing']], ctx=context),
- shape=(-1, 1)), mx.nd.array([4], ctx=context)).sigmoid()
-```
-
-Indeed, we can feed in any sentence and determine the sentiment with relative ease!
-
-## Conclusion
-
-We built a sentiment analysis model by reusing the feature extractor from the pre-trained language model. The modular design of Gluon blocks makes it very easy to put together models for various needs. GluonNLP provides powerful building blocks that substantially simplify the process of constructing an efficient data pipeline and versatile models.
-
-### More information
-
-The GluonNLP documentation and more tutorials are available at http://gluon-nlp.mxnet.io/index.html to help you get to know and use our tool as easily as possible.
diff --git a/docs/examples/sequence_sampling/index.rst b/docs/examples/sequence_sampling/index.rst
deleted file mode 100644
index 7ad83d697e..0000000000
--- a/docs/examples/sequence_sampling/index.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-Text Generation
-===============
-
-.. container:: cards
-
- .. card::
- :title: Sequence Generation with Beam Search Sampler and Sequence Sampler
- :link: sequence_sampling.html
-
- Learn how to generate sentence from pre-trained language model through sampling and beam
- search.
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- sequence_sampling.ipynb
-
-
-
diff --git a/docs/examples/sequence_sampling/sequence_sampling.md b/docs/examples/sequence_sampling/sequence_sampling.md
deleted file mode 100644
index 20fb865302..0000000000
--- a/docs/examples/sequence_sampling/sequence_sampling.md
+++ /dev/null
@@ -1,197 +0,0 @@
-# Sequence Generation with Beam Search Sampler and Sequence Sampler
-
-This tutorial demonstrates how to sample sequences using a
-pre-trained language model in the following two ways: with a beam search sampler
-and with a sequence sampler.
-
-Let's use `V` to denote the vocabulary size, and `T` to denote the sequence
-length. Given a language model, we can sample sequences according to the
-probability that our model assigns to them. At each time step, a
-language model predicts the likelihood of each word occurring, given the context
-from prior time steps. The output at any time step can be any word from the
-vocabulary, whose size is `V`, so the number of all possible outcomes for a
-sequence of length `T` is $$V^T$$.
-
-While sometimes we might want to sample
-sentences according to their probability of occurring, at other times we want to
-find the sentences that *are most likely to occur*. This is especially true in
-the case of language translation where we don't just want to see *a*
-translation. We want the *best* translation. While finding the optimal outcome
-quickly becomes intractable as the sequence length increases, there are still many ways to
-sample reasonably good sequences. GluonNLP provides two samplers for generating
-from a language model: `BeamSearchSampler` and `SequenceSampler`.
-
-## Loading a pre-trained language model (LM)
-
-Firstly, let's load a pre-trained language model,
-from which we will sample sequences. GluonNLP makes this a painless process.
-
-```{.python .input}
-import mxnet as mx
-import gluonnlp as nlp
-nlp.utils.check_version('0.8.0')
-
-ctx = mx.cpu()
-lm_model, vocab = nlp.model.get_model(name='awd_lstm_lm_1150',
- dataset_name='wikitext-2',
- pretrained=True,
- ctx=ctx)
-```
-
-## Sampling a Sequence with `BeamSearchSampler`
-
-To overcome the exponential complexity in sequence decoding, beam search decodes
-greedily, keeping those sequences that are most likely based on the probability
-up to the current time step. The size of this subset is called the *beam size*.
-Suppose the beam size is `K` and the output vocabulary size is `V`. When
-selecting the beams to keep, the beam search algorithm first predicts all
-possible successor words from the previous `K` beams, each of which has `V`
-possible outputs. This becomes a total of `K*V` candidate paths. Beam search then ranks
-these `K*V` paths by their scores and keeps only the top `K` paths, as sketched in the toy example below.
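-
-To make the pruning step concrete, here is a tiny, self-contained toy example of a single beam-search step with made-up log-probabilities (`K = 2` beams, `V = 3` words); it only illustrates the ranking logic, not the actual `BeamSearchSampler` implementation:
-
-```{.python .input}
-# Two beams with their accumulated log-probabilities
-beams = [(['I', 'love'], -1.2), (['I', 'like'], -1.5)]
-# Made-up next-word log-probabilities for each beam (vocabulary of 3 words)
-next_logp = {'love': {'it': -0.3, 'you': -0.9, 'cats': -2.0},
-             'like': {'it': -0.5, 'you': -1.1, 'cats': -1.8}}
-
-# Expand every beam with every word: K*V = 6 candidate paths
-candidates = [(words + [w], score + lp)
-              for words, score in beams
-              for w, lp in next_logp[words[-1]].items()]
-
-# Keep only the top K paths by accumulated score
-K = 2
-beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:K]
-print(beams)  # [(['I', 'love', 'it'], -1.5), (['I', 'like', 'it'], -2.0)]
-```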
-
-Let's take a look at how to construct a `BeamSearchSampler`. The
-`nlp.model.BeamSearchSampler` class takes the following arguments for
-customization and extension:
-
-- beam_size : the beam size
-- decoder : callable function of the one-step-ahead decoder
-- eos_id : the id of the EOS token
-- scorer: the score function used in beam search
-- max_length: the maximum search length
-
-For beam search to work, we need a scorer function.
-
-#### The scorer function
-
-In this tutorial, we will use the `BeamSearchScorer`
-as the scorer function, which implements the scoring function with length penalty in the
-[Google NMT](https://arxiv.org/pdf/1609.08144.pdf) paper:
-
-```{.python .input}
-scorer = nlp.model.BeamSearchScorer(alpha=0, K=5, from_logits=False)
-```
-
-Defining the scorer is as simple as this one line.
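-
-For reference, our reading of the length-penalized score from the Google NMT paper, which this scorer follows with the `alpha` and `K` arguments shown above, is:
-
-$$
-score(Y) = \frac{\log P(Y)}{lp(Y)}, \qquad lp(Y) = \frac{(K + |Y|)^{\alpha}}{(K + 1)^{\alpha}}
-$$
-
-With `alpha=0`, as in the line above, the length penalty equals 1 and candidates are ranked purely by their accumulated log-probability.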
-
-#### The decoder function
-
-Next, we define the decoder based on the pre-trained
-language model.
-
-```{.python .input}
-class LMDecoder(object):
- def __init__(self, model):
- self._model = model
- def __call__(self, inputs, states):
- outputs, states = self._model(mx.nd.expand_dims(inputs, axis=0), states)
- return outputs[0], states
- def state_info(self, *arg, **kwargs):
- return self._model.state_info(*arg, **kwargs)
-decoder = LMDecoder(lm_model)
-```
-
-#### Beam Search Sampler
-
-Given a scorer and a decoder, we are ready to create a sampler. We use the symbol `.`
-to indicate the end of sentence (EOS). We can use the vocab to get the index of the
-EOS token and then feed that index to the sampler. The following code shows how to
-construct a beam search sampler. We will create a sampler with 5 beams and a
-maximum sample length of 20.
-
-```{.python .input}
-eos_id = vocab['.']
-beam_sampler = nlp.model.BeamSearchSampler(beam_size=5,
- decoder=decoder,
- eos_id=eos_id,
- scorer=scorer,
- max_length=20)
-```
-
-It's really that simple!
-
-#### Generate Sequences with Beam Search
-
-Next, we are going to generate sentences starting with "I love it" using beam
-search first. We feed ['I', 'love'] to the language model to get the initial
-states and set the initial input to be the word 'it'. We will then print the
-top-5 generations.
-
-```{.python .input}
-bos = 'I love it'.split()
-bos_ids = [vocab[ele] for ele in bos]
-begin_states = lm_model.begin_state(batch_size=1, ctx=ctx)
-if len(bos_ids) > 1:
- _, begin_states = lm_model(mx.nd.expand_dims(mx.nd.array(bos_ids[:-1]), axis=1),
- begin_states)
-inputs = mx.nd.full(shape=(1,), ctx=ctx, val=bos_ids[-1])
-```
-
-Here we define the helper function to generate the sequences so we can simply use one line
-to generate new sequences for any given input.
-
-```{.python .input}
-def generate_sequences(sampler, inputs, begin_states, num_print_outcomes):
-
- samples, scores, valid_lengths = sampler(inputs, begin_states)
- samples = samples[0].asnumpy()
- scores = scores[0].asnumpy()
- valid_lengths = valid_lengths[0].asnumpy()
- print('Generation Result:')
-
- for i in range(num_print_outcomes):
- sentence = bos[:-1]
-
- for ele in samples[i][:valid_lengths[i]]:
- sentence.append(vocab.idx_to_token[ele])
-
- print([' '.join(sentence), scores[i]])
-```
-
-And then below, we have the one-liner to generate the sequences.
-
-```{.python .input}
-generate_sequences(beam_sampler, inputs, begin_states, 5)
-```
-
-## Sampling a Sequence with `SequenceSampler`
-
-The previous generation results
-may look a bit boring. Instead, let's now use the sequence sampler to get relatively more
-interesting results.
-
-A `SequenceSampler` samples from the contextual multinomial distribution
-produced by the language model at each time step. Since we may want to control
-how "sharp" the distribution is, trading off diversity against correctness, we can
-use the temperature option in `SequenceSampler`, which controls the temperature
-of the softmax activation function.
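-
-As a rough illustration of what the temperature does, here is a minimal NumPy sketch with made-up logits (this is not the sampler's internal code):
-
-```{.python .input}
-import numpy as np
-
-def softmax_with_temperature(logits, temperature):
-    # Softmax over logits scaled by 1 / temperature
-    scaled = np.asarray(logits) / temperature
-    exp = np.exp(scaled - scaled.max())
-    return exp / exp.sum()
-
-logits = [2.0, 1.0, 0.0]
-print(softmax_with_temperature(logits, 0.5))  # ~[0.87, 0.12, 0.02], sharper
-print(softmax_with_temperature(logits, 2.0))  # ~[0.51, 0.31, 0.19], flatter
-```
-
-Lower temperatures concentrate probability mass on the most likely words, while higher temperatures flatten the distribution and yield more diverse (and noisier) samples.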
-
-For each input, sequence sampler can sample
-multiple **independent** sequences at once. The number of independent sequences
-to sample can be specified through the argument `beam_size`.
-
-Defining the `SequenceSampler` is as simple as this:
-
-```{.python .input}
-seq_sampler = nlp.model.SequenceSampler(beam_size=5,
- decoder=decoder,
- eos_id=eos_id,
- max_length=100,
- temperature=0.97)
-```
-
-
-#### Generate Sequences with Sequence Sampler
-
-Now, instead of using the beam sampler for our `generate_sequences` function, we can use the `SequenceSampler` instead to sample sequences based on the same inputs used previously.
-
-```{.python .input}
-generate_sequences(seq_sampler, inputs, begin_states, 5)
-```
-
-Et voila! We've generated a set of sampled sentences based on our given input.
-
-#### Exercises for the keen reader
-
-- Tweak alpha and K in the BeamSearchScorer. How do the results
-change? Does it do relatively better or worse than the SequenceSampler?
-- Try decoding from different inputs and figure out in which cases the BeamSearchSampler does better than the SequenceSampler.
diff --git a/docs/examples/word_embedding/data.py b/docs/examples/word_embedding/data.py
deleted file mode 120000
index fae6ca2f33..0000000000
--- a/docs/examples/word_embedding/data.py
+++ /dev/null
@@ -1 +0,0 @@
-../../model_zoo/word_embeddings/data.py
\ No newline at end of file
diff --git a/docs/examples/word_embedding/index.rst b/docs/examples/word_embedding/index.rst
index 9f4bc9d120..5dbc578ceb 100644
--- a/docs/examples/word_embedding/index.rst
+++ b/docs/examples/word_embedding/index.rst
@@ -10,33 +10,9 @@ Representation Learning
Basics on how to use word embedding with vocab in GluonNLP and apply it on word similarity and
analogy problems.
- .. card::
- :title: Word Embeddings Training and Evaluation
- :link: word_embedding_training.html
-
- Learn how to train fastText and word2vec embeddings on your own dataset, and determine
- embedding quality through intrinsic evaluation.
-
- .. card::
- :title: Extracting Sentence Features with Pre-trained ELMo
- :link: ../sentence_embedding/elmo_sentence_representation.html
-
- See how to use GluonNLP's model API to automatically download the pre-trained ELMo
- model from NAACL2018 best paper, and extract features with it.
-
- .. card::
- :title: Fine-tuning Pre-trained BERT Models
- :link: ../sentence_embedding/bert.html
-
- See how to use GluonNLP to fine-tune a sentence pair classification model with
- pre-trained BERT parameters.
-
.. toctree::
:hidden:
:maxdepth: 2
word_embedding.ipynb
- word_embedding_training.ipynb
- ../sentence_embedding/elmo_sentence_representation.ipynb
- ../sentence_embedding/bert.ipynb
\ No newline at end of file
diff --git a/docs/examples/word_embedding/model.py b/docs/examples/word_embedding/model.py
deleted file mode 120000
index aaaeb28aa3..0000000000
--- a/docs/examples/word_embedding/model.py
+++ /dev/null
@@ -1 +0,0 @@
-../../model_zoo/word_embeddings/model.py
\ No newline at end of file
diff --git a/docs/examples/word_embedding/utils.py b/docs/examples/word_embedding/utils.py
deleted file mode 120000
index 43bec44533..0000000000
--- a/docs/examples/word_embedding/utils.py
+++ /dev/null
@@ -1 +0,0 @@
-../../model_zoo/word_embeddings/utils.py
\ No newline at end of file
diff --git a/docs/examples/word_embedding/word_embedding.md b/docs/examples/word_embedding/word_embedding.md
index 475355b8ec..6eea630294 100644
--- a/docs/examples/word_embedding/word_embedding.md
+++ b/docs/examples/word_embedding/word_embedding.md
@@ -33,11 +33,12 @@ To begin, let's first import a few packages that we'll need for this example:
import warnings
warnings.filterwarnings('ignore')
-from mxnet import gluon
-from mxnet import nd
+from mxnet import gluon, nd
import gluonnlp as nlp
import re
-nlp.utils.check_version('0.7.0')
+import collections
+import numpy as np
+
```
## Creating Vocabulary with Word Embeddings
@@ -54,18 +55,18 @@ in just a few lines of code.
To begin, suppose that we have a simple text data set consisting of newline-separated strings.
```{.python .input}
-text = " hello world \n hello nice world \n hi world \n"
+text = " hello world \n hello nice world \n hi world \n goodgod"
```
To start, let's implement a simple tokenizer to separate the words and then count the frequency of each word in the data set. We can use our defined tokenizer to count word frequency in the data set.
```{.python .input}
def simple_tokenize(source_str, token_delim=' ', seq_delim='\n'):
- return filter(None, re.split(token_delim + '|' + seq_delim, source_str))
-counter = nlp.data.count_tokens(simple_tokenize(text))
+ return list(filter(None, re.split(token_delim + '|' + seq_delim, source_str)))
+counter = collections.Counter(simple_tokenize(text))
```
-The obtained `counter` behaves like a Python dictionary whose key-value pairs consist of words and their frequencies, respectively.
+The obtained `counter`'s key-value pairs consist of words and their frequencies, respectively.
We can then instantiate a `Vocab` object with a counter.
Because `counter` tracks word frequencies, we are able to specify arguments
such as `max_size` (maximum size) and `min_freq` (minimum frequency) to the `Vocab` constructor to restrict the size of the resulting vocabulary.
@@ -74,86 +75,133 @@ Suppose that we want to build indices for all the keys in counter.
 If we simply want to construct a `Vocab` containing every word, then we can supply `counter` as the only argument.
```{.python .input}
-vocab = nlp.Vocab(counter)
+vocab = nlp.data.Vocab(counter)
```
-A `Vocab` object associates each word with an index. We can easily access words by their indices using the `vocab.idx_to_token` attribute.
+A `Vocab` object associates each word with an index. We can easily access words by their indices using the `vocab.all_tokens` attribute.
```{.python .input}
-for word in vocab.idx_to_token:
+for word in vocab.all_tokens:
print(word)
```
-Contrarily, we can also grab an index given a token using `vocab.token_to_idx`.
+Contrarily, we can also grab an index given a token using `__getitem__` or `vocab.token_to_idx`.
```{.python .input}
-print(vocab.token_to_idx[""])
+print(vocab[""])
print(vocab.token_to_idx["world"])
```
-In Gluon NLP, for each word, there are three representations: the index of where it occurred in the original input (idx), the embedding (or vector/vec), and the token (the actual word). At any point, we may use any of the following methods to switch between the three representations: `idx_to_vec`, `idx_to_token`, `token_to_idx`.
-### Attaching word embeddings
+### Load word embeddings
-Our next step will be to attach word embeddings to the words indexed by `vocab`.
+Our next step will be to load word embeddings for a given `vocab`.
In this example, we'll use *fastText* embeddings trained on the *wiki.simple* dataset.
-First, we'll want to create a word embedding instance by calling `nlp.embedding.create`,
-specifying the embedding type `fasttext` (an unnamed argument) and the source `source='wiki.simple'` (the named argument).
-
-```{.python .input}
-fasttext_simple = nlp.embedding.create('fasttext', source='wiki.simple')
-```
-
-To attach the newly loaded word embeddings `fasttext_simple` to indexed words in `vocab`, we can simply call vocab's `set_embedding` method:
```{.python .input}
-vocab.set_embedding(fasttext_simple)
+matrix = nlp.embedding.load_embeddings(vocab, 'wiki.simple')
```
To see other available sources of pretrained word embeddings using the *fastText* algorithm,
-we can call `text.embedding.list_sources`.
+we can call `nlp.embedding.list_sources`.
```{.python .input}
nlp.embedding.list_sources('fasttext')[:5]
```
-The created vocabulary `vocab` includes four different words and a special
+The created vocabulary `vocab` includes five different words and a special
unknown token. Let us check the size of `vocab`.
```{.python .input}
len(vocab)
```
-By default, the vector of any token that is unknown to `vocab` is a zero vector.
+By default, the vector of any token that is unknown to `vocab` is the vector of `vocab.unk_token`.
+Its length is equal to the vector dimensions of the fastText word embeddings:
+(300,).
+
+```{.python .input}
+matrix[vocab['beautiful']].shape
+```
+
+Let us check the shape of the embedding of the word 'hello' from `vocab`.
+
+```{.python .input}
+matrix[vocab['hello']].shape
+```
+
+We can access the first five elements of the embedding of 'hello' and see that they are non-zero.
+
+```{.python .input}
+matrix[vocab['hello']][:5]
+```
+
+By default, the vector of any token that is in `vocab` but not in the pre-trained file
+is a vector generated by sampling from a normal distribution
+with the same mean and standard deviation as the pre-trained embedding matrix.
Its length is equal to the vector dimensions of the fastText word embeddings:
(300,).
```{.python .input}
-vocab.embedding['beautiful'].shape
+matrix[vocab['goodgod']].shape
```
-The first five elements of the vector of any unknown token are zeros.
+We can access the first five elements of the embedding of 'goodgod'.
```{.python .input}
-vocab.embedding['beautiful'][:5]
+matrix[vocab['goodgod']][:5]
```
-Let us check the shape of the embedding of the words 'hello' and 'world' from `vocab`.
+You can change how vectors are generated for such tokens by
+specifying `unk_method` in the `load_embeddings` function.
+`unk_method` is a function that receives a `List[str]`
+and returns an embedding matrix (`numpy.ndarray`) for the words not found in the pre-trained file.
+For example,
```{.python .input}
-vocab.embedding['hello', 'world'].shape
+import numpy as np
+
+def simple(words):
+ return np.ones((len(words), 300))
+matrix = nlp.embedding.load_embeddings(vocab, 'wiki.simple', unk_method=simple)
```
-We can access the first five elements of the embedding of 'hello' and 'world' and see that they are non-zero.
+We can access the first five elements of the embedding of 'goodgod' and see that they are ones.
```{.python .input}
-vocab.embedding['hello', 'world'][:, :5]
+matrix[vocab['goodgod']][:5]
+```
+
+Sometimes we need to use `FastText` to compute vectors for out-of-vocabulary (OOV) words.
+For this case, we provide `get_fasttext_model`, which returns a `FastText` model you can use directly.
+
+```{.python .input}
+model = nlp.embedding.get_fasttext_model('wiki.en')
+```
+
+It returns a `fasttext.FastText._FastText` object; you can find more information
+about it at `fasttext.cc`.
+
+Let us check the shape of the embedding of the OOV word 'goodgod'.
+
+```{.python .input}
+model['goodgod'].shape
+```
+
+We can access the first five elements of the embedding of 'goodgod'.
+
+```{.python .input}
+model['goodgod'][:5]
+```
+
+To see other available sources of the `FastText` model,
+we can call `nlp.embedding.list_sources`.
+
+```{.python .input}
+nlp.embedding.list_sources('fasttext.bin')[:5]
```
### Using Pre-trained Word Embeddings in Gluon
-To demonstrate how to use pre-
-trained word embeddings in Gluon, let us first obtain the indices of the words
+To demonstrate how to use pre-trained word embeddings in Gluon, let us first obtain the indices of the words
'hello' and 'world'.
```{.python .input}
@@ -161,14 +209,14 @@ vocab['hello', 'world']
```
We can obtain the vectors for the words 'hello' and 'world' by specifying their
-indices (5 and 4) and the weight or embedding matrix, which we get from calling `vocab.embedding.idx_to_vec` in
-`gluon.nn.Embedding`. We initialize a new layer and set the weights using the layer.weight.set_data method. Subsequently, we pull out the indices 5 and 4 from the weight vector and check their first five entries.
+indices (5 and 4) and the embedding matrix, which we use to set the weights of
+`gluon.nn.Embedding`. We initialize a new layer and set the weights using the `layer.weight.set_data` method. Subsequently, we pull out the indices 5 and 4 from the weight matrix and check their first five entries.
```{.python .input}
-input_dim, output_dim = vocab.embedding.idx_to_vec.shape
+input_dim, output_dim = matrix.shape
layer = gluon.nn.Embedding(input_dim, output_dim)
layer.initialize()
-layer.weight.set_data(vocab.embedding.idx_to_vec)
+layer.weight.set_data(matrix)
layer(nd.array([5, 4]))[:, :5]
```
@@ -183,30 +231,24 @@ nlp.embedding.list_sources('glove')[:5]
```
For simplicity of demonstration, we use a smaller word embedding file, such as
-the 50-dimensional one.
-
-```{.python .input}
-glove_6b50d = nlp.embedding.create('glove', source='glove.6B.50d')
-```
-
-Now we create vocabulary by using all the tokens from `glove_6b50d`.
+the 50-dimensional one.
+Now we create a vocabulary using all the tokens from `glove.6B.50d`.
```{.python .input}
-vocab = nlp.Vocab(nlp.data.Counter(glove_6b50d.idx_to_token))
-vocab.set_embedding(glove_6b50d)
+matrix, vocab = nlp.embedding.load_embeddings(vocab=None, pretrained_name_or_dir='glove.6B.50d')
```
Below shows the size of `vocab` including a special unknown token.
```{.python .input}
-len(vocab.idx_to_token)
+len(vocab)
```
We can access attributes of `vocab`.
```{.python .input}
print(vocab['beautiful'])
-print(vocab.idx_to_token[71424])
+print(vocab.all_tokens[71424])
```
## Applications of Word Embeddings
@@ -215,18 +257,18 @@ To apply word embeddings, we need to define
cosine similarity. Cosine similarity determines the similarity between two vectors.
```{.python .input}
-from mxnet import nd
+import numpy as np
def cos_sim(x, y):
- return nd.dot(x, y) / (nd.norm(x) * nd.norm(y))
+ return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
```
The range of cosine similarity between two vectors can be between -1 and 1. The
larger the value, the larger the similarity between the two vectors.
```{.python .input}
-x = nd.array([1, 2])
-y = nd.array([10, 20])
-z = nd.array([-1, -2])
+x = np.array([1, 2])
+y = np.array([10, 20])
+z = np.array([-1, -2])
print(cos_sim(x, y))
print(cos_sim(x, z))
@@ -245,14 +287,17 @@ We can then find the indices for which the dot product is greatest (`topk`), whi
```{.python .input}
def norm_vecs_by_row(x):
- return x / nd.sqrt(nd.sum(x * x, axis=1) + 1E-10).reshape((-1,1))
-
-def get_knn(vocab, k, word):
- word_vec = vocab.embedding[word].reshape((-1, 1))
- vocab_vecs = norm_vecs_by_row(vocab.embedding.idx_to_vec)
- dot_prod = nd.dot(vocab_vecs, word_vec)
- indices = nd.topk(dot_prod.reshape((len(vocab), )), k=k+1, ret_typ='indices')
- indices = [int(i.asscalar()) for i in indices]
+ return x / np.sqrt(np.sum(x * x, axis=1) + 1E-10).reshape((-1,1))
+
+def topk(res, k):
+ # Indices of the k largest values, ordered from most to least similar.
+ part = np.argpartition(res, -k)[-k:]
+ return part[np.argsort(res[part])].tolist()[::-1]
+
+def get_knn(vocab, matrix, k, word):
+ word_vec = matrix[vocab[word]].reshape((-1, 1))
+ vocab_vecs = norm_vecs_by_row(matrix)
+ dot_prod = np.dot(vocab_vecs, word_vec)
+ indices = topk(dot_prod.reshape((len(vocab), )), k=k+1)
# Remove unknown and input tokens.
return vocab.to_tokens(indices[1:])
```
@@ -261,31 +306,31 @@ Let us find the 5 most similar words to 'baby' from the vocabulary (size:
400,000 words).
```{.python .input}
-get_knn(vocab, 5, 'baby')
+get_knn(vocab, matrix, 5, 'baby')
```
We can verify the cosine similarity of the vectors of 'baby' and 'babies'.
```{.python .input}
-cos_sim(vocab.embedding['baby'], vocab.embedding['babies'])
+cos_sim(matrix[vocab['baby']], matrix[vocab['babies']])
```
Let us find the 5 most similar words to 'computers' from the vocabulary.
```{.python .input}
-get_knn(vocab, 5, 'computers')
+get_knn(vocab, matrix, 5, 'computers')
```
Let us find the 5 most similar words to 'run' from the given vocabulary.
```{.python .input}
-get_knn(vocab, 5, 'run')
+get_knn(vocab, matrix, 5, 'run')
```
Let us find the 5 most similar words to 'beautiful' from the vocabulary.
```{.python .input}
-get_knn(vocab, 5, 'beautiful')
+get_knn(vocab, matrix, 5, 'beautiful')
```
### Word Analogy
@@ -302,48 +347,47 @@ In this example,
we will find words that are analogous from the 400,000 indexed words in `vocab`.
```{.python .input}
-def get_top_k_by_analogy(vocab, k, word1, word2, word3):
- word_vecs = vocab.embedding[word1, word2, word3]
+def get_top_k_by_analogy(vocab, matrix, k, word1, word2, word3):
+ word_vecs = [matrix[vocab[word]] for word in [word1, word2, word3]]
word_diff = (word_vecs[1] - word_vecs[0] + word_vecs[2]).reshape((-1, 1))
- vocab_vecs = norm_vecs_by_row(vocab.embedding.idx_to_vec)
- dot_prod = nd.dot(vocab_vecs, word_diff)
- indices = nd.topk(dot_prod.reshape((len(vocab), )), k=k, ret_typ='indices')
- indices = [int(i.asscalar()) for i in indices]
+ vocab_vecs = norm_vecs_by_row(matrix)
+ dot_prod = np.dot(vocab_vecs, word_diff)
+ indices = topk(dot_prod.reshape((len(vocab), )), k=k)
return vocab.to_tokens(indices)
```
We leverage this method to find the word to complete the analogy 'man : woman :: son :'.
```{.python .input}
-get_top_k_by_analogy(vocab, 1, 'man', 'woman', 'son')
+get_top_k_by_analogy(vocab, matrix, 1, 'man', 'woman', 'son')
```
Let us verify the cosine similarity between vec('son')+vec('woman')-vec('man')
and vec('daughter').
```{.python .input}
-def cos_sim_word_analogy(vocab, word1, word2, word3, word4):
+def cos_sim_word_analogy(vocab, matrix, word1, word2, word3, word4):
words = [word1, word2, word3, word4]
- vecs = vocab.embedding[words]
+ vecs = [matrix[vocab[word]] for word in words]
return cos_sim(vecs[1] - vecs[0] + vecs[2], vecs[3])
-cos_sim_word_analogy(vocab, 'man', 'woman', 'son', 'daughter')
+cos_sim_word_analogy(vocab, matrix, 'man', 'woman', 'son', 'daughter')
```
And to perform some more tests, let's try the following analogy: 'beijing : china :: tokyo : '.
```{.python .input}
-get_top_k_by_analogy(vocab, 1, 'beijing', 'china', 'tokyo')
+get_top_k_by_analogy(vocab, matrix, 1, 'beijing', 'china', 'tokyo')
```
And another word analogy: 'bad : worst :: big : '.
```{.python .input}
-get_top_k_by_analogy(vocab, 1, 'bad', 'worst', 'big')
+get_top_k_by_analogy(vocab, matrix, 1, 'bad', 'worst', 'big')
```
And the last analogy: 'do : did :: go :'.
```{.python .input}
-get_top_k_by_analogy(vocab, 1, 'do', 'did', 'go')
+get_top_k_by_analogy(vocab, matrix, 1, 'do', 'did', 'go')
```
diff --git a/docs/examples/word_embedding/word_embedding_training.md b/docs/examples/word_embedding/word_embedding_training.md
deleted file mode 100644
index 819d239089..0000000000
--- a/docs/examples/word_embedding/word_embedding_training.md
+++ /dev/null
@@ -1,381 +0,0 @@
-# Word Embeddings Training and Evaluation
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-
-import itertools
-import time
-import math
-import logging
-import random
-
-import mxnet as mx
-import gluonnlp as nlp
-import numpy as np
-from scipy import stats
-
-nlp.utils.check_version('0.7.0')
-
-# context = mx.cpu() # Enable this to run on CPU
-context = mx.gpu(0) # Enable this to run on GPU
-```
-
-## Data
-Here we use the Text8 corpus from the [Large Text Compression
-Benchmark](http://mattmahoney.net/dc/textdata.html) which includes the first
-100
-MB of cleaned text from Wikipedia in English.
-
-```{.python .input}
-text8 = nlp.data.Text8()
-print('# sentences:', len(text8))
-for sentence in text8[:3]:
- print('# tokens:', len(sentence), sentence[:5])
-```
-
-Given the tokenized data, we first count all tokens and then construct a
-vocabulary of all tokens that occur at least 5 times in the dataset. The
-vocabulary contains a one-to-one mapping between tokens and integers (also
-called indices or idx for short).
-
-Furthermore, we can store the frequency count of each
-token in the vocabulary as we will require this information later on for
-sampling random negative (or noise) words. Finally, we replace all tokens with
-their integer representation based on the vocabulary.
-
-```{.python .input}
-counter = nlp.data.count_tokens(itertools.chain.from_iterable(text8))
-vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,
- bos_token=None, eos_token=None, min_freq=5)
-idx_to_counts = [counter[w] for w in vocab.idx_to_token]
-
-def code(sentence):
- return [vocab[token] for token in sentence if token in vocab]
-
-text8 = text8.transform(code, lazy=False)
-
-print('# sentences:', len(text8))
-for sentence in text8[:3]:
- print('# tokens:', len(sentence), sentence[:5])
-```
-
-Next we need to transform the coded Text8 dataset into batches that are more useful for
-training an embedding model.
-
-In this tutorial we train leveraging the SkipGram
-objective made popular by the following: [1].
-
-For SkipGram, we sample pairs of co-occurring
-words from the corpus.
-Two words are said to co-occur if they occur with
-distance less than a specified *window* size.
-The *window* size is usually
-chosen around 5. Refer to the aforementioned paper for more details.
-
-To obtain the samples from the corpus, we can shuffle the
-sentences and then proceed linearly through each sentence, considering each word
-as well as all the words in its window. In this case, we call the current word
-in focus the center word, and the words in its window, the context words.
-GluonNLP contains `gluonnlp.data.EmbeddingCenterContextBatchify` batchify
-transformation, that takes a corpus, such as the coded Text8 we have here, and
-returns a `DataStream` of batches of center and context words.
-
-To obtain good
-results, each sentence is further subsampled, meaning that words are deleted
-with a probability proportional to their frequency.
-[1] proposes to discard
-individual occurrences of words from the dataset with probability
-
-$$P(w_i) = 1 -
-\sqrt{\frac{t}{f(w_i)}}$$
-
-where $f(w_i)$ is the frequency with which a word is
-observed in a dataset and $t$ is a subsampling constant typically chosen around
-$10^{-5}$.
-[1] has also shown that the final performance is improved if the
-window size is chosen uniformly random for each center words out of the range
-[1, *window*].
-
-For this notebook, we are interested in training a fastText
-embedding model [2]. A fastText model not only associates an embedding vector with
-each token in the vocabulary, but also with a pre-specified number of subwords.
-Commonly 2 million subword vectors are obtained and each subword vector is
-associated with zero, one, or multiple character-ngrams. The mapping between
-character-ngrams and subwords is based on a hash function.
-The *final* embedding
-vector of a token is the mean of the vectors associated with the token and all
-character-ngrams occurring in the string representation of the token. Thereby a
-fastText embedding model can compute meaningful embedding vectors for tokens
-that were not seen during training.
-
-For this notebook, we have prepared a helper function `transform_data_fasttext`
-which builds a series of transformations of the `text8 Dataset` created above,
-applying the techniques we mention briefly above. It returns a `DataStream` over batches as
-well as a `batchify_fn` function that applied to a batch looks up and includes the
-fastText subwords associated with the center words. Additionally, it returns the subword
-function which can be used to obtain the subwords of a given string
-representation of a token. We will take a closer look at the subword function
-farther on.
-
-You can find the `transform_data_fasttext()` function in `data.py` in the
-archive that can be downloaded via the `Download` button at the top of this page.
-
-```{.python .input}
-from data import transform_data_fasttext
-
-batch_size=4096
-data = nlp.data.SimpleDataStream([text8]) # input is a stream of datasets, here just 1. Allows scaling to larger corpora that don't fit in memory
-data, batchify_fn, subword_function = transform_data_fasttext(
- data, vocab, idx_to_counts, cbow=False, ngrams=[3,4,5,6], ngram_buckets=100000, batch_size=batch_size, window_size=5)
-```
-
-```{.python .input}
-batches = data.transform(batchify_fn)
-```
-
-Note that the number of subwords is potentially
-different for every word. Therefore the batchify_fn represents a word with its
-subwords as a row in a compressed sparse row (CSR) matrix. For more information on CSR matrices click here:
-https://mxnet.incubator.apache.org/tutorials/sparse/csr.html
-
-Separating the batchify_fn from the previous word-pair
-sampling is useful, as it allows parallelization of the CSR matrix construction over
-multiple CPU cores for separate batches.
-
-## Subwords
-
-`GluonNLP` provides the concept of a subword function which maps
-words to a list of indices representing their subword.
-Possible subword functions
-include mapping a word to the sequence of it's characters/bytes or hashes of all
-its ngrams.
-
-FastText models use a hash function to map each ngram of a word to
-a number in range `[0, num_subwords)`. We include the same hash function.
-Above
-`transform_data_fasttext` has also returned a `subword_function` object. Let's try it with
-a few words:
-
-```{.python .input}
-idx_to_subwordidxs = subword_function(vocab.idx_to_token)
-for word, subwords in zip(vocab.idx_to_token[:3], idx_to_subwordidxs[:3]):
- print('<'+word+'>', subwords, sep = '\t')
-```
-
-## Model
-
-Here we define a SkipGram model for training fastText embeddings.
-For
-Skip-Gram, the model consists of two independent embedding networks.
-One for the
-center words, and one for the context words.
-For center words, subwords are
-taken into account while for context words only the token itself is taken into
-account.
-
-GluonNLP provides an `nlp.model.train.FasttextEmbeddingModel` block
-which defines the fastText style embedding with subword support.
-It can be used
-for training, but also supports loading models trained with the original C++
-fastText library from `.bin` files.
-After training, vectors for arbitrary words
-can be looked up via `embedding[['a', 'list', 'of', 'potentially', 'unknown',
-'words']]` where `embedding` is an `nlp.model.train.FasttextEmbeddingModel`.
-
-In
-the `model.py` script we provide a definition for the fastText model for the
-SkipGram objective.
-The model definition is a Gluon HybridBlock, meaning that
-the complete forward / backward pass are compiled and executed directly in the
-MXNet backend. Not only does the block include the `FasttextEmbeddingModel` for
-the center words and a simple embedding matrix for the context words, but it
-also takes care of sampling a specified number of noise words for each center-
-context pair. These noise words are called negatives, as the resulting center-
-negative pair is unlikely to occur in the dataset. The model then must learn
-which word-pairs are negatives and which ones are real. Thereby it obtains
-meaningful word and subword vectors for all considered tokens. The negatives are
-sampled from the smoothed unigram frequency distribution.
-
-Let's instantiate and
-initialize the model. We also create a trainer object for updating the
-parameters with AdaGrad.
-Finally we print a summary of the model.
-
-```{.python .input}
-from model import SG as SkipGramNet
-
-emsize = 300
-num_negatives = 5
-
-negatives_weights = mx.nd.array(idx_to_counts)
-embedding = SkipGramNet(
- vocab.token_to_idx, emsize, batch_size, negatives_weights, subword_function, num_negatives=5, smoothing=0.75)
-embedding.initialize(ctx=context)
-embedding.hybridize()
-trainer = mx.gluon.Trainer(embedding.collect_params(), 'adagrad', dict(learning_rate=0.05))
-
-print(embedding)
-```
-
-Let's take a look at the documentation of the forward pass.
-
-```{.python .input}
-print(SkipGramNet.hybrid_forward.__doc__)
-```
-
-Before we start training, let's examine the quality of our randomly initialized
-embeddings:
-
-```{.python .input}
-def norm_vecs_by_row(x):
- return x / (mx.nd.sum(x * x, axis=1) + 1e-10).sqrt().reshape((-1, 1))
-
-
-def get_k_closest_tokens(vocab, embedding, k, word):
- word_vec = norm_vecs_by_row(embedding[[word]])
- vocab_vecs = norm_vecs_by_row(embedding[vocab.idx_to_token])
- dot_prod = mx.nd.dot(vocab_vecs, word_vec.T)
- indices = mx.nd.topk(
- dot_prod.reshape((len(vocab.idx_to_token), )),
- k=k + 1,
- ret_typ='indices')
- indices = [int(i.asscalar()) for i in indices]
- result = [vocab.idx_to_token[i] for i in indices[1:]]
- print('closest tokens to "%s": %s' % (word, ", ".join(result)))
-```
-
-```{.python .input}
-example_token = "vector"
-get_k_closest_tokens(vocab, embedding, 10, example_token)
-```
-
-We can see that in the randomly initialized fastText model the closest tokens to
-"vector" are based on overlapping ngrams.
-
-## Training
-
-Thanks to the Gluon data pipeline and the HybridBlock handling all
-complexity, our training code is very simple.
-We iterate over all batches, move
-them to the appropriate context (GPU), do forward, backward, and parameter update
-and finally include some helpful print statements for following the training
-process.
-
-```{.python .input}
-log_interval = 500
-
-def train_embedding(num_epochs):
- for epoch in range(1, num_epochs + 1):
- start_time = time.time()
- l_avg = 0
- log_wc = 0
-
- print('Beginnign epoch %d and resampling data.' % epoch)
- for i, batch in enumerate(batches):
- batch = [array.as_in_context(context) for array in batch]
- with mx.autograd.record():
- l = embedding(*batch)
- l.backward()
- trainer.step(1)
-
- l_avg += l.mean()
- log_wc += l.shape[0]
- if i % log_interval == 0:
- mx.nd.waitall()
- wps = log_wc / (time.time() - start_time)
- l_avg = l_avg.asscalar() / log_interval
- print('epoch %d, iteration %d, loss %.2f, throughput=%.2fK wps'
- % (epoch, i, l_avg, wps / 1000))
- start_time = time.time()
- log_wc = 0
- l_avg = 0
-
- get_k_closest_tokens(vocab, embedding, 10, example_token)
- print("")
-```
-
-```{.python .input}
-train_embedding(num_epochs=1)
-```
-
-## Word Similarity and Relatedness Task
-
-Word embeddings should capture the
-relationship between words in natural language.
-In the Word Similarity and
-Relatedness Task, word embeddings are evaluated by comparing word similarity
-scores computed from a pair of words with human labels for the similarity or
-relatedness of the pair.
-
-`GluonNLP` includes a number of common datasets for
-the Word Similarity and Relatedness Task. The included datasets are listed in
-the [API documentation](http://gluon-nlp.mxnet.io/api/data.html#word-embedding-evaluation-datasets). We use several of them in the evaluation example below.
-We first show a few samples from the WordSim353 dataset, to get an overall
-feeling of the Dataset structure.
-
-## Evaluation
-
-Thanks to the subword support of the `FasttextEmbeddingModel` we
-can evaluate on all words in the evaluation dataset,
-not only on the ones that we
-observed during training.
-
-We first compute a list of tokens in our evaluation
-dataset and then create an embedding matrix for them based on the fastText model.
-
-```{.python .input}
-rw = nlp.data.RareWords()
-rw_tokens = list(set(itertools.chain.from_iterable((d[0], d[1]) for d in rw)))
-
-rw_token_embedding = nlp.embedding.TokenEmbedding(unknown_token=None, allow_extend=True)
-rw_token_embedding[rw_tokens]= embedding[rw_tokens]
-
-print('There are', len(rw_tokens), 'unique tokens in the RareWords dataset. Examples are:')
-for i in range(5):
- print('\t', rw[i])
-print('The imputed TokenEmbedding has shape', rw_token_embedding.idx_to_vec.shape)
-```
-
-```{.python .input}
-evaluator = nlp.embedding.evaluation.WordEmbeddingSimilarity(
- idx_to_vec=rw_token_embedding.idx_to_vec,
- similarity_function="CosineSimilarity")
-evaluator.initialize(ctx=context)
-evaluator.hybridize()
-```
-
-```{.python .input}
-words1, words2, scores = zip(*([rw_token_embedding.token_to_idx[d[0]],
- rw_token_embedding.token_to_idx[d[1]],
- d[2]] for d in rw))
-words1 = mx.nd.array(words1, ctx=context)
-words2 = mx.nd.array(words2, ctx=context)
-```
-
-```{.python .input}
-pred_similarity = evaluator(words1, words2)
-sr = stats.spearmanr(pred_similarity.asnumpy(), np.array(scores))
-print('Spearman rank correlation on {} pairs of {}: {}'.format(
- len(words1), rw.__class__.__name__, sr.correlation.round(3)))
-```
-
-## Further information
-
-For further information and examples on training and
-evaluating word embeddings with GluonNLP take a look at the Word Embedding
-section on the Scripts / Model Zoo page. There you will find more thorough
-evaluation techniques and other embedding models. In fact, the `data.py` and
-`model.py` files used in this example are the same as the ones used in the
-script.
-
-## References
-
-- [1] Mikolov, Tomas, et al. “Distributed representations of words and phrases
-and their compositionally.”
- Advances in neural information processing
-systems. 2013.
-
-
-- [2] Bojanowski et al., "Enriching Word Vectors with Subword
-Information" Transactions of the Association for Computational Linguistics 2017
diff --git a/docs/index.rst b/docs/index.rst
index cb7f30af41..c3d225a957 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -79,5 +79,5 @@ You may find the 60-min Gluon crash course linked from there especially helpful.
model_zoo/index
examples/index
api/index
- community/index
+ website/index
genindex
diff --git a/docs/md2ipynb.py b/docs/md2ipynb.py
index 3dfa91959b..ef9edf475a 100644
--- a/docs/md2ipynb.py
+++ b/docs/md2ipynb.py
@@ -1,22 +1,26 @@
+import argparse
import os
-import sys
import time
-import notedown
+
import nbformat
+import notedown
-assert len(sys.argv) == 2, 'usage: input.md'
+parser = argparse.ArgumentParser(description='Convert md file to ipynb files.')
+parser.add_argument('input', help='input.md', type=str)
+parser.add_argument('-d', '--disable_compute',
+ help='Disable computing python scripts', action="store_true")
+args = parser.parse_args()
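+
+# Example invocation (the path below is just an illustration):
+#   python3 md2ipynb.py examples/word_embedding/word_embedding.md -d
+# The -d/--disable_compute flag converts the notebook without executing its code cells.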
# timeout for each notebook, in sec
-timeout = 40 * 60
+timeout = 90 * 60
# the files will be ignored for execution
ignore_execution = []
-input_path = sys.argv[1]
-
# Change working directory to directory of input file
-input_dir, input_fn = os.path.split(input_path)
-os.chdir(input_dir)
+input_dir, input_fn = os.path.split(args.input)
+if input_dir:
+ os.chdir(input_dir)
output_fn = '.'.join(input_fn.split('.')[:-1] + ['ipynb'])
@@ -28,8 +32,9 @@
if not any([i in input_fn for i in ignore_execution]):
tic = time.time()
- notedown.run(notebook, timeout)
- print('=== Finished evaluation in %f sec'%(time.time()-tic))
+ if not args.disable_compute:
+ notedown.run(notebook, timeout)
+ print('=== Finished evaluation in %f sec' % (time.time() - tic))
# write
# need to add language info to for syntax highlight
diff --git a/docs/model_zoo.rst b/docs/model_zoo.rst
index 8dd1d9f81f..249d128d6f 100644
--- a/docs/model_zoo.rst
+++ b/docs/model_zoo.rst
@@ -3,74 +3,9 @@ Model Zoo
.. container:: cards
- .. card::
- :title: Word Embedding
- :link: model_zoo/word_embeddings/index.html
-
- Mapping words to vectors.
-
- .. card::
- :title: Language Modeling
- :link: model_zoo/language_model/index.html
-
- Learning the distribution and representation of sequences of words.
-
.. card::
:title: Machine Translation
:link: model_zoo/machine_translation/index.html
From "Hello" to "Bonjour".
- .. card::
- :title: Text Classification
- :link: model_zoo/text_classification/index.html
-
- Categorize texts and documents.
-
- .. card::
- :title: Sentiment Analysis
- :link: model_zoo/sentiment_analysis/index.html
-
- Classifying polarity of emotions and opinions.
-
- .. card::
- :title: Parsing
- :link: model_zoo/parsing/index.html
-
- Dependency parsing.
-
- .. card::
- :title: Natural Language Inference
- :link: model_zoo/natural_language_inference/index.html
-
- Determine if the premise semantically entails the hypothesis.
-
- .. card::
- :title: Text Generation
- :link: model_zoo/text_generation/index.html
-
- Generating language from models.
-
- .. card::
- :title: BERT
- :link: model_zoo/bert/index.html
-
- Transferring pre-trained language representations to language understanding tasks.
-
- .. card::
- :title: Named Entity Recognition
- :link: model_zoo/ner/index.html
-
- Locating and classifying named entity mentioned in unstructured texts.
-
- .. card::
- :title: Intent Classification and Slot Labeling
- :link: model_zoo/intent_cls_slot_labeling/index.html
-
- Predicting the intent of the query and extracting semantic concepts in the query.
-
- .. card::
- :title: Model Conversion
- :link: model_zoo/conversion_tools/index.html
-
- Converting NLP models from other frameworks to GluonNLP.
diff --git a/docs/website/configuration.rst b/docs/website/configuration.rst
new file mode 100644
index 0000000000..3e63dae430
--- /dev/null
+++ b/docs/website/configuration.rst
@@ -0,0 +1,74 @@
+Preview GluonNLP Website Locally
+-----------------------------------------------------------------
+
+The GluonNLP docs website is at `release branch `__, or `master branch `__. Its source code is at `gluon-nlp `__.
+
+Currently the GluonNLP website is built automatically from the source code by CI. In this guide I will cover:
+
+- the structure of files used for the website, and
+- how to make changes to the website and preview the website
+
+Website Structure
+~~~~~~~~~~~~~~~~~
+
+Currently the docs contain four sections: Model Zoo, Examples, API and Community. Note that the Model Zoo section is a link that redirects to the ``scripts`` folder in the parent directory; the other three folders are used exclusively by the docs website. In addition, the sections are built from different combinations of ``rst``, ``py`` and ``md`` files, so when you work on a section you should be aware of which file types it uses and handle them accordingly.
+
+The main structure, the index file of the entire website, is written in ``rst`` format. It calls the index file of each different section separately. Before compiling the website, you should be aware that:
+
+- ``rst`` files are static: they are rendered directly into the website with additional styling;
+- ``md`` files are executable: the python code in these files is run and the results are stored in ``ipynb`` files, which are then converted into website pages.
+
+More specifically, the files in the examples folder are executed and converted into intermediate files before the final HTML files are written, while files in the other folders need no further conversion or computation.
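+
+As an illustration, a single tutorial page can be converted by hand with the ``md2ipynb.py`` helper in the ``docs`` folder; pass ``-d`` to skip executing the python code blocks (the tutorial path below is only an example):
+
+.. code:: bash
+
+ python3 md2ipynb.py examples/word_embedding/word_embedding.md -d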
+
+Environment Configuration Instruction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Next, I will give step-by-step instructions on how to build the website from scratch.
+
+1. Preview website without displaying python output
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Use the command from https://github.com/dmlc/gluon-nlp/blob/master/docs/README.txt to install the necessary packages.
+
+.. code:: bash
+
+ pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark
+
+Then use the command below to build the website locally; all ``python`` scripts are skipped, so the ``python`` code blocks produce no output:
+
+.. code:: bash
+
+ make docs_local MD2IPYNB_OPTION=-d
+
+After a successful build, you will get the full HTML output of the website.
+
+2. Preview website with python output
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For this task, we recommend a ``g4dn.xlarge`` instance on Amazon EC2. For convenience, you can search for *deep learning* in the filter bar to select a deep learning AMI, so that you do not need to install additional drivers.
+
+After you have launched the instance and logged in, install the required packages with the commands below:
+
+.. code:: bash
+
+ git clone https://github.com/dmlc/gluon-nlp
+ cd gluon-nlp
+ pip3 install --user -e '.[extras,dev]'
+
+If necessary, you may also need to download some additional NLTK data.
+
+Start a ``python3`` interpreter and run the commands below to download the required NLTK resources:
+
+.. code:: python
+
+ import nltk
+ nltk.download('perluniprops')
+ nltk.download('nonbreaking_prefixes')
+ nltk.download('punkt')
+
+At this point, you should have all the necessary packages installed. Use the command below to build and preview the website locally with all the python output:
+
+.. code:: bash
+
+ make docs_local
+
diff --git a/docs/community/contribute.rst b/docs/website/contribute.rst
similarity index 100%
rename from docs/community/contribute.rst
rename to docs/website/contribute.rst
diff --git a/docs/community/git.rst b/docs/website/git.rst
similarity index 100%
rename from docs/community/git.rst
rename to docs/website/git.rst
diff --git a/docs/community/index.rst b/docs/website/index.rst
similarity index 88%
rename from docs/community/index.rst
rename to docs/website/index.rst
index f9b1627e02..d5313d8f06 100644
--- a/docs/community/index.rst
+++ b/docs/website/index.rst
@@ -30,7 +30,7 @@ Community
:title: GluonNLP Slack Channel
:link: https://apache-mxnet.slack.com/messages/CCCDM10V9
- #gluon-nlp Slack channel. Click the `sign-up link `_ to register.
+ #gluon-nlp Slack channel. Click the `sign-up link `_ to register.
.. card::
@@ -55,3 +55,4 @@ Interested in contributing to GluonNLP? Check our contribution guide:
contribute
git
release
+ configuration
\ No newline at end of file
diff --git a/docs/community/release.rst b/docs/website/release.rst
similarity index 100%
rename from docs/community/release.rst
rename to docs/website/release.rst
diff --git a/env/cpu/py3-master.yml b/env/cpu/py3-master.yml
deleted file mode 100644
index 15d49c49fb..0000000000
--- a/env/cpu/py3-master.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-channels:
- - conda-forge
-dependencies:
- - python=3.5
- - pip
- - perl
- - pandoc=1.19.2
- - tornado=5.1.1
- - sphinx=2.2.1
- # In the -master pipeline, we test without numba. Numba is an optional
- # dependency and GluonNLP needs to work both with and without numba installed.
- - pip:
- - numpy==1.17.4
- - notedown==1.5.1
- - sphinx-gallery==0.4.0
- - recommonmark==0.6.0
- - nbconvert==5.6.1
- - nbsphinx>=0.3.4,<0.4
- - ipython
- - ipykernel
- - https://github.com/szha/mx-theme/tarball/master
- - seaborn
- - jieba
- - cython
- - boto3
- - pytype==2019.10.17
- - pytest==5.3.2
- - pytest-env==0.6.2
- - pytest-cov==2.8.1
- - pytest-xdist==1.31.0
- - pylint==2.4.4
- - pylint-quotes==0.2.1
- - flaky==3.6.1
- - flake8==3.7.9
- - mock<3
- - https://repo.mxnet.io/dist/python/cpu/mxnet-1.6.0-py2.py3-none-manylinux1_x86_64.whl
- - scipy==1.3.2
- - regex==2019.11.1
- - nltk==3.4.5
- - sacremoses==0.0.35
- - spacy==2.2.2
- - sentencepiece==0.1.83
- - sphinx-autodoc-typehints==1.7.0
diff --git a/env/cpu/py3.yml b/env/cpu/py3.yml
deleted file mode 100644
index 77a649b07c..0000000000
--- a/env/cpu/py3.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-channels:
- - conda-forge
-dependencies:
- - python=3.5
- - pip
- - perl
- - pandoc=1.19.2
- - tornado=5.1.1
- - sphinx=2.2.1
- - pip:
- - numpy==1.17.4
- - notedown==1.5.1
- - sphinx-gallery==0.4.0
- - recommonmark==0.6.0
- - nbconvert==5.6.1
- - nbsphinx>=0.3.4,<0.4
- - ipython
- - ipykernel
- - numba==0.47
- - https://github.com/szha/mx-theme/tarball/master
- - seaborn
- - jieba
- - cython
- - boto3
- - pytype==2019.10.17
- - pytest==5.3.2
- - pytest-env==0.6.2
- - pytest-cov==2.8.1
- - pytest-xdist==1.31.0
- - pylint==2.4.4
- - pylint-quotes==0.2.1
- - flaky==3.6.1
- - flake8==3.7.9
- - mock<3
- - https://lausen-public.s3.amazonaws.com/mxnet_cu100-1.6.0b20200125-py2.py3-none-manylinux1_x86_64.whl
- - scipy==1.3.2
- - regex==2019.11.1
- - nltk==3.4.5
- - sacremoses==0.0.35
- - spacy==2.2.2
- - sentencepiece==0.1.83
- - sphinx-autodoc-typehints==1.7.0
diff --git a/env/docker/py3.yml b/env/docker/py3.yml
deleted file mode 100644
index 2c8b532186..0000000000
--- a/env/docker/py3.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-channels:
- - conda-forge
-dependencies:
- - python=3.5
- - pip
- - perl
- - pandoc=1.19.2
- - tornado=5.1.1
- - sphinx=2.2.1
- - pip:
- - numpy==1.17.4
- - notedown==1.5.1
- - sphinx-gallery==0.4.0
- - recommonmark==0.6.0
- - nbconvert==5.6.1
- - nbsphinx>=0.3.4,<0.4
- - ipython
- - ipykernel
- - numba==0.47
- - https://github.com/szha/mx-theme/tarball/master
- - seaborn
- - jieba
- - scikit-learn==0.21.3
- - cython
- - pytype==2019.10.17
- - pytest==5.2.3
- - pytest-env==0.6.2
- - pytest-cov==2.8.1
- - pytest-xdist==1.30.0
- - pylint==2.4.4
- - pylint-quotes==0.2.1
- - flaky==3.6.1
- - flake8==3.7.9
- - mock<3
- - https://lausen-public.s3.amazonaws.com/mxnet_cu100-1.6.0b20200125-py2.py3-none-manylinux1_x86_64.whl
- - scipy==1.3.2
- - regex==2019.11.1
- - nltk==3.4.5
- - sacremoses==0.0.35
- - spacy==2.2.2
- - sentencepiece==0.1.83
- - sphinx-autodoc-typehints==1.7.0
diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml
deleted file mode 100644
index 593614b587..0000000000
--- a/env/gpu/py3-master.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-channels:
- - conda-forge
-dependencies:
- - python=3.5
- - pip
- - perl
- - pandoc=1.19.2
- - tornado=5.1.1
- - sphinx=2.2.1
- # In the -master pipeline, we test without numba. Numba is an optional
- # dependency and GluonNLP needs to work both with and without numba installed.
- - pip:
- - numpy==1.17.4
- - notedown==1.5.1
- - sphinx-gallery==0.4.0
- - recommonmark==0.6.0
- - nbconvert==5.6.1
- - nbsphinx>=0.3.4,<0.4
- - ipython
- - ipykernel
- - https://github.com/szha/mx-theme/tarball/master
- - seaborn
- - jieba
- - cython
- - boto3
- - pytype==2019.10.17
- - pytest==5.3.2
- - pytest-env==0.6.2
- - pytest-cov==2.8.1
- - pytest-xdist==1.31.0
- - pylint==2.4.4
- - pylint-quotes==0.2.1
- - flaky==3.6.1
- - flake8==3.7.9
- - mock<3
- - https://repo.mxnet.io/dist/python/cu100/mxnet_cu100-1.6.0-py2.py3-none-manylinux1_x86_64.whl
- - scipy==1.3.2
- - regex==2019.11.1
- - nltk==3.4.5
- - sacremoses==0.0.35
- - spacy==2.2.2
- - sentencepiece==0.1.83
- - sphinx-autodoc-typehints==1.7.0
- - seqeval
diff --git a/env/gpu/py3.yml b/env/gpu/py3.yml
deleted file mode 100644
index 1ed92f3fa5..0000000000
--- a/env/gpu/py3.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-channels:
- - conda-forge
-dependencies:
- - python=3.5
- - pip
- - perl
- - pandoc=1.19.2
- - tornado=5.1.1
- - sphinx=2.2.1
- - pip:
- - numpy==1.17.4
- - notedown==1.5.1
- - sphinx-gallery==0.4.0
- - recommonmark==0.6.0
- - nbconvert==5.6.1
- - nbsphinx>=0.3.4,<0.4
- - ipython
- - ipykernel
- - numba==0.47
- - https://github.com/szha/mx-theme/tarball/master
- - seaborn
- - jieba
- - cython
- - boto3
- - pytype==2019.10.17
- - pytest==5.3.2
- - pytest-env==0.6.2
- - pytest-cov==2.8.1
- - pytest-xdist==1.31.0
- - pylint==2.4.4
- - pylint-quotes==0.2.1
- - flaky==3.6.1
- - flake8==3.7.9
- - mock<3
- - https://lausen-public.s3.amazonaws.com/mxnet_cu100-1.6.0b20200125-py2.py3-none-manylinux1_x86_64.whl
- - scipy==1.3.2
- - regex==2019.11.1
- - nltk==3.4.5
- - sacremoses==0.0.35
- - spacy==2.2.2
- - sentencepiece==0.1.83
- - sphinx-autodoc-typehints==1.7.0
- - seqeval
diff --git a/examples b/examples
deleted file mode 120000
index 6c33de9655..0000000000
--- a/examples
+++ /dev/null
@@ -1 +0,0 @@
-docs/examples
\ No newline at end of file
diff --git a/mms/README.rst b/mms/README.rst
deleted file mode 100644
index fe8309fa19..0000000000
--- a/mms/README.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-Multi-model-server example
-==========================
-
-https://github.com/awslabs/multi-model-server/
-
-Assuming you are located in the root of the GluonNLP repo, you can run this
-example via:
-
-```
-pip install --user multi-model-server
-curl https://dist-bert.s3.amazonaws.com/demo/finetune/sst.params -o mms/sst.params
-~/.local/bin/model-archiver --model-name bert_sst --model-path mms --handler bert:handle --runtime python --export-path /tmp
-~/.local/bin/multi-model-server --start --models bert_sst.mar --model-store /tmp
-curl -X POST http://127.0.0.1:8080/bert_sst/predict -F 'data=["Positive sentiment", "Negative sentiment"]'
-```
-
-
diff --git a/mms/bert.py b/mms/bert.py
deleted file mode 100644
index fedb54632e..0000000000
--- a/mms/bert.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import json
-import logging
-
-import mxnet as mx
-import gluonnlp as nlp
-
-
-class BertHandler:
- """GluonNLP based Bert Handler"""
-
- def __init__(self):
- self.error = None
- self._context = None
- self.initialized = False
-
- def initialize(self, context):
- """
- Initialize model. This will be called during model loading time
- :param context: Initial context contains model server system properties.
- :return:
- """
- self._context = context
- gpu_id = context.system_properties["gpu_id"]
- self._mx_ctx = mx.cpu() if gpu_id is None else mx.gpu(gpu_id)
- bert, vocab = nlp.model.get_model('bert_12_768_12',
- dataset_name='book_corpus_wiki_en_uncased',
- pretrained=False, ctx=self._mx_ctx, use_pooler=True,
- use_decoder=False, use_classifier=False)
- tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)
- self.sentence_transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=128,
- vocab=vocab, pad=True, pair=False)
- self.batchify = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]), # input
- nlp.data.batchify.Stack(), # length
- nlp.data.batchify.Pad(axis=0, pad_val=0)) # segment
- # Set dropout to non-zero, to match pretrained model parameter names
- self.net = nlp.model.BERTClassifier(bert, dropout=0.1)
- self.net.load_parameters('sst.params', self._mx_ctx)
- self.net.hybridize()
-
- self.initialized = True
-
- def handle(self, batch, context):
- # we're just faking batch_size==1 but allow dynamic batch size. Ie the
- # actual batch size is the len of the first element.
- try:
- assert len(batch) == 1
- batch = json.loads(batch[0]["data"].decode('utf-8'))
- except (json.JSONDecodeError, KeyError, AssertionError) as e:
- print('call like: curl -X POST http://127.0.0.1:8080/bert_sst/predict '
- '-F \'data=["sentence 1", "sentence 2"]\'')
- raise e
- model_input = self.batchify([self.sentence_transform(sentence) for sentence in batch])
-
- inputs, valid_length, token_types = [arr.as_in_context(self._mx_ctx) for arr in model_input]
- inference_output = self.net(inputs, token_types, valid_length.astype('float32'))
- inference_output = inference_output.as_in_context(mx.cpu())
-
- return [mx.nd.softmax(inference_output).argmax(axis=1).astype('int').asnumpy().tolist()]
-
-
-_service = BertHandler()
-
-
-def handle(data, context):
- if not _service.initialized:
- _service.initialize(context)
-
- if data is None:
- return None
-
- return _service.handle(data, context)
diff --git a/pytest.ini b/pytest.ini
index 44ce6ae88a..768474a0a1 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -6,13 +6,3 @@ markers =
gpu: mark a test that requires GPU.
integration: mark an integration test
skip_master: mark a test that is temporarily skipped for mxnet master validation.
- py3_only: mark a test that is intended for a python3-only feature.
-
-env =
- MXNET_HOME=tests/data
-
-filterwarnings =
- error
- # ignore warning about package resolution using __spec__ or __package__
- # can't reproduce locally
- ignore:.*can't resolve package from __spec__ or __package__.*:ImportWarning
\ No newline at end of file
diff --git a/scripts/__init__.py b/scripts/__init__.py
index dddd18e45d..e69de29bb2 100644
--- a/scripts/__init__.py
+++ b/scripts/__init__.py
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""NLP examples."""
diff --git a/scripts/benchmarks/README.md b/scripts/benchmarks/README.md
new file mode 100644
index 0000000000..097d0fe03c
--- /dev/null
+++ b/scripts/benchmarks/README.md
@@ -0,0 +1,45 @@
+# Benchmarking the Performance of NLP Backbones
+
+We benchmark the latency and peak memory usage of a single training (forward + backward) and inference (forward-only) step
+of the NLP backbones.
+For comparison, we also report the numbers for the corresponding models in HuggingFace Transformers.
+
+## Backbones in HuggingFace
+
+We use the [huggingface benchmark](https://github.com/huggingface/transformers/tree/master/examples/benchmarking)
+to benchmark the training + inference speed of common workloads in NLP.
+
+```bash
+python3 -m pip install -U -r requirements.txt --user
+python3 benchmark_hf.py
+```
+
+It will generate a list of csv files:
+
+```
+├── pytorch_train_fp32.csv
+├── pytorch_train_fp16.csv
+├── pytorch_infer_fp32.csv
+├── pytorch_infer_fp16.csv
+├── pytorch_infer_fp32_ts.csv
+```
+
+## GluonNLP Backbones based on MXNet-2.0
+
+We profile three options: `NT` layout, `NT` layout with `TN` layout as the compute layout,
+and `TN` layout.
+
+```bash
+python3 -m pip install -U -r requirements.txt --user
+bash benchmark_gluonnlp.sh
+```
+
+It will generate csv files with the `gluonnlp_` prefix:
+```
+├── gluonnlp_train_fp32_NT_NT.csv
+├── gluonnlp_train_fp32_NT_TN.csv
+├── gluonnlp_train_fp32_TN_TN.csv
+├── gluonnlp_infer_fp32_NT_NT.csv
+├── gluonnlp_infer_fp32_NT_TN.csv
+├── gluonnlp_infer_fp32_TN_TN.csv
+```
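+
+To profile a single configuration instead of the full sweep in `benchmark_gluonnlp.sh`, you can invoke the Python entry point directly with the `--layout`, `--compute_layout` and `--mode` flags defined in `benchmark_gluonnlp.py`, for example:
+
+```bash
+# Benchmark inference only, with NT layout and TN compute layout
+python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode inference
+```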
diff --git a/scripts/benchmarks/benchmark_gluonnlp.py b/scripts/benchmarks/benchmark_gluonnlp.py
new file mode 100644
index 0000000000..440ffc7335
--- /dev/null
+++ b/scripts/benchmarks/benchmark_gluonnlp.py
@@ -0,0 +1,130 @@
+import mxnet as mx
+import argparse
+import os
+import pandas as pd
+from benchmark_utils import GluonNLPBackboneBenchmark
+import multiprocessing as mp
+from multiprocessing import Process
+mx.npx.set_np()
+
+
+MODELS = [
+ 'google_en_uncased_bert_base',
+ 'google_en_uncased_bert_large',
+ 'google_albert_base_v2',
+ 'google_albert_large_v2',
+ 'google_albert_xlarge_v2',
+ 'google_albert_xxlarge_v2',
+ 'google_electra_small',
+ 'google_electra_base',
+ 'google_electra_large',
+ 'google_uncased_mobilebert',
+ 'fairseq_bart_base',
+ 'fairseq_bart_large'
+]
+
+# (batch_size, seq_length)
+train_workloads =\
+ [(4, 128),
+ (8, 128),
+ (16, 128),
+ (32, 128),
+ (1, 512),
+ (2, 512),
+ (4, 512),
+ (8, 512)]
+
+
+inference_workloads = [
+ (1, 128),
+ (1, 384),
+ (1, 512),
+ (8, 32),
+ (8, 128),
+ (8, 512),
+ (32, 512),
+ (256, 128),
+ (400, 100),
+]
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(description='Benchmark GluonNLP backbones.')
+ parser.add_argument('--layout', type=str, default='NT',
+ help='The layout of the computation')
+ parser.add_argument('--compute_layout', type=str, default=None,
+ help='The compute layout of the computation')
+ parser.add_argument('--mode', type=str, default='train',
+ choices=['train', 'inference'])
+ return parser
+
+
+def run_benchmark(workload, model_name, out_file_name, is_train):
+ if is_train:
+ benchmark = GluonNLPBackboneBenchmark(
+ workloads=workload,
+ model_names=model_name,
+ profile_inference=False,
+ profile_train=True,
+ to_csv=True,
+ train_out_csv_file=out_file_name)
+ benchmark.run()
+ else:
+ benchmark = GluonNLPBackboneBenchmark(
+ workloads=workload,
+ model_names=model_name,
+ profile_inference=True,
+ profile_train=False,
+ to_csv=True,
+ inference_out_csv_file=out_file_name)
+ benchmark.run()
+ return
+
+
+if __name__ == '__main__':
+ mp.set_start_method('spawn')
+ parser = get_parser()
+ args = parser.parse_args()
+ if args.compute_layout is None:
+ args.compute_layout = args.layout
+ for layout, compute_layout in [(args.layout, args.compute_layout)]:
+ if compute_layout != layout:
+ profile_models = [ele for ele in MODELS if 'bart' not in ele]
+ else:
+ profile_models = [ele for ele in MODELS]
+ if args.mode == 'inference':
+ out_dir = 'infer_fp32_{}_{}'.format(layout, compute_layout)
+ df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
+ 'latency', 'memory'])
+ os.makedirs(out_dir, exist_ok=True)
+ for model_name in profile_models:
+ for workload in inference_workloads:
+ out_path = os.path.join(out_dir, '{}_{}_{}.csv'.format(model_name, workload[0],
+ workload[1]))
+ process = Process(
+ target=run_benchmark,
+ args=(workload, model_name, out_path, False))
+ process.start()
+ process.join()
+ new_df = pd.read_csv(out_path)
+ df = df.append(new_df, ignore_index=True)
+ df.to_csv('gluonnlp_infer_fp32_{}_{}.csv'.format(layout, compute_layout))
+ elif args.mode == 'train':
+ out_dir = 'train_fp32_{}_{}'.format(layout, compute_layout)
+ df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
+ 'latency', 'memory'])
+ os.makedirs(out_dir, exist_ok=True)
+ for model_name in profile_models:
+ for workload in train_workloads:
+ out_path = os.path.join(out_dir, '{}_{}_{}.csv'.format(model_name, workload[0],
+ workload[1]))
+ process = Process(
+ target=run_benchmark,
+ args=(workload, model_name, out_path, True))
+ process.start()
+ process.join()
+ new_df = pd.read_csv(out_path)
+ df = df.append(new_df, ignore_index=True)
+ df.to_csv('gluonnlp_train_fp32_{}_{}.csv'.format(layout, compute_layout))
+ else:
+ raise NotImplementedError
diff --git a/scripts/benchmarks/benchmark_gluonnlp.sh b/scripts/benchmarks/benchmark_gluonnlp.sh
new file mode 100644
index 0000000000..ada1951864
--- /dev/null
+++ b/scripts/benchmarks/benchmark_gluonnlp.sh
@@ -0,0 +1,14 @@
+for mode in train inference
+do
+ python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode
+done
+
+for mode in train inference
+do
+ python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode
+done
+
+for mode in train inference
+do
+ python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode
+done
diff --git a/scripts/benchmarks/benchmark_hf.py b/scripts/benchmarks/benchmark_hf.py
new file mode 100644
index 0000000000..57ccdcd422
--- /dev/null
+++ b/scripts/benchmarks/benchmark_hf.py
@@ -0,0 +1,184 @@
+import argparse
+import pandas as pd
+import math
+import os
+from multiprocessing import Process
+import torch
+from typing import Callable
+from transformers import HfArgumentParser, PyTorchBenchmark, PyTorchBenchmarkArguments
+import logging
+import timeit
+logger = logging.getLogger()
+
+
+class CustomizedPyTorchBenchmark(PyTorchBenchmark):
+ def _prepare_train_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
+ _train = super(CustomizedPyTorchBenchmark, self)._prepare_train_func(model_name,
+ batch_size,
+ sequence_length)
+ def train_fn():
+ _train()
+ torch.cuda.synchronize()
+ return train_fn
+
+ def _measure_speed(self, func) -> float:
+ try:
+ if self.args.is_tpu or self.args.torchscript:
+ # run a few extra iterations to stabilize compilation for tpu and torchscript
+ logger.info("Do inference on TPU or torchscript. Running model 3 extra times to stabilize compilation")
+ timeit.repeat(
+ func, repeat=1, number=3,
+ )
+
+ # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
+ runtimes = timeit.repeat(func, repeat=self.args.repeat, number=3,)
+
+ if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
+ import torch_xla.debug.metrics as met
+
+ self.print_fn(met.metrics_report())
+
+ return min(runtimes) / 3.0
+ except RuntimeError as e:
+ self.print_fn("Doesn't fit on GPU. {}".format(e))
+ return "N/A"
+
+
+HF_MODELS = [
+ 'bert-base-uncased',
+ 'bert-large-uncased',
+ 'albert-base-v2',
+ 'albert-large-v2',
+ 'albert-xlarge-v2',
+ 'albert-xxlarge-v2',
+ 'google/electra-small-discriminator',
+ 'google/electra-base-discriminator',
+ 'google/electra-large-discriminator',
+ 'google/mobilebert-uncased',
+ 'facebook/bart-base',
+ 'facebook/bart-large'
+]
+
+# (batch_size, seq_length)
+train_workloads =\
+ [(4, 128),
+ (8, 128),
+ (16, 128),
+ (32, 128),
+ (1, 512),
+ (2, 512),
+ (4, 512),
+ (8, 512)]
+
+
+inference_workloads = [
+ (1, 128),
+ (1, 384),
+ (1, 512),
+ (8, 32),
+ (8, 128),
+ (8, 512),
+ (32, 512),
+ (256, 128),
+ (400, 100),
+]
+
+
+if __name__ == '__main__':
+ # Profile PyTorch
+ parser = HfArgumentParser(PyTorchBenchmarkArguments)
+ # Benchmark Training
+ for use_fp16 in [False, True]:
+ df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
+ 'latency', 'memory'])
+ for model in HF_MODELS:
+ for batch_size, seq_length in train_workloads:
+ prefix = '{}_{}_{}'.format(model, batch_size, seq_length).replace('/', '_')
+ args = ['--models', model,
+ '--batch_sizes', '{}'.format(batch_size),
+ '--sequence_lengths', '{}'.format(seq_length),
+ '--train_time_csv_file', '{}.train_time.csv'.format(prefix),
+ '--train_memory_csv_file', '{}.train_memory.csv'.format(prefix),
+ '--no_env_print',
+ '--repeat', '3',
+ '--save_to_csv', '--training', '--no_inference']
+ if use_fp16:
+ args.append('--fp16')
+ benchmark_args = parser.parse_args_into_dataclasses(args)[0]
+ benchmark = CustomizedPyTorchBenchmark(args=benchmark_args)
+ p = Process(target=benchmark.run)
+ p.start()
+ p.join()
+ try:
+ train_time_df = pd.read_csv('{}.train_time.csv'.format(prefix))
+ train_memory_df = pd.read_csv('{}.train_memory.csv'.format(prefix))
+ latency = train_time_df['result'][0]
+ memory = train_memory_df['result'][0]
+ os.remove('{}.train_time.csv'.format(prefix))
+ os.remove('{}.train_memory.csv'.format(prefix))
+ except Exception:
+ latency = math.nan
+ memory = math.nan
+ new_df = pd.DataFrame({'model': [model],
+ 'batch_size': [batch_size],
+ 'sequence_length': [seq_length],
+ 'latency': [latency],
+ 'memory': [memory]})
+ df = df.append(new_df, ignore_index=True)
+ if use_fp16:
+ df.to_csv('pytorch_train_fp16.csv')
+ else:
+ df.to_csv('pytorch_train_fp32.csv')
+
+ # Benchmark Inference
+ for torch_script in [False, True]:
+ for use_fp16 in [False, True]:
+ if torch_script and use_fp16:
+ # Cannot support both torch_script and use_fp16.
+ continue
+ df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
+ 'latency', 'memory'])
+ for model in HF_MODELS:
+ for batch_size, seq_length in inference_workloads:
+ prefix = '{}_{}_{}'.format(model, batch_size, seq_length).replace('/', '_')
+ args = ['--models', model,
+ '--batch_sizes', '{}'.format(batch_size),
+ '--sequence_lengths', '{}'.format(seq_length),
+ '--inference_time_csv_file', '{}.inference_time.csv'.format(prefix),
+ '--inference_memory_csv_file', '{}.inference_memory.csv'.format(prefix),
+ '--no_env_print',
+ '--repeat', '3',
+ '--save_to_csv']
+ if use_fp16:
+ args.append('--fp16')
+ if torch_script:
+ args.append('--torchscript')
+ benchmark_args = parser.parse_args_into_dataclasses(args)[0]
+ benchmark = PyTorchBenchmark(args=benchmark_args)
+ p = Process(target=benchmark.run)
+ p.start()
+ p.join()
+ try:
+ inference_time_df = pd.read_csv('{}.inference_time.csv'.format(prefix))
+ inference_memory_df = pd.read_csv('{}.inference_memory.csv'.format(prefix))
+ latency = inference_time_df['result'][0]
+ memory = inference_memory_df['result'][0]
+ os.remove('{}.inference_time.csv'.format(prefix))
+ os.remove('{}.inference_memory.csv'.format(prefix))
+ except Exception:
+ latency = math.nan
+ memory = math.nan
+ new_df = pd.DataFrame({'model': [model],
+ 'batch_size': [batch_size],
+ 'sequence_length': [seq_length],
+ 'latency': [latency],
+ 'memory': [memory]})
+ df = df.append(new_df, ignore_index=True)
+ if use_fp16 and torch_script:
+ df.to_csv('pytorch_infer_fp16_ts.csv')
+ elif use_fp16 and not torch_script:
+ df.to_csv('pytorch_infer_fp16.csv')
+ elif not use_fp16 and torch_script:
+ df.to_csv('pytorch_infer_fp32_ts.csv')
+ else:
+ df.to_csv('pytorch_infer_fp32.csv')
diff --git a/scripts/benchmarks/benchmark_utils.py b/scripts/benchmarks/benchmark_utils.py
new file mode 100644
index 0000000000..c022caff87
--- /dev/null
+++ b/scripts/benchmarks/benchmark_utils.py
@@ -0,0 +1,1011 @@
+"""
+Utilities for working with the local dataset cache.
+This file is adapted from the HuggingFace Transformers library
+at https://github.com/huggingface/transformers/blob/master/src/transformers/benchmark/benchmark_utils.py
+and the AllenNLP library at https://github.com/allenai/allennlp
+Copyright by the AllenNLP authors.
+"""
+
+import copy
+import csv
+import linecache
+import logging
+import os
+import platform
+import sys
+import timeit
+import numpy as np
+import gluonnlp
+from gluonnlp.models import get_backbone
+from gluonnlp.utils.misc import logging_config
+from collections import defaultdict, namedtuple
+from datetime import datetime
+import multiprocessing as mp
+from multiprocessing import Pipe, Process, Queue
+from multiprocessing.connection import Connection
+from typing import Callable, Iterable, List, NamedTuple, Optional, Union, Tuple
+
+# Try to import the optional dependencies (psutil, py3nvml, mxnet, torch, tensorflow)
+try:
+ import psutil
+except ImportError:
+ psutil = None
+
+try:
+ import py3nvml.py3nvml as nvml
+except ImportError:
+ nvml = None
+
+try:
+ import mxnet
+ num_gpus = mxnet.context.num_gpus()
+ from mxnet import profiler as mx_profiler
+ if num_gpus == 0:
+ mx_all_contexts = [mxnet.cpu()]
+ else:
+ mx_all_contexts = [mxnet.gpu(i) for i in range(num_gpus)]
+except ImportError:
+ mxnet = None
+ mx_all_contexts = None
+ mx_profiler = None
+
+try:
+ import torch
+ from torch.cuda import empty_cache as torch_empty_cache
+except ImportError:
+ torch = None
+ torch_empty_cache = None
+
+try:
+ import tensorflow
+ from tensorflow.python.eager import context as tf_context
+except ImportError:
+ tensorflow = None
+ tf_context = None
+
+
+def is_psutil_available():
+ return psutil is not None
+
+
+def is_py3nvml_available():
+ return nvml is not None
+
+
+def is_torch_available():
+ return torch is not None
+
+
+def is_tf_available():
+ return tensorflow is not None
+
+
+def is_mxnet_available():
+ return mxnet is not None
+
+
+if platform.system() == "Windows":
+ from signal import CTRL_C_EVENT as SIGKILL
+else:
+ from signal import SIGKILL
+
+
+logger = logging.getLogger(__name__) # pylint: disable=invalid-name
+logging_config(folder='gluonnlp_benchmark', name='benchmark', logger=logger)
+
+
+_is_memory_tracing_enabled = False
+
+BenchmarkOutput = namedtuple(
+ "BenchmarkOutput",
+ [
+ "inference_result",
+ "train_result",
+ ],
+)
+
+
+def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]:
+ """
+    This function wraps another function so that it runs in its own separate process.
+    To ensure accurate memory measurements, it is important that the function
+    is executed in a separate process.
+
+    Args:
+        - `func`: (`callable`): function() -> ...
+            generic function which will be executed in its own separate process
+        - `do_multi_processing`: (`bool`)
+            Whether to run the function in a separate process or not
+ """
+ def multi_process_func(*args, **kwargs):
+        # run the function in its own process to get accurate memory measurements
+ def wrapper_func(queue: Queue, *args):
+ try:
+ result = func(*args)
+ except Exception as e:
+ logger.error(e)
+ print(e)
+ result = "N/A"
+ queue.put(result)
+
+ queue = Queue()
+ p = Process(target=wrapper_func, args=[queue] + list(args))
+ p.start()
+ result = queue.get()
+ p.join()
+ return result
+
+ if do_multi_processing:
+        logger.info(f"Function {func} is executed in its own process...")
+ return multi_process_func
+ else:
+ return func
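+
+# A minimal usage sketch of `separate_process_wrapper_fn` (illustrative only; the
+# wrapped callable below is a hypothetical example and not part of this module; on
+# spawn-based platforms the callable must additionally be picklable):
+#
+#     def _measure_once():
+#         return sum(range(10 ** 6))
+#
+#     wrapped = separate_process_wrapper_fn(_measure_once, do_multi_processing=True)
+#     result = wrapped()  # runs in a child process; the result comes back via a Queue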
+
+
+def is_memory_tracing_enabled():
+ global _is_memory_tracing_enabled
+ return _is_memory_tracing_enabled
+
+
+class Frame(NamedTuple):
+ """ `Frame` is a NamedTuple used to gather the current frame state.
+ `Frame` has the following fields:
+ - 'filename' (string): Name of the file currently executed
+ - 'module' (string): Name of the module currently executed
+ - 'line_number' (int): Number of the line currently executed
+ - 'event' (string): Event that triggered the tracing (default will be "line")
+ - 'line_text' (string): Text of the line in the python script
+ """
+
+ filename: str
+ module: str
+ line_number: int
+ event: str
+ line_text: str
+
+
+class UsedMemoryState(NamedTuple):
+    """ `UsedMemoryState` is a named tuple with the following fields:
+        - 'frame': a `Frame` namedtuple (see above) storing information on the current tracing frame (current file, location in current file)
+ - 'cpu_memory': CPU RSS memory state *before* executing the line
+ - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
+ """
+
+ frame: Frame
+ cpu_memory: int
+ gpu_memory: int
+
+
+class Memory(NamedTuple):
+    """ `Memory` is a NamedTuple with a single field `bytes`;
+        calling `__repr__` returns a human-readable string with the number of megabytes.
+            - `bytes` (integer): number of bytes
+ """
+
+ bytes: int
+
+ def __repr__(self) -> str:
+ return str(bytes_to_mega_bytes(self.bytes))
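+
+# Illustrative example: repr(Memory(5 * 2 ** 20)) == '5', i.e. the byte count is reported
+# as whole megabytes via `bytes_to_mega_bytes` (defined later in this module).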
+
+
+class MemoryState(NamedTuple):
+    """ `MemoryState` is a namedtuple listing frame + CPU/GPU memory with the following fields:
+        - `frame` (`Frame`): the current frame (see above)
+        - `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
+        - `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
+        - `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
+ """
+
+ frame: Frame
+ cpu: Memory
+ gpu: Memory
+ cpu_gpu: Memory
+
+
+class MemorySummary(NamedTuple):
+    """ `MemorySummary` is a namedtuple with the following fields:
+        - `sequential`: a list of `MemoryState` namedtuples (see above) computed from the provided `memory_trace`
+            by subtracting the memory after executing each line from the memory before executing said line.
+        - `cumulative`: a list of `MemoryState` namedtuples (see above) with the cumulative increase in memory for each line,
+            obtained by summing the repeated memory increases for a line if it is executed several times.
+            The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released).
+        - `current`: a list of `MemoryState` namedtuples with the absolute memory recorded at each traced line, sorted from largest to smallest consumption.
+        - `total`: total memory increase during the full tracing as a `Memory` named tuple (see above).
+            Lines with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
+ """
+
+ sequential: List[MemoryState]
+ cumulative: List[MemoryState]
+ current: List[MemoryState]
+ total: Memory
+
+
+MemoryTrace = List[UsedMemoryState]
+
+
+def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_idx=None) -> int:
+ """
+ measures peak cpu memory consumption of a given `function`
+ running the function for at least interval seconds
+ and at most 20 * interval seconds.
+ This function is heavily inspired by: `memory_usage`
+ of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
+
+ Args:
+ - `function`: (`callable`): function() -> ...
+            function without any arguments for which to measure the peak memory
+
+ - `interval`: (`float`, `optional`, defaults to `0.5`)
+            interval in seconds at which the memory usage is sampled
+
+ - `device_idx`: (`int`, `optional`, defaults to `None`)
+ device id for which to measure gpu usage
+
+ Returns:
+ - `max_memory`: (`int`)
+            peak memory consumption in bytes
+ """
+
+ def get_cpu_memory(process_id: int) -> int:
+ """
+ measures current cpu memory usage of a given `process_id`
+
+ Args:
+ - `process_id`: (`int`)
+ process_id for which to measure memory
+
+ Returns
+ - `memory`: (`int`)
+            consumed memory in bytes
+ """
+ process = psutil.Process(process_id)
+ try:
+ meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info"
+ memory = getattr(process, meminfo_attr)()[0]
+ except psutil.AccessDenied:
+ raise ValueError("Error with Psutil.")
+ return memory
+
+ if not is_psutil_available():
+ logger.warning(
+ "Psutil not installed, we won't log CPU memory usage. "
+ "Install Psutil (pip install psutil) to use CPU memory tracing."
+ )
+ max_memory = "N/A"
+ else:
+
+ class MemoryMeasureProcess(Process):
+
+ """
+ `MemoryMeasureProcess` inherits from `Process` and overwrites
+ its `run()` method. Used to measure the memory usage of a process
+ """
+
+ def __init__(self, process_id: int, child_connection: Connection, interval: float):
+ super().__init__()
+ self.process_id = process_id
+ self.interval = interval
+ self.connection = child_connection
+ self.num_measurements = 1
+ self.mem_usage = get_cpu_memory(self.process_id)
+
+ def run(self):
+ self.connection.send(0)
+ stop = False
+ while True:
+ self.mem_usage = max(self.mem_usage, get_cpu_memory(self.process_id))
+ self.num_measurements += 1
+
+ if stop:
+ break
+
+ stop = self.connection.poll(self.interval)
+
+ # send results to parent pipe
+ self.connection.send(self.mem_usage)
+ self.connection.send(self.num_measurements)
+
+ while True:
+ # create child, parent connection
+ child_connection, parent_connection = Pipe()
+
+ # instantiate process
+ mem_process = MemoryMeasureProcess(os.getpid(), child_connection, interval)
+ mem_process.start()
+
+ # wait until we get memory
+ parent_connection.recv()
+
+ try:
+ # execute function
+ function()
+
+                # signal the child process to stop measuring
+ parent_connection.send(0)
+
+ # receive memory and num measurements
+ max_memory = parent_connection.recv()
+ num_measurements = parent_connection.recv()
+ except Exception:
+ # kill process in a clean way
+ parent = psutil.Process(os.getpid())
+ for child in parent.children(recursive=True):
+ os.kill(child.pid, SIGKILL)
+ mem_process.join(0)
+ raise RuntimeError("Process killed. Error in Process")
+
+ # run process at least 20 * interval or until it finishes
+ mem_process.join(20 * interval)
+
+ if (num_measurements > 4) or (interval < 1e-6):
+ break
+
+ # reduce interval
+ interval /= 10
+
+ return max_memory
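+
+# A minimal usage sketch of `measure_peak_memory_cpu` (illustrative only; assumes
+# psutil is installed and uses a hypothetical workload):
+#
+#     def _allocate():
+#         _ = [0.0] * (10 ** 7)
+#
+#     peak_bytes = measure_peak_memory_cpu(_allocate, interval=0.1)
+#     print(peak_bytes >> 20, "MB peak")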
+
+
+def start_memory_tracing(
+ modules_to_trace: Optional[Union[str, Iterable[str]]] = None,
+ modules_not_to_trace: Optional[Union[str, Iterable[str]]] = None,
+ events_to_trace: str = "line",
+ gpus_to_trace: Optional[List[int]] = None,
+) -> MemoryTrace:
+ """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
+ See `./benchmark.py` for usage examples.
+ Current memory consumption is returned using psutil and in particular is the RSS memory
+            "Resident Set Size" (the non-swapped physical memory the process is using).
+ See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
+
+ Args:
+ - `modules_to_trace`: (None, string, list/tuple of string)
+ if None, all events are recorded
+ if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
+ - `modules_not_to_trace`: (None, string, list/tuple of string)
+ if None, no module is avoided
+ if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
+ - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
+ default to line
+ - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
+
+ Return:
+ - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
+        - `UsedMemoryState` is a named tuple with the following fields:
+ - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
+ - 'cpu_memory': CPU RSS memory state *before* executing the line
+ - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
+
+ `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
+ `Frame` has the following fields:
+ - 'filename' (string): Name of the file currently executed
+ - 'module' (string): Name of the module currently executed
+ - 'line_number' (int): Number of the line currently executed
+ - 'event' (string): Event that triggered the tracing (default will be "line")
+ - 'line_text' (string): Text of the line in the python script
+
+ """
+ if is_psutil_available():
+ process = psutil.Process(os.getpid())
+ else:
+ logger.warning(
+ "Psutil not installed, we won't log CPU memory usage. "
+ "Install psutil (pip install psutil) to use CPU memory tracing."
+ )
+ process = None
+
+ if is_py3nvml_available():
+ try:
+ nvml.nvmlInit()
+ devices = list(range(nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
+ nvml.nvmlShutdown()
+ except (OSError, nvml.NVMLError):
+            logger.warning("Error while initializing communication with GPU. "
+                           "We won't perform GPU memory tracing.")
+ log_gpu = False
+ else:
+ log_gpu = True
+ else:
+ logger.warning(
+ "py3nvml not installed, we won't log GPU memory usage. "
+ "Install py3nvml (pip install py3nvml) to use GPU memory tracing."
+ )
+ log_gpu = False
+
+ memory_trace = []
+
+ def traceit(frame, event, args):
+ """ Tracing method executed before running each line in a module or sub-module
+ Record memory allocated in a list with debugging information
+ """
+ global _is_memory_tracing_enabled
+
+ if not _is_memory_tracing_enabled:
+ return traceit
+
+ # Filter events
+ if events_to_trace is not None:
+ if isinstance(events_to_trace, str) and event != events_to_trace:
+ return traceit
+ elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace:
+ return traceit
+
+ if "__name__" not in frame.f_globals:
+ return traceit
+
+ # Filter modules
+ name = frame.f_globals["__name__"]
+ if not isinstance(name, str):
+ return traceit
+ else:
+ # Filter whitelist of modules to trace
+ if modules_to_trace is not None:
+ if isinstance(modules_to_trace, str) and modules_to_trace not in name:
+ return traceit
+ elif isinstance(modules_to_trace, (list, tuple)) and all(m not in name for m in modules_to_trace):
+ return traceit
+
+ # Filter blacklist of modules not to trace
+ if modules_not_to_trace is not None:
+ if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name:
+ return traceit
+ elif isinstance(modules_not_to_trace, (list, tuple)) and any(m in name for m in modules_not_to_trace):
+ return traceit
+
+ # Record current tracing state (file, location in file...)
+ lineno = frame.f_lineno
+ filename = frame.f_globals["__file__"]
+ if filename.endswith(".pyc") or filename.endswith(".pyo"):
+ filename = filename[:-1]
+ line = linecache.getline(filename, lineno).rstrip()
+ traced_state = Frame(filename, name, lineno, event, line)
+
+ # Record current memory state (rss memory) and compute difference with previous memory state
+ cpu_mem = 0
+ if process is not None:
+ mem = process.memory_info()
+ cpu_mem = mem.rss
+
+ gpu_mem = 0
+ if log_gpu:
+ # Clear GPU caches
+ if is_mxnet_available():
+ for ctx in mx_all_contexts:
+ ctx.empty_cache()
+ if is_torch_available():
+ torch_empty_cache()
+ if is_tf_available():
+ tf_context.context()._clear_caches() # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802
+
+ # Sum used memory for all GPUs
+ nvml.nvmlInit()
+
+ for i in devices:
+ handle = nvml.nvmlDeviceGetHandleByIndex(i)
+ meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
+ gpu_mem += meminfo.used
+
+ nvml.nvmlShutdown()
+
+ mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem)
+ memory_trace.append(mem_state)
+
+ return traceit
+
+ sys.settrace(traceit)
+
+ global _is_memory_tracing_enabled
+ _is_memory_tracing_enabled = True
+
+ return memory_trace
+
+
+def stop_memory_tracing(
+ memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True
+) -> Optional[MemorySummary]:
+ """ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
+
+ Args:
+        - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert into a summary
+        - `ignore_released_memory` (boolean, default: True): if True, only memory increases are summed to compute the total memory
+
+ Return:
+ - None if `memory_trace` is None
+        - `MemorySummary` namedtuple otherwise, with the following fields:
+            - `sequential`: a list of `MemoryState` namedtuples (see below) computed from the provided `memory_trace`
+                by subtracting the memory after executing each line from the memory before executing said line.
+            - `cumulative`: a list of `MemoryState` namedtuples (see below) with the cumulative increase in memory for each line,
+                obtained by summing the repeated memory increases for a line if it is executed several times.
+                The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released).
+            - `current`: a list of `MemoryState` namedtuples with the absolute memory recorded at each traced line, sorted from largest to smallest consumption.
+            - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
+                Lines with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
+
+        `Memory` named tuples have a single field
+            - `bytes` (integer): number of bytes;
+              their `repr` is the human-readable number of megabytes.
+
+        `Frame` is a namedtuple used to list the current frame state, with the following fields:
+ - 'filename' (string): Name of the file currently executed
+ - 'module' (string): Name of the module currently executed
+ - 'line_number' (int): Number of the line currently executed
+ - 'event' (string): Event that triggered the tracing (default will be "line")
+ - 'line_text' (string): Text of the line in the python script
+
+        `MemoryState` is a namedtuple listing frame + CPU/GPU memory with the following fields:
+            - `frame` (`Frame`): the current frame (see above)
+            - `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
+            - `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
+            - `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
+ """
+ global _is_memory_tracing_enabled
+ _is_memory_tracing_enabled = False
+
+ if memory_trace is not None and len(memory_trace) > 1:
+ memory_diff_trace = []
+ memory_curr_trace = []
+
+ cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])
+
+ for ((frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem),) in zip(
+ memory_trace[:-1], memory_trace[1:]
+ ):
+ cpu_mem_inc = next_cpu_mem - cpu_mem
+ gpu_mem_inc = next_gpu_mem - gpu_mem
+ cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
+ memory_diff_trace.append(
+ MemoryState(
+ frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
+ )
+ )
+
+ memory_curr_trace.append(
+ MemoryState(
+ frame=frame,
+ cpu=Memory(next_cpu_mem),
+ gpu=Memory(next_gpu_mem),
+ cpu_gpu=Memory(next_gpu_mem + next_cpu_mem),
+ )
+ )
+
+ cumulative_memory_dict[frame][0] += cpu_mem_inc
+ cumulative_memory_dict[frame][1] += gpu_mem_inc
+ cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc
+
+ cumulative_memory = sorted(
+ list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True
+ ) # order by the total CPU + GPU memory increase
+ cumulative_memory = list(
+ MemoryState(
+ frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
+ )
+ for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
+ )
+
+ memory_curr_trace = sorted(memory_curr_trace, key=lambda x: x.cpu_gpu.bytes, reverse=True)
+
+ if ignore_released_memory:
+ total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace)
+ else:
+ total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace)
+
+ total_memory = Memory(total_memory)
+
+ return MemorySummary(
+ sequential=memory_diff_trace, cumulative=cumulative_memory, current=memory_curr_trace, total=total_memory,
+ )
+
+ return None
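+
+# A minimal tracing sketch pairing `start_memory_tracing` with `stop_memory_tracing`
+# (illustrative only; `run_workload` is a hypothetical function, and what actually gets
+# recorded depends on whether psutil / py3nvml are available):
+#
+#     trace = start_memory_tracing("gluonnlp")
+#     run_workload()
+#     summary = stop_memory_tracing(trace)
+#     if summary is not None:
+#         print(summary.total)  # total memory increase, printed as megabytes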
+
+
+def bytes_to_mega_bytes(memory_amount: int) -> int:
+ """ Utility to convert a number of bytes (int) into a number of mega bytes (int)
+ """
+ return memory_amount >> 20
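+
+# Example (illustrative): bytes_to_mega_bytes(3 * 2 ** 20) == 3, i.e. an integer right
+# shift by 20 bits converts bytes to whole megabytes, truncating any remainder.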
+
+
+class GluonNLPBackboneBenchmark:
+ """
+    A simple but feature-complete benchmarking utility to compare the memory
+    and time performance of GluonNLP backbone models.
+ """
+ def __init__(self, workloads, model_names, use_fp16=False,
+ repeat=3, use_gpu=True, device_idx=0,
+ profile_inference=True,
+ profile_train=True,
+ env_print=True,
+ to_csv=False,
+ layout='NT',
+ compute_layout='auto',
+ inference_out_csv_file='inference_time_memory.csv',
+ train_out_csv_file='train_time_memory.csv',
+ env_info_file='env_info.csv'):
+        if not isinstance(workloads, list):
+            workloads = [workloads]
+        if not isinstance(model_names, (list, tuple)):
+            model_names = [model_names]
+        self._workloads = workloads
+ self._model_names = model_names
+ self._use_fp16 = use_fp16
+ self._repeat = repeat
+ self._use_gpu = use_gpu
+ self._device_idx = device_idx
+ self._environment_info = None
+ self._profile_inference = profile_inference
+ self._profile_train = profile_train
+ self._env_print = env_print
+ self._to_csv = to_csv
+ self._layout = layout
+ self._compute_layout = compute_layout
+ self._inference_out_csv_file = inference_out_csv_file
+ self._train_out_csv_file = train_out_csv_file
+ self._env_info_file = env_info_file
+        assert use_fp16 is False, 'The fp16 benchmark is not supported yet.'
+
+ @property
+ def model_names(self):
+ return self._model_names
+
+ @property
+ def workloads(self):
+ return self._workloads
+
+ def _inference_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\
+ -> Tuple[float, Memory]:
+ if self._use_gpu:
+ ctx = mxnet.gpu()
+ else:
+ ctx = mxnet.cpu()
+ model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name)
+ # TODO Support fp16 profiling
+ cfg.defrost()
+ cfg.MODEL.layout = self._layout
+ if model_cls.__name__ not in ['BartModel']:
+ cfg.MODEL.compute_layout = self._compute_layout
+ cfg.freeze()
+ if model_cls.__name__ in ['BartModel']:
+ model = model_cls.from_cfg(cfg, extract_feature=True)
+ else:
+ model = model_cls.from_cfg(cfg)
+ model.load_parameters(backbone_param_path, ctx=ctx)
+ model.hybridize()
+ vocab_size = cfg.MODEL.vocab_size
+ if self._layout == 'NT':
+ input_ids = mxnet.np.random.randint(0, vocab_size, (batch_size, sequence_length),
+ dtype=np.int32, ctx=ctx)
+ token_types = mxnet.np.zeros((batch_size, sequence_length), dtype=np.int32, ctx=ctx)
+ valid_length = mxnet.np.full((batch_size,), sequence_length,
+ dtype=np.int32, ctx=ctx)
+ elif self._layout == 'TN':
+ input_ids = mxnet.np.random.randint(0, vocab_size, (sequence_length, batch_size),
+ dtype=np.int32, ctx=ctx)
+ token_types = mxnet.np.zeros((sequence_length, batch_size), dtype=np.int32, ctx=ctx)
+ valid_length = mxnet.np.full((batch_size,), sequence_length,
+ dtype=np.int32, ctx=ctx)
+ else:
+ raise NotImplementedError
+ mxnet.npx.waitall()
+
+ def run_forward():
+ if 'roberta' in model_name or 'xlmr' in model_name:
+ out = model(input_ids, valid_length)
+ elif 'bart' in model_name:
+ out = model(input_ids, valid_length, input_ids, valid_length)
+ else:
+ out = model(input_ids, token_types, valid_length)
+ if isinstance(out, list):
+ for ele in out:
+ ele.wait_to_read()
+ else:
+ out.wait_to_read()
+
+ timeit.repeat(run_forward, repeat=1, number=3)
+ runtimes = timeit.repeat(run_forward, repeat=self._repeat, number=3)
+ mxnet.npx.waitall()
+ # Profile memory
+ if self._use_gpu:
+ nvml.nvmlInit()
+ run_forward()
+ mxnet.npx.waitall()
+ handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
+ meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
+ max_bytes_in_use = meminfo.used
+ memory = Memory(max_bytes_in_use)
+ # shutdown nvml
+ nvml.nvmlShutdown()
+ else:
+ # cpu
+ memory_bytes = measure_peak_memory_cpu(run_forward)
+ memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
+ return float(np.min(runtimes) / 3.0), memory
+
+ def _train_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\
+ -> Tuple[float, Memory]:
+ if self._use_gpu:
+ ctx = mxnet.gpu()
+ else:
+ ctx = mxnet.cpu()
+ model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name)
+ # TODO Support fp16 profiling
+ cfg.defrost()
+ cfg.MODEL.layout = self._layout
+ if model_cls.__name__ not in ['BartModel']:
+ cfg.MODEL.compute_layout = self._compute_layout
+ cfg.freeze()
+ if model_cls.__name__ in ['BartModel']:
+ model = model_cls.from_cfg(cfg, extract_feature=True)
+ else:
+ model = model_cls.from_cfg(cfg)
+ model.load_parameters(backbone_param_path, ctx=ctx)
+ model.hybridize()
+ vocab_size = cfg.MODEL.vocab_size
+ if hasattr(cfg.MODEL, 'units'):
+ out_units = cfg.MODEL.units
+ else:
+ out_units = cfg.MODEL.DECODER.units
+ if self._layout == 'NT':
+ input_ids = mxnet.np.random.randint(0, vocab_size, (batch_size, sequence_length),
+ dtype=np.int32, ctx=ctx)
+ token_types = mxnet.np.zeros((batch_size, sequence_length), dtype=np.int32, ctx=ctx)
+ valid_length = mxnet.np.full((batch_size,), sequence_length,
+ dtype=np.int32, ctx=ctx)
+ contextual_embedding_ograd = mxnet.np.random.normal(
+ 0, 1, (batch_size, sequence_length, out_units),
+ dtype=np.float32, ctx=ctx)
+ pooled_out_ograd = mxnet.np.random.normal(
+ 0, 1, (batch_size, out_units), dtype=np.float32, ctx=ctx)
+ elif self._layout == 'TN':
+ input_ids = mxnet.np.random.randint(0, vocab_size, (sequence_length, batch_size),
+ dtype=np.int32, ctx=ctx)
+ token_types = mxnet.np.zeros((sequence_length, batch_size), dtype=np.int32, ctx=ctx)
+ valid_length = mxnet.np.full((batch_size,), sequence_length,
+ dtype=np.int32, ctx=ctx)
+ contextual_embedding_ograd = mxnet.np.random.normal(
+ 0, 1, (sequence_length, batch_size, out_units),
+ dtype=np.float32, ctx=ctx)
+ pooled_out_ograd = mxnet.np.random.normal(0, 1, (batch_size, out_units),
+ dtype=np.float32,
+ ctx=ctx)
+ else:
+ raise NotImplementedError
+ if model_cls.__name__ in ['BertModel', 'AlbertModel', 'ElectraModel', 'MobileBertModel']:
+ def train_step():
+ with mxnet.autograd.record():
+ contextual_embedding, pooled_out = model(input_ids, token_types, valid_length)
+ # We'd like to set the head gradient of
+ # contextual_embedding to contextual_embedding_ograd
+ # and the head gradient of pooled_out to pooled_out_ograd
+                    # Thus, we simply do two Hadamard products and sum up the results.
+ fake_loss = mxnet.np.sum(contextual_embedding * contextual_embedding_ograd)\
+ + mxnet.np.sum(pooled_out * pooled_out_ograd)
+ fake_loss.backward()
+ mxnet.npx.waitall()
+ elif model_cls.__name__ in ['BartModel']:
+ def train_step():
+ with mxnet.autograd.record():
+ contextual_embedding, pooled_out = model(input_ids, valid_length,
+ input_ids, valid_length)
+ fake_loss = (contextual_embedding * contextual_embedding_ograd).sum() \
+ + (pooled_out * pooled_out_ograd).sum()
+ fake_loss.backward()
+ mxnet.npx.waitall()
+ else:
+ raise NotImplementedError
+ timeit.repeat(train_step, repeat=1, number=3)
+ mxnet.npx.waitall()
+ for ctx in mx_all_contexts:
+ ctx.empty_cache()
+ runtimes = timeit.repeat(train_step, repeat=self._repeat, number=3)
+ mxnet.npx.waitall()
+ for ctx in mx_all_contexts:
+ ctx.empty_cache()
+ mxnet.npx.waitall()
+ # Profile memory
+ if self._use_gpu:
+ nvml.nvmlInit()
+ train_step()
+ mxnet.npx.waitall()
+ handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
+ meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
+ max_bytes_in_use = meminfo.used
+ memory = Memory(max_bytes_in_use)
+ # shutdown nvml
+ nvml.nvmlShutdown()
+ else:
+ # cpu
+ memory_bytes = measure_peak_memory_cpu(train_step)
+ memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
+ return float(np.min(runtimes) / 3.0), memory
+
+    def inference_speed_memory(self, *args, **kwargs) -> Tuple[float, Memory]:
+ return separate_process_wrapper_fn(self._inference_speed_memory, False)(*args, **kwargs)
+
+    def train_speed_memory(self, *args, **kwargs) -> Tuple[float, Memory]:
+ return separate_process_wrapper_fn(self._train_speed_memory, False)(*args, **kwargs)
+
+ def run(self):
+ result_dict = {model_name: {} for model_name in self._model_names}
+ inference_result = copy.deepcopy(result_dict)
+ train_result = copy.deepcopy(result_dict)
+
+ for c, model_name in enumerate(self.model_names):
+ logger.info(f"{c + 1} / {len(self.model_names)}")
+ inference_result[model_name] = dict()
+ train_result[model_name] = dict()
+
+ for workload in self._workloads:
+ batch_size, sequence_length = workload
+ if self._profile_inference:
+ try:
+ infer_time, infer_memory = self.inference_speed_memory(model_name,
+ batch_size,
+ sequence_length)
+ except Exception as e:
+ logger.info(e)
+ infer_time = np.nan
+ infer_memory = np.nan
+ inference_result[model_name][workload] = (infer_time, infer_memory)
+ for ctx in mx_all_contexts:
+ ctx.empty_cache()
+ mxnet.npx.waitall()
+ self.save_to_csv(inference_result, self._inference_out_csv_file)
+ if self._profile_train:
+ try:
+ train_time, train_memory = self.train_speed_memory(model_name,
+ batch_size,
+ sequence_length)
+ except Exception as e:
+ logger.info(e)
+ train_time = np.nan
+ train_memory = np.nan
+ train_result[model_name][workload] = (train_time, train_memory)
+ for ctx in mx_all_contexts:
+ ctx.empty_cache()
+ mxnet.npx.waitall()
+ self.save_to_csv(train_result, self._train_out_csv_file)
+
+ if self._profile_inference:
+ logger.info("\n" + 20 * "=" + ("INFERENCE - RESULT - SPEED - MEMORY").center(55) + 20 * "=")
+ self.print_results(inference_result)
+
+ if self._profile_train:
+            logger.info("\n" + 20 * "=" + ("TRAIN - RESULT - SPEED - MEMORY").center(55) + 20 * "=")
+ self.print_results(train_result)
+
+ if self._env_print:
+ logger.info("\n" + 20 * "=" + ("ENVIRONMENT INFORMATION").center(40) + 20 * "=")
+ logger.info(
+ "\n".join(["- {}: {}".format(prop, val)
+ for prop, val in self.environment_info.items()]) + "\n"
+ )
+
+ if self._to_csv:
+ with open(self._env_info_file, mode="w", newline="") as csv_file:
+ writer = csv.writer(csv_file)
+ for key, value in self.environment_info.items():
+ writer.writerow([key, value])
+
+ return BenchmarkOutput(
+ inference_result,
+ train_result
+ )
+
+ @property
+ def environment_info(self):
+ if self._environment_info is None:
+ info = {}
+ info["gluonnlp_version"] = gluonnlp.__version__
+ info["framework_version"] = mxnet.__version__
+ info["python_version"] = platform.python_version()
+ info["system"] = platform.system()
+ info["cpu"] = platform.processor()
+ info["architecture"] = platform.architecture()[0]
+ info["date"] = datetime.date(datetime.now())
+ info["time"] = datetime.time(datetime.now())
+ info["fp16"] = self._use_fp16
+
+ if is_psutil_available():
+ info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
+ else:
+ logger.warning(
+                    "Psutil not installed, we won't log available CPU memory. "
+ "Install psutil (pip install psutil) to log available CPU memory."
+ )
+ info["cpu_ram_mb"] = "N/A"
+
+ info["use_gpu"] = self._use_gpu
+ if self._use_gpu:
+ info["num_gpus"] = 1
+ if is_py3nvml_available():
+ nvml.nvmlInit()
+ handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
+ info["gpu"] = nvml.nvmlDeviceGetName(handle)
+ info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
+ info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
+ info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
+ nvml.nvmlShutdown()
+ else:
+ logger.warning(
+ "py3nvml not installed, we won't log GPU memory usage. "
+ "Install py3nvml (pip install py3nvml) to log information about GPU."
+ )
+ info["gpu"] = "N/A"
+ info["gpu_ram_mb"] = "N/A"
+ info["gpu_power_watts"] = "N/A"
+ info["gpu_performance_state"] = "N/A"
+ self._environment_info = info
+ return self._environment_info
+
+ def print_results(self, result_dict):
+ logger.info(95 * "-")
+ logger.info(
+ "Model Name".center(30)
+ + "Batch Size".center(15) + "Seq Length".center(15)
+ + "Latency (ms)".center(15) + "Memory".center(15)
+ )
+ logger.info(95 * "-")
+ for model_name in self._model_names:
+ for (batch_size, sequence_length), (time_spent, memory)\
+ in result_dict[model_name].items():
+ if np.isnan(time_spent):
+ time_spent = str(time_spent)
+ else:
+ time_spent = round(1000 * time_spent)
+ time_spent = str(time_spent)
+ memory = str(memory)
+ logger.info(
+ model_name[:30].center(30) + str(batch_size).center(15) +
+ str(sequence_length).center(15) +
+ time_spent.center(15) + memory.center(15)
+ )
+ logger.info(95 * "-")
+
+ def print_memory_trace_statistics(self, summary: MemorySummary):
+ logger.info(
+ "\nLine by line memory consumption:\n"
+ + "\n".join(
+ f"{state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
+ for state in summary.sequential
+ )
+ )
+ logger.info(
+ "\nLines with top memory consumption:\n"
+ + "\n".join(
+ f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
+ for state in summary.cumulative[:6]
+ )
+ )
+ logger.info(
+ "\nLines with lowest memory consumption:\n"
+ + "\n".join(
+ f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
+ for state in summary.cumulative[-6:]
+ )
+ )
+ logger.info(f"\nTotal memory increase: {summary.total}")
+
+ def save_to_csv(self, result_dict, filename):
+ if not self._to_csv:
+ return
+ logger.info("Saving results to csv {}.".format(filename))
+ with open(filename, mode="w") as csv_file:
+
+ assert len(self._model_names) > 0, "At least 1 model should be defined, but got {}".format(
+ self._model_names
+ )
+
+ fieldnames = ["model", "batch_size", "sequence_length"]
+ writer = csv.DictWriter(csv_file, fieldnames=fieldnames + ["latency", "memory"])
+ writer.writeheader()
+
+ for model_name in self._model_names:
+ result_dict_model = result_dict[model_name]
+ for (bs, ss), (latency, memory) in result_dict_model.items():
+ writer.writerow(
+ {
+ "model": model_name,
+ "batch_size": bs,
+ "sequence_length": ss,
+ 'latency': str(latency),
+ 'memory': str(memory),
+ }
+ )
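+
+# A minimal usage sketch of `GluonNLPBackboneBenchmark` (illustrative only; the model
+# name and workload below are assumptions, not a prescribed configuration):
+#
+#     benchmark = GluonNLPBackboneBenchmark(workloads=[(1, 128)],
+#                                           model_names='google_en_cased_bert_base',
+#                                           profile_train=False,
+#                                           use_gpu=False,
+#                                           to_csv=True)
+#     benchmark.run()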
diff --git a/scripts/benchmarks/requirements.txt b/scripts/benchmarks/requirements.txt
new file mode 100644
index 0000000000..41332a1cec
--- /dev/null
+++ b/scripts/benchmarks/requirements.txt
@@ -0,0 +1,4 @@
+transformers
+py3nvml
+torch
+torchvision
diff --git a/scripts/bert/__init__.py b/scripts/bert/__init__.py
deleted file mode 100644
index ea93605437..0000000000
--- a/scripts/bert/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""BERT Module."""
-from . import model, data
diff --git a/scripts/bert/bert_qa_evaluate.py b/scripts/bert/bert_qa_evaluate.py
deleted file mode 100644
index 1ba6989ac6..0000000000
--- a/scripts/bert/bert_qa_evaluate.py
+++ /dev/null
@@ -1,394 +0,0 @@
-
-# Copyright 2018 The Google AI Language Team Authors, Allenai and DMLC.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Bert SQuAD evaluate."""
-import re
-import string
-from collections import Counter, namedtuple, OrderedDict
-
-from mxnet import nd
-
-PredResult = namedtuple('PredResult', ['start', 'end'])
-
-def _get_best_indexes(logits, n_best_size):
- """Get the n-best logits from a list."""
- index_and_score = sorted(
- enumerate(logits), key=lambda x: x[1], reverse=True)
-
- best_indexes = []
- for i, _ in enumerate(index_and_score):
- if i >= n_best_size:
- break
- best_indexes.append(index_and_score[i][0])
- return best_indexes
-
-
-def get_final_text(pred_text, orig_text, tokenizer):
- """Project the tokenized prediction back to the original text."""
-
- # When we created the data, we kept track of the alignment between original
- # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
- # now `orig_text` contains the span of our original text corresponding to the
- # span that we predicted.
- #
- # However, `orig_text` may contain extra characters that we don't want in
- # our prediction.
- #
- # For example, let's say:
- # pred_text = steve smith
- # orig_text = Steve Smith's
- #
- # We don't want to return `orig_text` because it contains the extra "'s".
- #
- # We don't want to return `pred_text` because it's already been normalized
- # (the SQuAD eval script also does punctuation stripping/lower casing but
- # our tokenizer does additional normalization like stripping accent
- # characters).
- #
- # What we really want to return is "Steve Smith".
- #
- # Therefore, we have to apply a semi-complicated alignment heruistic between
- # `pred_text` and `orig_text` to get a character-to-charcter alignment. This
- # can fail in certain cases in which case we just return `orig_text`.
-
- def _strip_spaces(text):
- ns_chars = []
- ns_to_s_map = OrderedDict()
- for (i, c) in enumerate(text):
- if c == ' ':
- continue
- ns_to_s_map[len(ns_chars)] = i
- ns_chars.append(c)
- ns_text = ''.join(ns_chars)
- return (ns_text, ns_to_s_map)
-
- # We first tokenize `orig_text`, strip whitespace from the result
- # and `pred_text`, and check if they are the same length. If they are
- # NOT the same length, the heuristic has failed. If they are the same
- # length, we assume the characters are one-to-one aligned.
-
- tok_text = ' '.join(tokenizer(orig_text))
-
- start_position = tok_text.find(pred_text)
- if start_position == -1:
- return orig_text
- end_position = start_position + len(pred_text) - 1
-
- (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
- (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
-
- if len(orig_ns_text) != len(tok_ns_text):
- return orig_text
-
- # We then project the characters in `pred_text` back to `orig_text` using
- # the character-to-character alignment.
- tok_s_to_ns_map = {}
- for i in tok_ns_to_s_map.keys():
- tok_index = tok_ns_to_s_map[i]
- tok_s_to_ns_map[tok_index] = i
-
- orig_start_position = None
- if start_position in tok_s_to_ns_map:
- ns_start_position = tok_s_to_ns_map[start_position]
- if ns_start_position in orig_ns_to_s_map:
- orig_start_position = orig_ns_to_s_map[ns_start_position]
-
- if orig_start_position is None:
- return orig_text
-
- orig_end_position = None
- if end_position in tok_s_to_ns_map:
- ns_end_position = tok_s_to_ns_map[end_position]
- if ns_end_position in orig_ns_to_s_map:
- orig_end_position = orig_ns_to_s_map[ns_end_position]
-
- if orig_end_position is None:
- return orig_text
-
- output_text = orig_text[orig_start_position:(orig_end_position + 1)]
- return output_text
-
-
-def predict(features,
- results,
- tokenizer,
- max_answer_length=64,
- null_score_diff_threshold=0.0,
- n_best_size=10,
- version_2=False):
- """Get prediction results.
-
- Parameters
- ----------
- features : list of SQuADFeature
- List of squad features for the example.
- results : list of data.qa.PredResult
- List of model predictions for span start and span end.
- tokenizer: callable
- Tokenizer function.
- max_answer_length: int, default 64
- Maximum length of the answer tokens.
- null_score_diff_threshold: float, default 0.0
- If null_score - best_non_null is greater than the threshold predict null.
- n_best_size: int, default 10
- The total number of n-best predictions.
- version_2: bool, default False
- If true, the SQuAD examples contain some that do not have an answer.
-
- Returns
- -------
- prediction: str
- The final prediction.
- nbest : list of (str, float)
- n-best predictions with their probabilities.
- """
-
- _PrelimPrediction = namedtuple('PrelimPrediction',
- ['feature_index', 'start_index', 'end_index',
- 'pred_start', 'pred_end'])
-
- _NbestPrediction = namedtuple(
- 'NbestPrediction', ['text', 'pred_start', 'pred_end'])
-
- prelim_predictions = []
- score_diff = None
-
- score_null = 1000000 # large and positive
- min_null_feature_index = 0 # the paragraph slice with min mull score
- null_pred_start = 0 # the start logit at the slice with min null score
- null_pred_end = 0 # the end logit at the slice with min null score
-
- for features_id, (result, feature) in enumerate(zip(results, features)):
- start_indexes = _get_best_indexes(result.start, n_best_size)
- end_indexes = _get_best_indexes(result.end, n_best_size)
-
- if version_2:
- feature_null_score = result.start[0] + \
- result.end[0]
- if feature_null_score < score_null:
- score_null = feature_null_score
- min_null_feature_index = features_id
- null_pred_start = result.start[0]
- null_pred_end = result.end[0]
-
- for start_index in start_indexes:
- for end_index in end_indexes:
- # We could hypothetically create invalid predictions, e.g., predict
- # that the start of the span is in the question. We throw out all
- # invalid predictions.
- if start_index >= len(feature.tokens):
- continue
- if end_index >= len(feature.tokens):
- continue
- if start_index not in feature.token_to_orig_map:
- continue
- if end_index not in feature.token_to_orig_map:
- continue
- if not feature.token_is_max_context.get(start_index, False):
- continue
- if end_index < start_index:
- continue
- length = end_index - start_index + 1
- if length > max_answer_length:
- continue
- prelim_predictions.append(
- _PrelimPrediction(
- feature_index=features_id,
- start_index=start_index,
- end_index=end_index,
- pred_start=result.start[start_index],
- pred_end=result.end[end_index]))
-
- if version_2:
- prelim_predictions.append(
- _PrelimPrediction(
- feature_index=min_null_feature_index,
- start_index=0,
- end_index=0,
- pred_start=null_pred_start,
- pred_end=null_pred_end))
-
- prelim_predictions = sorted(
- prelim_predictions,
- key=lambda x: (x.pred_start + x.pred_end),
- reverse=True)
-
- seen_predictions = {}
- nbest = []
- for pred in prelim_predictions:
- if len(nbest) >= n_best_size:
- break
- feature = features[pred.feature_index]
- if pred.start_index > 0: # this is a non-null prediction
- tok_tokens = feature.tokens[pred.start_index:(
- pred.end_index + 1)]
- orig_doc_start = feature.token_to_orig_map[pred.start_index]
- orig_doc_end = feature.token_to_orig_map[pred.end_index]
- orig_tokens = feature.doc_tokens[orig_doc_start:(
- orig_doc_end + 1)]
- tok_text = ' '.join(tok_tokens)
-
- # De-tokenize WordPieces that have been split off.
- tok_text = tok_text.replace(' ##', '')
- tok_text = tok_text.replace('##', '')
-
- # Clean whitespace
- tok_text = tok_text.strip()
- tok_text = ' '.join(tok_text.split())
- orig_text = ' '.join(orig_tokens)
-
- final_text = get_final_text(tok_text, orig_text, tokenizer)
- if final_text in seen_predictions:
- continue
-
- seen_predictions[final_text] = True
- else:
- final_text = ''
- seen_predictions[final_text] = True
-
- nbest.append(
- _NbestPrediction(
- text=final_text,
- pred_start=pred.pred_start,
- pred_end=pred.pred_end))
-
- # if we didn't inlude the empty option in the n-best, inlcude it
- if version_2:
- if '' not in seen_predictions:
- nbest.append(
- _NbestPrediction(
- text='',
- pred_start=null_pred_start,
- pred_end=null_pred_end))
- # In very rare edge cases we could have no valid predictions. So we
- # just create a nonce prediction in this case to avoid failure.
- if not nbest:
- nbest.append(
- _NbestPrediction(text='empty', pred_start=0.0, pred_end=0.0))
-
- assert len(nbest) >= 1
-
- total_scores = []
- best_non_null_entry = None
- for entry in nbest:
- total_scores.append(entry.pred_start + entry.pred_end)
- if not best_non_null_entry:
- if entry.text:
- best_non_null_entry = entry
-
- probs = nd.softmax(nd.array(total_scores)).asnumpy()
-
- nbest_json = []
-
- for (i, entry) in enumerate(nbest):
- nbest_json.append((entry.text, float(probs[i])))
-
- if not version_2:
- prediction = nbest_json[0][0]
- else:
- # predict '' iff the null score - the score of best non-null > threshold
- score_diff = score_null - best_non_null_entry.pred_start - \
- best_non_null_entry.pred_end
-
- if score_diff > null_score_diff_threshold:
- prediction = ''
- else:
- prediction = best_non_null_entry.text
- return prediction, nbest_json
-
-
-def normalize_answer(s):
- """Lower text and remove punctuation, articles and extra whitespace."""
-
- def remove_articles(text):
- return re.sub(r'\b(a|an|the)\b', ' ', text)
-
- def white_space_fix(text):
- return ' '.join(text.split())
-
- def remove_punc(text):
- exclude = set(string.punctuation)
- return ''.join(ch for ch in text if ch not in exclude)
-
- def lower(text):
- return text.lower()
-
- return white_space_fix(remove_articles(remove_punc(lower(s))))
-
-
-def f1_score(prediction, ground_truth):
- """Calculate the F1 scores.
- """
- prediction_tokens = normalize_answer(prediction).split()
- ground_truth_tokens = normalize_answer(ground_truth).split()
- common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
- num_same = sum(common.values())
- if num_same == 0:
- return 0
- precision = 1.0 * num_same / len(prediction_tokens)
- recall = 1.0 * num_same / len(ground_truth_tokens)
- f1 = (2 * precision * recall) / (precision + recall)
- return f1
-
-
-def exact_match_score(prediction, ground_truth):
- """Calculate the EM scores.
- """
- return (normalize_answer(prediction) == normalize_answer(ground_truth))
-
-
-def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
- scores_for_ground_truths = []
- for ground_truth in ground_truths:
- score = metric_fn(prediction, ground_truth)
- scores_for_ground_truths.append(score)
- return max(scores_for_ground_truths)
-
-
-def get_F1_EM(dataset, predict_data):
- """Calculate the F1 and EM scores of the predicted results.
- Use only with the SQuAD1.1 dataset.
-
- Parameters
- ----------
- dataset_file: string
- Path to the data file.
- predict_data: dict
- All final predictions.
-
- Returns
- -------
- scores: dict
- F1 and EM scores.
- """
- f1 = exact_match = total = 0
- for record in dataset:
- total += 1
- if record[1] not in predict_data:
- message = 'Unanswered question ' + record[1] + \
- ' will receive score 0.'
- print(message)
- continue
- ground_truths = record[4]
- prediction = predict_data[record[1]]
- exact_match += metric_max_over_ground_truths(
- exact_match_score, prediction, ground_truths)
- f1 += metric_max_over_ground_truths(f1_score, prediction,
- ground_truths)
- exact_match = 100.0 * exact_match / total
- f1 = 100.0 * f1 / total
-
- scores = {'exact_match': exact_match, 'f1': f1}
-
- return scores
diff --git a/scripts/bert/data/__init__.py b/scripts/bert/data/__init__.py
deleted file mode 100644
index 643fa2f832..0000000000
--- a/scripts/bert/data/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""BERT data."""
-
-from . import embedding, transform
diff --git a/scripts/bert/data/create_pretraining_data.py b/scripts/bert/data/create_pretraining_data.py
deleted file mode 100644
index 088e55e653..0000000000
--- a/scripts/bert/data/create_pretraining_data.py
+++ /dev/null
@@ -1,688 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors and DMLC.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Create masked LM/next sentence masked_lm examples for BERT."""
-
-
-import argparse
-import logging
-import io
-import os
-import glob
-import collections
-import warnings
-import random
-import time
-from multiprocessing import Pool
-import numpy as np
-import gluonnlp as nlp
-from gluonnlp.data import BERTTokenizer
-
-
-class TrainingInstance:
- """A single training instance (sentence pair)."""
-
- def __init__(self, tokens, segment_ids, masked_lm_positions,
- masked_lm_labels, is_random_next, vocab):
- self.tokens = tokens
- self.segment_ids = segment_ids
- self.is_random_next = is_random_next
- self.masked_lm_positions = masked_lm_positions
- self.masked_lm_labels = masked_lm_labels
- self.vocab = vocab
-
- def __str__(self):
- tks = self.vocab.to_tokens(self.tokens)
- mask_tks = self.vocab.to_tokens(self.masked_lm_labels)
- s = ''
- s += 'tokens: %s\n' % (' '.join(tks))
- s += 'segment_ids: %s\n' % (' '.join(
- [str(x) for x in self.segment_ids]))
- s += 'is_random_next: %s\n' % self.is_random_next
- s += 'masked_lm_positions: %s\n' % (' '.join(
- [str(x) for x in self.masked_lm_positions]))
- s += 'masked_lm_labels: %s\n' % (' '.join(mask_tks))
- s += '\n'
- return s
-
- def __repr__(self):
- return self.__str__()
-
-def transform(instance, max_seq_length):
- """Transform instance to inputs for MLM and NSP."""
- input_ids = instance.tokens
- assert len(input_ids) <= max_seq_length
- segment_ids = instance.segment_ids
- masked_lm_positions = instance.masked_lm_positions
- valid_lengths = len(input_ids)
-
- masked_lm_ids = instance.masked_lm_labels
- masked_lm_weights = [1.0] * len(masked_lm_ids)
-
- next_sentence_label = 1 if instance.is_random_next else 0
-
- features = {}
- features['input_ids'] = input_ids
- features['segment_ids'] = segment_ids
- features['masked_lm_positions'] = masked_lm_positions
- features['masked_lm_ids'] = masked_lm_ids
- features['masked_lm_weights'] = masked_lm_weights
- features['next_sentence_labels'] = [next_sentence_label]
- features['valid_lengths'] = [valid_lengths]
- return features
-
-def print_example(instance, features):
- logging.debug('*** Example Instance ***')
- logging.debug('\n%s', instance)
-
- for feature_name in features.keys():
- feature = features[feature_name]
- logging.debug('Generated %s: %s', feature_name, feature)
-
-def write_to_files_np(features, tokenizer, max_seq_length,
- max_predictions_per_seq, output_files):
- # pylint: disable=unused-argument
- """Write to numpy files from `TrainingInstance`s."""
- next_sentence_labels = []
- valid_lengths = []
-
- assert len(output_files) == 1, 'numpy format only support single output file'
- output_file = output_files[0]
- (input_ids, segment_ids, masked_lm_positions, masked_lm_ids,
- masked_lm_weights, next_sentence_labels, valid_lengths) = features
- total_written = len(next_sentence_labels)
-
- # store variable length numpy array object directly.
- outputs = collections.OrderedDict()
- outputs['input_ids'] = np.array(input_ids, dtype=object)
- outputs['segment_ids'] = np.array(segment_ids, dtype=object)
- outputs['masked_lm_positions'] = np.array(masked_lm_positions, dtype=object)
- outputs['masked_lm_ids'] = np.array(masked_lm_ids, dtype=object)
- outputs['masked_lm_weights'] = np.array(masked_lm_weights, dtype=object)
- outputs['next_sentence_labels'] = np.array(next_sentence_labels, dtype='int32')
- outputs['valid_lengths'] = np.array(valid_lengths, dtype='int32')
-
- np.savez_compressed(output_file, **outputs)
- logging.info('Wrote %d total instances', total_written)
-
-def tokenize_lines_fn(x):
- """Worker function to tokenize lines based on the tokenizer, and perform vocabulary lookup."""
- lines, tokenizer, vocab = x
- results = []
- for line in lines:
- if not line:
- break
- line = line.strip()
- # Empty lines are used as document delimiters
- if not line:
- results.append([])
- else:
- tokens = vocab[tokenizer(line)]
- if tokens:
- results.append(tokens)
- return results
-
-def convert_to_npz(instances, max_seq_length):
- """Create masked language model and next sentence prediction samples as numpy arrays."""
- input_ids = []
- segment_ids = []
- masked_lm_positions = []
- masked_lm_ids = []
- masked_lm_weights = []
- next_sentence_labels = []
- valid_lengths = []
-
- for inst_index, instance in enumerate(instances):
- features = transform(instance, max_seq_length)
- input_id = features['input_ids']
- segment_id = features['segment_ids']
- masked_lm_position = features['masked_lm_positions']
- masked_lm_id = features['masked_lm_ids']
- masked_lm_weight = features['masked_lm_weights']
- next_sentence_label = features['next_sentence_labels'][0]
- valid_length = features['valid_lengths'][0]
-
- input_ids.append(np.ascontiguousarray(input_id, dtype='int32'))
- segment_ids.append(np.ascontiguousarray(segment_id, dtype='int32'))
- masked_lm_positions.append(np.ascontiguousarray(masked_lm_position, dtype='int32'))
- masked_lm_ids.append(np.ascontiguousarray(masked_lm_id, dtype='int32'))
- masked_lm_weights.append(np.ascontiguousarray(masked_lm_weight, dtype='float32'))
- next_sentence_labels.append(next_sentence_label)
- valid_lengths.append(valid_length)
- # debugging information
- if inst_index < 1:
- print_example(instance, features)
- return input_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights,\
- next_sentence_labels, segment_ids, valid_lengths
-
-def create_training_instances(x):
- """Create `TrainingInstance`s from raw text.
-
- The expected input file format is the following:
-
- (1) One sentence per line. These should ideally be actual sentences, not
- entire paragraphs or arbitrary spans of text. (Because we use the
- sentence boundaries for the "next sentence prediction" task).
- (2) Blank lines between documents. Document boundaries are needed so
- that the "next sentence prediction" task doesn't span between documents.
-
- The function expect arguments packed in a tuple as described below.
-
- Parameters
- ----------
- input_files : list of str
- List of paths to input text files.
- tokenizer : BERTTokenizer
- The BERT tokenizer
- max_seq_length : int
- The hard limit of maximum sequence length of sentence pairs
- dupe_factor : int
- Duplication factor.
- short_seq_prob : float
- The probability of sampling sequences shorter than the max_seq_length.
- masked_lm_prob : float
- The probability of replacing texts with masks/random words/original words.
- max_predictions_per_seq : int
- The hard limit of the number of predictions for masked words
- whole_word_mask : bool
- Whether to do masking for whole words
- vocab : BERTVocab
- The BERTVocab
- nworker : int
- The number of processes to help processing texts in parallel
- worker_pool : multiprocessing.Pool
- Must be provided if nworker > 1. The caller is responsible for the destruction of
- the worker pool.
- output_file : str or None
- Path to the output file. If None, the result is not serialized. If provided,
- results are stored in the order of (input_ids, segment_ids, masked_lm_positions,
- masked_lm_ids, masked_lm_weights, next_sentence_labels, valid_lengths).
-
- Returns
- -------
- A tuple of np.ndarray : input_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights
- next_sentence_labels, segment_ids, valid_lengths
- """
- (input_files, tokenizer, max_seq_length, short_seq_prob,
- masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab,
- dupe_factor, nworker, worker_pool, output_file) = x
-
- time_start = time.time()
- if nworker > 1:
- assert worker_pool is not None
-
- all_documents = [[]]
-
- for input_file in input_files:
- logging.debug('*** Tokenizing file %s***', input_file)
- with io.open(input_file, 'r', encoding='utf-8') as reader:
- lines = reader.readlines()
- num_lines = len(lines)
- num_lines_per_worker = (num_lines + nworker - 1) // nworker
- process_args = []
-
- # tokenize in parallel
- for worker_idx in range(nworker):
- start = worker_idx * num_lines_per_worker
- end = min((worker_idx + 1) * num_lines_per_worker, num_lines)
- process_args.append((lines[start:end], tokenizer, vocab))
- if worker_pool:
- tokenized_results = worker_pool.map(tokenize_lines_fn, process_args)
- else:
- tokenized_results = [tokenize_lines_fn(process_args[0])]
-
- for tokenized_result in tokenized_results:
- for line in tokenized_result:
- if not line:
- if all_documents[-1]:
- all_documents.append([])
- else:
- all_documents[-1].append(line)
-
- # remove the empty document if any
- all_documents = [x for x in all_documents if x]
- random.shuffle(all_documents)
-
- # generate training instances
- instances = []
- if worker_pool:
- process_args = []
- for document_index in range(len(all_documents)):
- process_args.append((all_documents, document_index, max_seq_length, short_seq_prob,
- masked_lm_prob, max_predictions_per_seq, whole_word_mask,
- vocab, tokenizer))
- for _ in range(dupe_factor):
- instances_results = worker_pool.map(create_instances_from_document, process_args)
- for instances_result in instances_results:
- instances.extend(instances_result)
- random.shuffle(instances)
- npz_instances = worker_pool.apply(convert_to_npz, (instances, max_seq_length))
- else:
- for _ in range(dupe_factor):
- for document_index in range(len(all_documents)):
- instances.extend(
- create_instances_from_document(
- (all_documents, document_index, max_seq_length, short_seq_prob,
- masked_lm_prob, max_predictions_per_seq, whole_word_mask,
- vocab, tokenizer)))
- random.shuffle(instances)
- npz_instances = convert_to_npz(instances, max_seq_length)
-
- (input_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights,
- next_sentence_labels, segment_ids, valid_lengths) = npz_instances
-
- # write output to files. Used when pre-generating files
- if output_file:
- features = (input_ids, segment_ids, masked_lm_positions, masked_lm_ids,
- masked_lm_weights, next_sentence_labels, valid_lengths)
- logging.debug('*** Writing to output file %s ***', output_file)
- write_to_files_np(features, tokenizer, max_seq_length,
- max_predictions_per_seq, [output_file])
- features = None
- else:
- features = (input_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights,
- next_sentence_labels, segment_ids, valid_lengths)
- time_end = time.time()
- logging.debug('Process %d files took %.1f s', len(input_files), time_end - time_start)
- return features
-
-def create_instances_from_document(x):
- """Creates `TrainingInstance`s for a single document."""
- (all_documents, document_index, max_seq_length, short_seq_prob,
- masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab, tokenizer) = x
- document = all_documents[document_index]
- _MASK_TOKEN = vocab[vocab.mask_token]
- _CLS_TOKEN = vocab[vocab.cls_token]
- _SEP_TOKEN = vocab[vocab.sep_token]
-
- # Account for [CLS], [SEP], [SEP]
- max_num_tokens = max_seq_length - 3
-
- # According to the original tensorflow implementation:
- # We *usually* want to fill up the entire sequence since we are padding
- # to `max_seq_length` anyways, so short sequences are generally wasted
- # computation. However, we *sometimes*
- # (i.e., short_seq_prob == 0.1, 10% of the time) want to use shorter
- # sequences to minimize the mismatch between pre-training and fine-tuning.
- # The `target_seq_length` is just a rough target however, whereas
- # `max_seq_length` is a hard limit.
- target_seq_length = max_num_tokens
- if random.random() < short_seq_prob:
- target_seq_length = random.randint(2, max_num_tokens)
-
- # We DON'T just concatenate all of the tokens from a document into a long
- # sequence and choose an arbitrary split point because this would make the
- # next sentence prediction task too easy. Instead, we split the input into
- # segments "A" and "B" based on the actual "sentences" provided by the user
- # input.
- instances = []
- current_chunk = []
- current_length = 0
- i = 0
- while i < len(document): # pylint: disable=R1702
- segment = document[i]
- current_chunk.append(segment)
- current_length += len(segment)
- if i == len(document) - 1 or current_length >= target_seq_length:
- if current_chunk:
- # `a_end` is how many segments from `current_chunk` go into the `A`
- # (first) sentence.
- a_end = 1
- if len(current_chunk) >= 2:
- a_end = random.randint(1, len(current_chunk) - 1)
-
- tokens_a = []
- for j in range(a_end):
- tokens_a.extend(current_chunk[j])
-
- tokens_b = []
- # Random next
- is_random_next = False
- if len(current_chunk) == 1 or random.random() < 0.5:
- is_random_next = True
- target_b_length = target_seq_length - len(tokens_a)
-
- # randomly choose a document other than itself
- random_document_index = random.randint(0, len(all_documents) - 2)
- if random_document_index == document_index:
- random_document_index = len(all_documents) - 1
-
- random_document = all_documents[random_document_index]
- random_start = random.randint(0, len(random_document) - 1)
- for j in range(random_start, len(random_document)):
- tokens_b.extend(random_document[j])
- if len(tokens_b) >= target_b_length:
- break
- # We didn't actually use these segments so we 'put them back' so
- # they don't go to waste.
- num_unused_segments = len(current_chunk) - a_end
- i -= num_unused_segments
- # Actual next
- else:
- is_random_next = False
- for j in range(a_end, len(current_chunk)):
- tokens_b.extend(current_chunk[j])
- truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)
-
- assert len(tokens_a) >= 1
- assert len(tokens_b) >= 1
-
- tokens = []
- segment_ids = []
- tokens.append(_CLS_TOKEN)
- segment_ids.append(0)
- for token in tokens_a:
- tokens.append(token)
- segment_ids.append(0)
- tokens.append(_SEP_TOKEN)
- segment_ids.append(0)
-
- for token in tokens_b:
- tokens.append(token)
- segment_ids.append(1)
- tokens.append(_SEP_TOKEN)
- segment_ids.append(1)
-
- (tokens, masked_lm_positions,
- masked_lm_labels) = create_masked_lm_predictions(
- tokens, masked_lm_prob, max_predictions_per_seq,
- whole_word_mask, vocab, tokenizer,
- _MASK_TOKEN, _CLS_TOKEN, _SEP_TOKEN)
- instance = TrainingInstance(
- tokens=tokens,
- segment_ids=segment_ids,
- is_random_next=is_random_next,
- masked_lm_positions=masked_lm_positions,
- masked_lm_labels=masked_lm_labels,
- vocab=vocab)
- instances.append(instance)
- current_chunk = []
- current_length = 0
- i += 1
-
- return instances
-
-
-MaskedLmInstance = collections.namedtuple('MaskedLmInstance',
- ['index', 'label'])
-
-
-def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq,
- whole_word_mask, vocab, tokenizer,
- _MASK_TOKEN, _CLS_TOKEN, _SEP_TOKEN):
- """Creates the predictions for the masked LM objective."""
- cand_indexes = []
- for (i, token) in enumerate(tokens):
- if token in [_CLS_TOKEN, _SEP_TOKEN]:
- continue
- # Whole Word Masking means that we mask all of the subwords
- # corresponding to an original word. When a word has been split into
- # subwords, the first token does not have any marker and any subsequent
- # tokens are prefixed with ##. So whenever we see a ##-prefixed token, we
- # append it to the previous set of word indexes.
- #
- # Note that Whole Word Masking does *not* change the training code
- # at all -- we still predict each subword independently, softmaxed
- # over the entire vocabulary.
- if whole_word_mask and len(cand_indexes) >= 1 and \
- not tokenizer.is_first_subword(vocab.idx_to_token[token]):
- cand_indexes[-1].append(i)
- else:
- cand_indexes.append([i])
-
- random.shuffle(cand_indexes)
-
- output_tokens = list(tokens)
-
- num_to_predict = min(max_predictions_per_seq,
- max(1, int(round(len(tokens) * masked_lm_prob))))
-
- masked_lms = []
- covered_indexes = set()
- for index_set in cand_indexes:
- if len(masked_lms) >= num_to_predict:
- break
- # If adding a whole-word mask would exceed the maximum number of
- # predictions, then just skip this candidate.
- if len(masked_lms) + len(index_set) > num_to_predict:
- continue
- is_any_index_covered = False
- for index in index_set:
- if index in covered_indexes:
- is_any_index_covered = True
- break
- if is_any_index_covered:
- continue
- for index in index_set:
- covered_indexes.add(index)
- masked_token = None
- # 80% of the time, replace with [MASK]
- if random.random() < 0.8:
- masked_token = _MASK_TOKEN
- else:
- # 10% of the time, keep original
- if random.random() < 0.5:
- masked_token = tokens[index]
- # 10% of the time, replace with random word
- else:
- # generate a random word in [0, vocab_size - 1]
- masked_token = random.randint(0, len(vocab) - 1)
-
- output_tokens[index] = masked_token
-
- masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
- assert len(masked_lms) <= num_to_predict
- masked_lms = sorted(masked_lms, key=lambda x: x.index)
-
- masked_lm_positions = []
- masked_lm_labels = []
- for p in masked_lms:
- masked_lm_positions.append(p.index)
- masked_lm_labels.append(p.label)
-
- return (output_tokens, masked_lm_positions, masked_lm_labels)
-
-
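For reference, the nested branches above implement an 80/10/10 replacement rule (replace with [MASK] / keep the original token / use a random token). The following standalone sketch condenses the same decision into a single draw; it is equivalent in distribution, and the helper name is illustrative only, not part of the deleted script.

import random

def choose_replacement(original_id, mask_id, vocab_size):
    """Pick the replacement id for a position selected for masking (80/10/10 rule)."""
    r = random.random()
    if r < 0.8:                                # 80%: replace with [MASK]
        return mask_id
    if r < 0.9:                                # 10%: keep the original token id
        return original_id
    return random.randint(0, vocab_size - 1)   # 10%: replace with a random token id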
-def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
- """Truncates a pair of sequences to a maximum sequence length."""
- while True:
- total_length = len(tokens_a) + len(tokens_b)
- if total_length <= max_num_tokens:
- break
-
- trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
- assert len(trunc_tokens) >= 1
-
- # We want to sometimes truncate from the front and sometimes from the
- # back to add more randomness and avoid biases.
- if random.random() < 0.5:
- del trunc_tokens[0]
- else:
- trunc_tokens.pop()
-
-
-def main():
- """Main function."""
- time_start = time.time()
-
- # random seed
- random.seed(args.random_seed)
-
- # create output dir
- output_dir = os.path.expanduser(args.output_dir)
- nlp.utils.mkdir(output_dir)
-
- # vocabulary and tokenizer
- if args.sentencepiece:
- logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece)
- if args.dataset_name:
- warnings.warn('Both --dataset_name and --sentencepiece are provided. '
- 'The vocabulary will be loaded based on --sentencepiece.')
- vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
- tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, num_best=args.sp_nbest,
- alpha=args.sp_alpha, lower=not args.cased)
- else:
- logging.info('loading vocab file from pre-defined dataset: %s', args.dataset_name)
- vocab = nlp.data.utils._load_pretrained_vocab(args.dataset_name, root=output_dir,
- cls=nlp.vocab.BERTVocab)
- tokenizer = BERTTokenizer(vocab=vocab, lower='uncased' in args.dataset_name)
-
- # count the number of input files
- input_files = []
- for input_pattern in args.input_file.split(','):
- input_files.extend(glob.glob(os.path.expanduser(input_pattern)))
- for input_file in input_files:
- logging.info('\t%s', input_file)
- num_inputs = len(input_files)
- num_outputs = min(args.num_outputs, len(input_files))
- logging.info('*** Reading from %d input files ***', num_inputs)
-
- # calculate the number of splits
- file_splits = []
- split_size = (num_inputs + num_outputs - 1) // num_outputs
- for i in range(num_outputs):
- split_start = i * split_size
- split_end = min(num_inputs, (i + 1) * split_size)
- file_splits.append(input_files[split_start:split_end])
-
- # prepare workload
- count = 0
- process_args = []
-
- for i, file_split in enumerate(file_splits):
- output_file = os.path.join(output_dir, 'part-{}.npz'.format(str(i).zfill(3)))
- count += len(file_split)
- process_args.append((file_split, tokenizer, args.max_seq_length, args.short_seq_prob,
- args.masked_lm_prob, args.max_predictions_per_seq,
- args.whole_word_mask,
- vocab, args.dupe_factor, 1, None, output_file))
-
- # sanity check
- assert count == len(input_files)
-
- # dispatch to workers
- nworker = args.num_workers
- if nworker > 1:
- pool = Pool(nworker)
- pool.map(create_training_instances, process_args)
- else:
- for process_arg in process_args:
- create_training_instances(process_arg)
-
- time_end = time.time()
- logging.info('Time cost=%.1f', time_end - time_start)
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(
- description='Pre-training data generator for BERT',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
- parser.add_argument(
- '--input_file',
- type=str,
- required=True,
- help='Input files, separated by comma. For example, "~/data/*.txt"')
-
- parser.add_argument(
- '--output_dir',
- type=str,
- required=True,
- help='Output directory.')
-
- parser.add_argument(
- '--dataset_name',
- type=str,
- default=None,
- choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased',
- 'wiki_multilingual_uncased', 'wiki_multilingual_cased', 'wiki_cn_cased'],
- help='The dataset name for the vocab file BERT model was trained on. For example, '
- '"book_corpus_wiki_en_uncased"')
-
- parser.add_argument(
- '--sentencepiece',
- type=str,
- default=None,
- help='Path to the sentencepiece .model file for both tokenization and vocab.')
-
- parser.add_argument(
- '--cased',
- action='store_true',
- help='Effective only if --sentencepiece is set')
-
- parser.add_argument('--sp_nbest', type=int, default=0,
- help='Number of best candidates for sampling subwords with sentencepiece. ')
-
- parser.add_argument('--sp_alpha', type=float, default=1.0,
- help='Inverse temperature for probability rescaling for sentencepiece '
- 'unigram sampling')
-
- parser.add_argument(
- '--whole_word_mask',
- action='store_true',
- help='Whether to use whole word masking rather than per-subword masking.')
-
- parser.add_argument(
- '--max_seq_length', type=int, default=512, help='Maximum sequence length.')
-
- parser.add_argument(
- '--max_predictions_per_seq',
- type=int,
- default=80,
- help='Maximum number of masked LM predictions per sequence. ')
-
- parser.add_argument(
- '--random_seed',
- type=int,
- default=12345,
- help='Random seed for data generation.')
-
- parser.add_argument(
- '--dupe_factor',
- type=int,
- default=5,
- help='Number of times to duplicate the input data (with different masks).')
-
- parser.add_argument(
- '--masked_lm_prob',
- type=float,
- default=0.15,
- help='Masked LM probability.')
-
- parser.add_argument(
- '--short_seq_prob',
- type=float,
- default=0.1,
- help='Probability of creating sequences which are shorter than the '
- 'maximum length. ')
-
- parser.add_argument(
- '--verbose',
- action='store_true',
- help='Print debug information')
-
- parser.add_argument(
- '--num_workers',
- type=int,
- default=8,
- help='Number of workers for parallel processing, where each generates an output file.')
-
- parser.add_argument(
- '--num_outputs',
- type=int,
- default=1,
- help='Number of desired output files, where each is processed independently by a worker.')
-
- args = parser.parse_args()
- logging.getLogger().setLevel(logging.DEBUG if args.verbose else logging.INFO)
- logging.info(args)
- main()
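For reference, a minimal sketch of the corpus layout this generator expected (one sentence per line, a blank line between documents), together with the same document-splitting rule applied above to tokenized lines. The sentences are made up for illustration.

sample_corpus = (
    'this is the first sentence of document one .\n'
    'this is its second sentence .\n'
    '\n'
    'document two starts after a blank line .\n'
    'its sentences are never paired with document one for next sentence prediction .\n'
)

all_documents = [[]]
for line in sample_corpus.splitlines():
    line = line.strip()
    if not line:                          # blank line marks a document boundary
        if all_documents[-1]:
            all_documents.append([])
    else:
        all_documents[-1].append(line)
all_documents = [doc for doc in all_documents if doc]   # drop any empty document
print(len(all_documents))                 # 2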
diff --git a/scripts/bert/data/embedding.py b/scripts/bert/data/embedding.py
deleted file mode 100644
index 7a609cc6de..0000000000
--- a/scripts/bert/data/embedding.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors and DMLC.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT embedding datasets."""
-from mxnet.gluon.data import Dataset
-
-__all__ = ['BertEmbeddingDataset']
-
-class BertEmbeddingDataset(Dataset):
- """Dataset for BERT Embedding
-
- Parameters
- ----------
- sentences : List[str].
- Sentences for embeddings.
- transform : BERTDatasetTransform, default None.
- transformer for BERT input format
- """
-
- def __init__(self, sentences, transform=None):
- """Dataset for BERT Embedding
-
- Parameters
- ----------
- sentences : List[str].
- Sentences for embeddings.
- transform : BERTDatasetTransform, default None.
- transformer for BERT input format
- """
- self.sentences = sentences
- self.transform = transform
-
- def __getitem__(self, idx):
- sentence = (self.sentences[idx], 0)
- if self.transform:
- return self.transform(sentence)
- else:
- return sentence
-
- def __len__(self):
- return len(self.sentences)
diff --git a/scripts/bert/data/transform.py b/scripts/bert/data/transform.py
deleted file mode 100644
index d8bef6efc2..0000000000
--- a/scripts/bert/data/transform.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors and DMLC.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT dataset transform."""
-
-
-__all__ = ['BERTDatasetTransform']
-
-import numpy as np
-from gluonnlp.data import BERTSentenceTransform
-
-class BERTDatasetTransform:
- """Dataset transformation for BERT-style sentence classification or regression.
-
- Parameters
- ----------
- tokenizer : BERTTokenizer.
- Tokenizer for the sentences.
- max_seq_length : int.
- Maximum sequence length of the sentences.
- vocab : Vocab or BERTVocab
- The vocabulary.
- class_labels : list of str, int or float, or None, default None
- All possible labels for a classification task. If None, the task is treated
- as regression and labels are kept as 'float32'; otherwise labels are mapped
- to 'int32' ids.
- label_alias : dict or None, default None
- Optional mapping from alternative label spellings to entries in class_labels.
- pad : bool, default True
- Whether to pad the sentences to maximum length.
- pair : bool, default True
- Whether to transform sentences or sentence pairs.
- has_label : bool, default True
- Whether the input lines contain a label as the last field.
- """
-
- def __init__(self,
- tokenizer,
- max_seq_length,
- vocab=None,
- class_labels=None,
- label_alias=None,
- pad=True,
- pair=True,
- has_label=True):
- self.class_labels = class_labels
- self.has_label = has_label
- self._label_dtype = 'int32' if class_labels else 'float32'
- if has_label and class_labels:
- self._label_map = {}
- for (i, label) in enumerate(class_labels):
- self._label_map[label] = i
- if label_alias:
- for key in label_alias:
- self._label_map[key] = self._label_map[label_alias[key]]
- self._bert_xform = BERTSentenceTransform(
- tokenizer, max_seq_length, vocab=vocab, pad=pad, pair=pair)
-
- def __call__(self, line):
- """Perform transformation for sequence pairs or single sequences.
-
- The transformation is processed in the following steps:
- - tokenize the input sequences
- - insert [CLS], [SEP] as necessary
- - generate type ids to indicate whether a token belongs to the first
- sequence or the second sequence.
- - generate valid length
-
- For sequence pairs, the input is a tuple of 3 strings:
- text_a, text_b and label.
-
- Inputs:
- text_a: 'is this jacksonville ?'
- text_b: 'no it is not .'
- label: '0'
- Tokenization:
- text_a: 'is this jack ##son ##ville ?'
- text_b: 'no it is not .'
- Processed:
- tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
- type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
- valid_length: 14
- label: 0
-
- For single sequences, the input is a tuple of 2 strings: text_a and label.
- Inputs:
- text_a: 'the dog is hairy .'
- label: '1'
- Tokenization:
- text_a: 'the dog is hairy .'
- Processed:
- text_a: '[CLS] the dog is hairy . [SEP]'
- type_ids: 0 0 0 0 0 0 0
- valid_length: 7
- label: 1
-
- Parameters
- ----------
- line: tuple of str
- Input strings. For sequence pairs, the input is a tuple of 3 strings:
- (text_a, text_b, label). For single sequences, the input is a tuple
- of 2 strings: (text_a, label).
-
- Returns
- -------
- np.array: input token ids in 'int32', shape (batch_size, seq_length)
- np.array: valid length in 'int32', shape (batch_size,)
- np.array: input token type ids in 'int32', shape (batch_size, seq_length)
- np.array: classification task: label id in 'int32', shape (batch_size, 1),
- regression task: label in 'float32', shape (batch_size, 1)
- """
- if self.has_label:
- input_ids, valid_length, segment_ids = self._bert_xform(line[:-1])
- label = line[-1]
- # map to int if class labels are available
- if self.class_labels:
- label = self._label_map[label]
- label = np.array([label], dtype=self._label_dtype)
- return input_ids, segment_ids, valid_length, label
- else:
- input_ids, valid_length, segment_ids = self._bert_xform(line)
- return input_ids, segment_ids, valid_length
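For reference, a hypothetical driver for the removed BERTDatasetTransform, using the sentence pair from the docstring above. It assumes the class is still importable from its old scripts/bert location and that the pretrained vocabulary can be downloaded; the expected valid length comes from the docstring example.

import gluonnlp as nlp

from data.transform import BERTDatasetTransform   # old scripts/bert layout (assumption)

_, vocab = nlp.model.get_model('bert_12_768_12',
                               dataset_name='book_corpus_wiki_en_uncased',
                               pretrained=False, use_decoder=False, use_classifier=False)
tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)
transform = BERTDatasetTransform(tokenizer, max_seq_length=32, vocab=vocab,
                                 class_labels=['0', '1'], pad=True, pair=True)
input_ids, segment_ids, valid_length, label = transform(
    ('is this jacksonville ?', 'no it is not .', '0'))
print(valid_length, label)   # valid length 14 and label id [0] for this pair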
diff --git a/scripts/bert/embedding.py b/scripts/bert/embedding.py
deleted file mode 100644
index 248fba3d32..0000000000
--- a/scripts/bert/embedding.py
+++ /dev/null
@@ -1,271 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""BERT embedding."""
-
-import argparse
-import io
-import logging
-import os
-
-import numpy as np
-import mxnet as mx
-
-from mxnet.gluon.data import DataLoader
-
-import gluonnlp
-from gluonnlp.data import BERTTokenizer, BERTSentenceTransform, BERTSPTokenizer
-from gluonnlp.base import get_home_dir
-
-try:
- from data.embedding import BertEmbeddingDataset
-except ImportError:
- from .data.embedding import BertEmbeddingDataset
-
-
-__all__ = ['BertEmbedding']
-
-
-logger = logging.getLogger(__name__)
-
-
-class BertEmbedding:
- """
- Encoding from BERT model.
-
- Parameters
- ----------
- ctx : Context, default mx.cpu()
- The device context (CPU or a GPU) on which BertEmbedding runs.
- dtype: str
- data type to use for the model.
- model : str, default bert_12_768_12.
- pre-trained BERT model
- dataset_name : str, default book_corpus_wiki_en_uncased.
- pre-trained model dataset
- params_path: str, default None
- path to a parameters file to load instead of the pretrained model.
- max_seq_length : int, default 25
- max length of each sequence
- batch_size : int, default 256
- batch size
- sentencepiece : str, default None
- Path to the sentencepiece .model file for both tokenization and vocab
- root : str, default '$MXNET_HOME/models' with MXNET_HOME defaults to '~/.mxnet'
- Location for keeping the model parameters.
- """
- def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
- dataset_name='book_corpus_wiki_en_uncased', params_path=None,
- max_seq_length=25, batch_size=256, sentencepiece=None,
- root=os.path.join(get_home_dir(), 'models')):
- self.ctx = ctx
- self.dtype = dtype
- self.max_seq_length = max_seq_length
- self.batch_size = batch_size
- self.dataset_name = dataset_name
-
- # use sentencepiece vocab and a checkpoint
- # we need to set dataset_name to None, otherwise it uses the downloaded vocab
- if params_path and sentencepiece:
- dataset_name = None
- else:
- dataset_name = self.dataset_name
- if sentencepiece:
- vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(sentencepiece)
- else:
- vocab = None
-
- self.bert, self.vocab = gluonnlp.model.get_model(model,
- dataset_name=dataset_name,
- pretrained=params_path is None,
- ctx=self.ctx,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False,
- root=root, vocab=vocab)
-
- self.bert.cast(self.dtype)
- if params_path:
- logger.info('Loading params from %s', params_path)
- self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True, cast_dtype=True)
-
- lower = 'uncased' in self.dataset_name
- if sentencepiece:
- self.tokenizer = BERTSPTokenizer(sentencepiece, self.vocab, lower=lower)
- else:
- self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
- self.transform = BERTSentenceTransform(tokenizer=self.tokenizer,
- max_seq_length=self.max_seq_length,
- pair=False)
-
- def __call__(self, sentences, oov_way='avg'):
- return self.embedding(sentences, oov_way=oov_way)
-
- def embedding(self, sentences, oov_way='avg'):
- """
- Get tokens, tokens embedding
-
- Parameters
- ----------
- sentences : List[str]
- sentences for encoding.
- oov_way : str, default avg.
- use **avg**, **sum** or **last** to get token embedding for those out of
- vocabulary words
-
- Returns
- -------
- List[(List[str], List[ndarray])]
- List of tokens, and tokens embedding
- """
- data_iter = self.data_loader(sentences=sentences)
- batches = []
- for token_ids, valid_length, token_types in data_iter:
- token_ids = token_ids.as_in_context(self.ctx)
- valid_length = valid_length.as_in_context(self.ctx)
- token_types = token_types.as_in_context(self.ctx)
- sequence_outputs = self.bert(token_ids, token_types,
- valid_length.astype(self.dtype))
- for token_id, sequence_output in zip(token_ids.asnumpy(),
- sequence_outputs.asnumpy()):
- batches.append((token_id, sequence_output))
- return self.oov(batches, oov_way)
-
- def data_loader(self, sentences, shuffle=False):
- """Load, tokenize and prepare the input sentences."""
- dataset = BertEmbeddingDataset(sentences, self.transform)
- return DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=shuffle)
-
- def oov(self, batches, oov_way='avg'):
- """
- Merge subword embeddings back into word-level embeddings (OOV handling).
- Also filters out the [CLS] and [SEP] tokens.
-
- Parameters
- ----------
- batches : List[(tokens_id, sequence_outputs)].
- batch token_ids shape is (max_seq_length,),
- sequence_outputs shape is (max_seq_length, dim)
- oov_way : str
- use **avg**, **sum** or **last** to get token embedding for those out of
- vocabulary words
-
- Returns
- -------
- List[(List[str], List[ndarray])]
- List of tokens, and tokens embedding
- """
- sentences = []
- padding_idx, cls_idx, sep_idx = None, None, None
- if self.vocab.padding_token:
- padding_idx = self.vocab[self.vocab.padding_token]
- if self.vocab.cls_token:
- cls_idx = self.vocab[self.vocab.cls_token]
- if self.vocab.sep_token:
- sep_idx = self.vocab[self.vocab.sep_token]
- for token_ids, sequence_outputs in batches:
- tokens = []
- tensors = []
- oov_len = 1
- for token_id, sequence_output in zip(token_ids, sequence_outputs):
- # [PAD] token, sequence is finished.
- if padding_idx and token_id == padding_idx:
- break
- # [CLS], [SEP]
- if cls_idx and token_id == cls_idx:
- continue
- if sep_idx and token_id == sep_idx:
- continue
- token = self.vocab.idx_to_token[token_id]
- if not self.tokenizer.is_first_subword(token):
- tokens.append(token)
- if oov_way == 'last':
- tensors[-1] = sequence_output
- else:
- tensors[-1] += sequence_output
- if oov_way == 'avg':
- oov_len += 1
- else: # iv, avg last oov
- if oov_len > 1:
- tensors[-1] /= oov_len
- oov_len = 1
- tokens.append(token)
- tensors.append(sequence_output)
- if oov_len > 1: # if the whole sentence is one oov, handle this special case
- tensors[-1] /= oov_len
- sentences.append((tokens, tensors))
- return sentences
-
-
-if __name__ == '__main__':
- np.set_printoptions(threshold=5)
- parser = argparse.ArgumentParser(description='Get embeddings from BERT',
- formatter_class=argparse.RawTextHelpFormatter)
- parser.add_argument('--gpu', type=int, default=None,
- help='Id of the GPU to use. If not set, the CPU is used.')
- parser.add_argument('--dtype', type=str, default='float32', help='data dtype')
- parser.add_argument('--model', type=str, default='bert_12_768_12',
- help='pre-trained model')
- parser.add_argument('--dataset_name', type=str, default='book_corpus_wiki_en_uncased',
- help='name of the dataset used for pre-training')
- parser.add_argument('--params_path', type=str, default=None,
- help='path to a params file to load instead of the pretrained model.')
- parser.add_argument('--sentencepiece', type=str, default=None,
- help='Path to the sentencepiece .model file for tokenization and vocab.')
- parser.add_argument('--max_seq_length', type=int, default=128,
- help='max length of each sequence')
- parser.add_argument('--batch_size', type=int, default=256,
- help='batch size')
- parser.add_argument('--oov_way', type=str, default='avg',
- help='how to handle subword embeddings\n'
- 'avg: average all subword embeddings to represent the original token\n'
- 'sum: sum all subword embeddings to represent the original token\n'
- 'last: use last subword embeddings to represent the original token\n')
- parser.add_argument('--sentences', type=str, nargs='+', default=None,
- help='sentence for encoding')
- parser.add_argument('--file', type=str, default=None,
- help='file for encoding')
- parser.add_argument('--verbose', action='store_true', help='verbose logging')
- args = parser.parse_args()
-
- level = logging.DEBUG if args.verbose else logging.INFO
- logging.getLogger().setLevel(level)
- logging.info(args)
-
- if args.gpu is not None:
- context = mx.gpu(args.gpu)
- else:
- context = mx.cpu()
- bert_embedding = BertEmbedding(ctx=context, model=args.model, dataset_name=args.dataset_name,
- max_seq_length=args.max_seq_length, batch_size=args.batch_size,
- params_path=args.params_path, sentencepiece=args.sentencepiece)
- result = []
- sents = []
- if args.sentences:
- sents = args.sentences
- result = bert_embedding(sents, oov_way=args.oov_way)
- elif args.file:
- with io.open(args.file, 'r', encoding='utf8') as in_file:
- for line in in_file:
- sents.append(line.strip())
- result = bert_embedding(sents, oov_way=args.oov_way)
- else:
- logger.error('Please specify --sentence or --file')
-
- if result:
- for _, embeddings in zip(sents, result):
- sent, tokens_embedding = embeddings
- print('Text: {}'.format(' '.join(sent)))
- print('Tokens embedding: {}'.format(tokens_embedding))
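For reference, a hypothetical usage of the removed BertEmbedding helper, mirroring the __main__ block above. It downloads the pretrained bert_12_768_12 weights on first use; the import path assumes the old scripts/bert layout and the example sentence is made up.

import mxnet as mx

from embedding import BertEmbedding   # old scripts/bert layout (assumption)

bert_embedding = BertEmbedding(ctx=mx.cpu(), max_seq_length=32, batch_size=8)
sentences = ['gluonnlp makes bert embeddings easy .']
for tokens, vectors in bert_embedding(sentences, oov_way='avg'):
    for token, vector in zip(tokens, vectors):
        print(token, vector.shape)     # (768,) per token for the base model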
diff --git a/scripts/bert/export.py b/scripts/bert/export.py
deleted file mode 100644
index 92778d7975..0000000000
--- a/scripts/bert/export.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-"""
-Export the BERT Model for Deployment
-====================================
-
-This script exports the BERT model to a hybrid model serialized as a symbol.json file,
- which is suitable for deployment or for use with the MXNet Module API.
-
-@article{devlin2018bert,
- title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
- author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
- journal={arXiv preprint arXiv:1810.04805},
- year={2018}
-}
-"""
-
-import argparse
-import logging
-import warnings
-import os
-import time
-
-import mxnet as mx
-import gluonnlp as nlp
-from gluonnlp.model import get_model, BERTClassifier
-from model.qa import BertForQA
-
-nlp.utils.check_version('0.8.1')
-
-parser = argparse.ArgumentParser(description='Export hybrid BERT base model.')
-
-parser.add_argument('--model_parameters',
- type=str,
- default=None,
- help='The model parameter file saved from training.')
-
-parser.add_argument('--model_name',
- type=str,
- default='bert_12_768_12',
- choices=['bert_12_768_12', 'bert_24_1024_16'],
- help='BERT model name. Options are "bert_12_768_12" and "bert_24_1024_16"')
-
-parser.add_argument('--task',
- type=str,
- choices=['classification', 'regression', 'question_answering'],
- required=True,
- help='Task to export. Options are "classification", "regression", '
- '"question_answering"')
-
-parser.add_argument('--dataset_name',
- type=str,
- default='book_corpus_wiki_en_uncased',
- choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased',
- 'wiki_multilingual_uncased', 'wiki_multilingual_cased',
- 'wiki_cn_cased'],
- help='BERT dataset name. Options include '
- '"book_corpus_wiki_en_uncased", "book_corpus_wiki_en_cased", '
- '"wiki_multilingual_uncased", "wiki_multilingual_cased", '
- '"wiki_cn_cased"')
-
-parser.add_argument('--output_dir',
- type=str,
- default='./output_dir',
- help='The directory where the exported model symbol will be created. '
- 'The default is ./output_dir')
-
-parser.add_argument('--seq_length',
- type=int,
- default=64,
- help='The maximum total input sequence length after WordPiece tokenization. '
- 'Sequences longer than this will be truncated, and sequences shorter '
- 'than this will be padded. The default is 64.')
-
-parser.add_argument('--dropout',
- type=float,
- default=0.1,
- help='The dropout probability for the classification/regression head.')
-
-args = parser.parse_args()
-
-# create output dir
-output_dir = args.output_dir
-nlp.utils.mkdir(output_dir)
-
-###############################################################################
-# Logging #
-###############################################################################
-
-log = logging.getLogger('gluonnlp')
-log.setLevel(logging.DEBUG)
-formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s',
- datefmt='%H:%M:%S')
-fh = logging.FileHandler(os.path.join(args.output_dir, 'hybrid_export_bert.log'), mode='w')
-fh.setLevel(logging.INFO)
-fh.setFormatter(formatter)
-console = logging.StreamHandler()
-console.setLevel(logging.INFO)
-console.setFormatter(formatter)
-log.addHandler(console)
-log.addHandler(fh)
-log.info(args)
-
-###############################################################################
-# Hybridize the model #
-###############################################################################
-
-seq_length = args.seq_length
-
-if args.task == 'classification':
- bert, _ = get_model(
- name=args.model_name,
- dataset_name=args.dataset_name,
- pretrained=False,
- use_pooler=True,
- use_decoder=False,
- use_classifier=False)
- net = BERTClassifier(bert, num_classes=2, dropout=args.dropout)
-elif args.task == 'regression':
- bert, _ = get_model(
- name=args.model_name,
- dataset_name=args.dataset_name,
- pretrained=False,
- use_pooler=True,
- use_decoder=False,
- use_classifier=False)
- net = BERTClassifier(bert, num_classes=1, dropout=args.dropout)
-elif args.task == 'question_answering':
- bert, _ = get_model(
- name=args.model_name,
- dataset_name=args.dataset_name,
- pretrained=False,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False)
- net = BertForQA(bert)
-else:
- raise ValueError('unknown task: %s'%args.task)
-
-if args.model_parameters:
- net.load_parameters(args.model_parameters)
-else:
- net.initialize()
- warnings.warn('--model_parameters is not provided. The parameter checkpoint (.params) '
- 'file will be created based on default parameter initialization.')
-
-net.hybridize(static_alloc=True, static_shape=True)
-
-###############################################################################
-# Prepare dummy input data #
-###############################################################################
-
-test_batch_size = 1
-
-inputs = mx.nd.arange(test_batch_size * seq_length)
-inputs = inputs.reshape(shape=(test_batch_size, seq_length))
-token_types = mx.nd.zeros_like(inputs)
-valid_length = mx.nd.arange(test_batch_size)
-batch = inputs, token_types, valid_length
-
-def export(batch, prefix):
- """Export the model."""
- log.info('Exporting the model ... ')
- inputs, token_types, valid_length = batch
- net(inputs, token_types, valid_length)
- net.export(prefix, epoch=0)
- assert os.path.isfile(prefix + '-symbol.json')
- assert os.path.isfile(prefix + '-0000.params')
-
-def infer(prefix):
- """Evaluate the model on a mini-batch."""
- log.info('Start inference ... ')
-
- # import with SymbolBlock. Alternatively, you can use Module.load APIs.
- imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json',
- ['data0', 'data1', 'data2'],
- prefix + '-0000.params')
-
- # exported model should be length-agnostic. Using a different seq_length should work
- inputs = mx.nd.arange(test_batch_size * (seq_length + 10))
- inputs = inputs.reshape(shape=(test_batch_size, seq_length + 10))
- token_types = mx.nd.zeros_like(inputs)
- valid_length = mx.nd.arange(test_batch_size)
-
- # run forward inference
- imported_net(inputs, token_types, valid_length)
- mx.nd.waitall()
-
- # benchmark speed after warmup
- tic = time.time()
- num_trials = 10
- for _ in range(num_trials):
- imported_net(inputs, token_types, valid_length)
- mx.nd.waitall()
- toc = time.time()
- log.info('Batch size={}, Throughput={:.2f} batches/s'
- .format(test_batch_size, num_trials / (toc - tic)))
-
-
-###############################################################################
-# Export the model #
-###############################################################################
-if __name__ == '__main__':
- prefix = os.path.join(args.output_dir, args.task)
- export(batch, prefix)
- infer(prefix)
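For reference, a minimal sketch of consuming the exported artifacts outside this script, mirroring infer() above. The prefix assumes the default --output_dir and --task classification; adjust it to whatever export() actually wrote.

import mxnet as mx

prefix = './output_dir/classification'
net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json',
                                      ['data0', 'data1', 'data2'],
                                      prefix + '-0000.params')
batch_size, seq_len = 1, 96                  # the exported graph is length-agnostic
inputs = mx.nd.zeros((batch_size, seq_len))
token_types = mx.nd.zeros_like(inputs)
valid_length = mx.nd.array([seq_len])
out = net(inputs, token_types, valid_length)
print(out.shape)                             # (1, 2) for the two-class classification head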
diff --git a/scripts/bert/finetune_classifier.py b/scripts/bert/finetune_classifier.py
deleted file mode 100644
index 8a400fb8b9..0000000000
--- a/scripts/bert/finetune_classifier.py
+++ /dev/null
@@ -1,704 +0,0 @@
-"""
- Sentence Pair Classification with Bidirectional Encoder Representations from Transformers
- =========================================================================================
-
- This example shows how to fine-tune a model with pre-trained BERT parameters for
- sentence pair classification with the Gluon NLP Toolkit.
-
-@article{devlin2018bert,
- title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
- author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
- journal={arXiv preprint arXiv:1810.04805},
- year={2018}
-}
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-
-import io
-import os
-import time
-import argparse
-import random
-import logging
-import warnings
-from functools import partial
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-from mxnet.contrib.amp import amp
-import gluonnlp as nlp
-from gluonnlp.data import BERTTokenizer
-from gluonnlp.data.classification import get_task
-from gluonnlp.data.bert.glue import truncate_seqs_equal, concat_sequences
-from gluonnlp.model import BERTClassifier, RoBERTaClassifier
-from gluonnlp.calibration import BertLayerCollector
-
-nlp.utils.check_version('0.9', warning_only=True)
-
-parser = argparse.ArgumentParser(
- description='BERT fine-tune examples for classification/regression tasks.',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
-parser.add_argument('--optimizer', type=str, default='bertadam',
- help='The optimizer to be used for training')
-parser.add_argument('--epochs', type=int, default=3, help='number of epochs.')
-parser.add_argument(
- '--training_steps', type=int, help='The total training steps. '
- 'Note that if specified, epochs will be ignored.')
-parser.add_argument(
- '--batch_size',
- type=int,
- default=32,
- help='Batch size. Number of examples per gpu in a minibatch.')
-parser.add_argument(
- '--dev_batch_size',
- type=int,
- default=8,
- help='Batch size for dev set and test set')
-parser.add_argument(
- '--lr',
- type=float,
- default=3e-5,
- help='Initial learning rate')
-parser.add_argument(
- '--epsilon',
- type=float,
- default=1e-6,
- help='Small value to avoid division by 0'
-)
-parser.add_argument(
- '--warmup_ratio',
- type=float,
- default=0.1,
- help='Ratio of warmup steps in the learning rate schedule (linear warmup, then linear decay)')
-parser.add_argument(
- '--log_interval',
- type=int,
- default=10,
- help='report interval')
-parser.add_argument(
- '--max_len',
- type=int,
- default=128,
- help='Maximum length of the sentence pairs')
-parser.add_argument(
- '--round_to', type=int, default=None,
- help='The length of padded sequences will be rounded up to be a multiple of this argument. '
- 'When round_to is set to 8, training throughput may increase for mixed precision '
- 'training on GPUs with tensor cores.')
-parser.add_argument(
- '--seed', type=int, default=2, help='Random seed')
-parser.add_argument(
- '--accumulate',
- type=int,
- default=None,
- help='The number of batches for gradients accumulation to simulate large batch size. '
- 'Default is None')
-parser.add_argument(
- '--gpu', type=int, default=None, help='Which gpu for finetuning.')
-parser.add_argument(
- '--task_name',
- type=str,
- choices=['MRPC', 'QNLI', 'RTE', 'STS-B', 'CoLA',
- 'MNLI', 'WNLI', 'SST', 'XNLI', 'LCQMC', 'ChnSentiCorp'],
- help='The name of the task to fine-tune. Choices include MRPC, QNLI, RTE, '
- 'STS-B, CoLA, MNLI, WNLI, SST, XNLI, LCQMC and ChnSentiCorp.')
-parser.add_argument(
- '--bert_model',
- type=str,
- default='bert_12_768_12',
- choices=['bert_12_768_12', 'bert_24_1024_16', 'roberta_12_768_12', 'roberta_24_1024_16'],
- help='The name of pre-trained BERT model to fine-tune')
-parser.add_argument(
- '--bert_dataset',
- type=str,
- default='book_corpus_wiki_en_uncased',
- choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased',
- 'openwebtext_book_corpus_wiki_en_uncased', 'wiki_multilingual_uncased',
- 'wiki_multilingual_cased', 'wiki_cn_cased',
- 'openwebtext_ccnews_stories_books_cased'],
- help='The dataset BERT pre-trained with.')
-parser.add_argument(
- '--pretrained_bert_parameters',
- type=str,
- default=None,
- help='Pre-trained bert model parameter file.')
-parser.add_argument(
- '--model_parameters',
- type=str,
- default=None,
- help='A parameter file for the model that is loaded into the model'
- ' before training/inference. It is different from the parameter'
- ' file written after the model is trained.')
-parser.add_argument(
- '--output_dir',
- type=str,
- default='./output_dir',
- help='The output directory where the model params will be written.')
-parser.add_argument(
- '--only_inference',
- action='store_true',
- help='If set, we skip training and only perform inference on dev and test data.')
-parser.add_argument(
- '--dtype',
- type=str,
- default='float32',
- choices=['float32', 'float16'],
- help='The data type for training.')
-parser.add_argument(
- '--early_stop',
- type=int,
- default=None,
- help='Whether to perform early stopping based on the metric on dev set. '
- 'The provided value is the patience. ')
-parser.add_argument('--deploy', action='store_true',
- help='whether load static model for deployment')
-parser.add_argument('--model_prefix', type=str, required=False,
- help='load static model as hybridblock.')
-parser.add_argument('--only_calibration', action='store_true',
- help='quantize model')
-parser.add_argument('--num_calib_batches', type=int, default=5,
- help='number of batches for calibration')
-parser.add_argument('--quantized_dtype', type=str, default='auto',
- choices=['auto', 'int8', 'uint8'],
- help='quantization destination data type for input data')
-parser.add_argument('--calib_mode', type=str, default='customize',
- choices=['none', 'naive', 'entropy', 'customize'],
- help='calibration mode used for generating calibration table '
- 'for the quantized symbol.')
-
-args = parser.parse_args()
-
-
-log = logging.getLogger()
-log.setLevel(logging.INFO)
-
-logging.captureWarnings(True)
-fh = logging.FileHandler('log_{0}.txt'.format(args.task_name))
-formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s',
- datefmt='%H:%M:%S')
-fh.setLevel(logging.INFO)
-fh.setFormatter(formatter)
-console = logging.StreamHandler()
-console.setLevel(logging.INFO)
-console.setFormatter(formatter)
-log.addHandler(console)
-log.addHandler(fh)
-logging.info(args)
-
-batch_size = args.batch_size
-dev_batch_size = args.dev_batch_size
-task_name = args.task_name
-lr = args.lr
-epsilon = args.epsilon
-accumulate = args.accumulate
-log_interval = args.log_interval * accumulate if accumulate else args.log_interval
-if accumulate:
- logging.info('Using gradient accumulation. Effective batch size = ' \
- 'batch_size * accumulate = %d', accumulate * batch_size)
-
-# random seed
-np.random.seed(args.seed)
-random.seed(args.seed)
-mx.random.seed(args.seed)
-
-ctx = mx.cpu() if args.gpu is None else mx.gpu(args.gpu)
-
-task = get_task(task_name)
-
-# data type with mixed precision training
-if args.dtype == 'float16':
- amp.init()
-
-# model and loss
-only_inference = args.only_inference
-model_name = args.bert_model
-dataset = args.bert_dataset
-pretrained_bert_parameters = args.pretrained_bert_parameters
-model_parameters = args.model_parameters
-
-# load symbolic model
-deploy = args.deploy
-model_prefix = args.model_prefix
-
-if only_inference and not model_parameters:
- warnings.warn('model_parameters is not set. '
- 'Randomly initialized model will be used for inference.')
-
-get_pretrained = not (pretrained_bert_parameters is not None or model_parameters is not None)
-
-use_roberta = 'roberta' in model_name
-get_model_params = {
- 'name': model_name,
- 'dataset_name': dataset,
- 'pretrained': get_pretrained,
- 'ctx': ctx,
- 'use_decoder': False,
- 'use_classifier': False,
-}
-# RoBERTa does not contain parameters for sentence pair classification
-if not use_roberta:
- get_model_params['use_pooler'] = True
-
-bert, vocabulary = nlp.model.get_model(**get_model_params)
-
-# initialize the rest of the parameters
-initializer = mx.init.Normal(0.02)
-# STS-B is a regression task.
-# STSBTask().class_labels returns None
-do_regression = not task.class_labels
-if do_regression:
- num_classes = 1
- loss_function = gluon.loss.L2Loss()
-else:
- num_classes = len(task.class_labels)
- loss_function = gluon.loss.SoftmaxCELoss()
-# reuse the BERTClassifier class with num_classes=1 for regression
-if use_roberta:
- model = RoBERTaClassifier(bert, dropout=0.0, num_classes=num_classes)
-else:
- model = BERTClassifier(bert, dropout=0.1, num_classes=num_classes)
-# initialize classifier
-if not model_parameters:
- model.classifier.initialize(init=initializer, ctx=ctx)
-
-# load checkpointing
-output_dir = args.output_dir
-if pretrained_bert_parameters:
- logging.info('loading bert params from %s', pretrained_bert_parameters)
- nlp.utils.load_parameters(model.bert, pretrained_bert_parameters, ctx=ctx, ignore_extra=True,
- cast_dtype=True)
-if model_parameters:
- logging.info('loading model params from %s', model_parameters)
- nlp.utils.load_parameters(model, model_parameters, ctx=ctx, cast_dtype=True)
-nlp.utils.mkdir(output_dir)
-
-logging.debug(model)
-model.hybridize(static_alloc=True)
-loss_function.hybridize(static_alloc=True)
-
-if deploy:
- logging.info('load symbol file directly as SymbolBlock for model deployment')
- model = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix),
- ['data0', 'data1', 'data2'],
- '{}-0000.params'.format(args.model_prefix))
- model.hybridize(static_alloc=True, static_shape=True)
-
-# data processing
-do_lower_case = 'uncased' in dataset
-if use_roberta:
- bert_tokenizer = nlp.data.GPT2BPETokenizer()
-else:
- bert_tokenizer = BERTTokenizer(vocabulary, lower=do_lower_case)
-
-# calibration config
-only_calibration = args.only_calibration
-num_calib_batches = args.num_calib_batches
-quantized_dtype = args.quantized_dtype
-calib_mode = args.calib_mode
-
-def convert_examples_to_features(example, tokenizer=None, truncate_length=512, cls_token=None,
- sep_token=None, class_labels=None, label_alias=None, vocab=None,
- is_test=False):
- """convert glue examples into necessary features"""
- if not is_test:
- label_dtype = 'int32' if class_labels else 'float32'
- # get the label
- label = example[-1]
- example = example[:-1]
- #create label maps if classification task
- if class_labels:
- label_map = {}
- for (i, l) in enumerate(class_labels):
- label_map[l] = i
- if label_alias:
- for key in label_alias:
- label_map[key] = label_map[label_alias[key]]
- label = label_map[label]
- label = np.array([label], dtype=label_dtype)
-
- # tokenize raw text
- tokens_raw = [tokenizer(l) for l in example]
- # truncate to the truncate_length,
- tokens_trun = truncate_seqs_equal(tokens_raw, truncate_length)
- # concatenate the sequences with special tokens
- tokens_trun[0] = [cls_token] + tokens_trun[0]
- tokens, segment_ids, _ = concat_sequences(tokens_trun, [[sep_token]] * len(tokens_trun))
- # convert the token to ids
- input_ids = vocab[tokens]
- valid_length = len(input_ids)
- if not is_test:
- return input_ids, segment_ids, valid_length, label
- else:
- return input_ids, segment_ids, valid_length
-
-
-def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab):
- """Train/eval Data preparation function."""
- label_dtype = 'int32' if task.class_labels else 'float32'
- truncate_length = max_len - 3 if task.is_pair else max_len - 2
- trans = partial(convert_examples_to_features, tokenizer=tokenizer,
- truncate_length=truncate_length,
- cls_token=vocab.cls_token if not use_roberta else vocab.bos_token,
- sep_token=vocab.sep_token if not use_roberta else vocab.eos_token,
- class_labels=task.class_labels, label_alias=task.label_alias, vocab=vocab)
-
- # data train
- # task.dataset_train returns (segment_name, dataset)
- train_tsv = task.dataset_train()[1]
- data_train = mx.gluon.data.SimpleDataset(list(map(trans, train_tsv)))
- data_train_len = data_train.transform(lambda _, segment_ids, valid_length, label: valid_length,
- lazy=False)
- # bucket sampler for training
- pad_val = vocabulary[vocabulary.padding_token]
- batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to), # input
- nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to), # segment
- nlp.data.batchify.Stack(), # length
- nlp.data.batchify.Stack(label_dtype)) # label
- batch_sampler = nlp.data.sampler.FixedBucketSampler(data_train_len, batch_size=batch_size,
- num_buckets=10, ratio=0, shuffle=True)
- # data loader for training
- loader_train = gluon.data.DataLoader(dataset=data_train, num_workers=4,
- batch_sampler=batch_sampler, batchify_fn=batchify_fn)
-
- # data dev. For MNLI, more than one dev set is available
- dev_tsv = task.dataset_dev()
- dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv]
- loader_dev_list = []
- for segment, data in dev_tsv_list:
- data_dev = mx.gluon.data.SimpleDataset(list(map(trans, data)))
- loader_dev = mx.gluon.data.DataLoader(data_dev, batch_size=dev_batch_size, num_workers=4,
- shuffle=False, batchify_fn=batchify_fn)
- loader_dev_list.append((segment, loader_dev))
-
- # batchify for data test
- test_batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),
- nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to),
- nlp.data.batchify.Stack())
- # transform for data test
- test_trans = partial(convert_examples_to_features, tokenizer=tokenizer, truncate_length=max_len,
- cls_token=vocab.cls_token if not use_roberta else vocab.bos_token,
- sep_token=vocab.sep_token if not use_roberta else vocab.eos_token,
- class_labels=None, is_test=True, vocab=vocab)
-
- # data test. For MNLI, more than one test set is available
- test_tsv = task.dataset_test()
- test_tsv_list = test_tsv if isinstance(test_tsv, list) else [test_tsv]
- loader_test_list = []
- for segment, data in test_tsv_list:
- data_test = mx.gluon.data.SimpleDataset(list(map(test_trans, data)))
- loader_test = mx.gluon.data.DataLoader(data_test, batch_size=dev_batch_size, num_workers=4,
- shuffle=False, batchify_fn=test_batchify_fn)
- loader_test_list.append((segment, loader_test))
- return loader_train, loader_dev_list, loader_test_list, len(data_train)
-
-
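For reference, a standalone toy illustration of the bucketing and batchify pattern used in preprocess_data() above: variable-length sequences are bucketed by length and padded per batch. The token ids, segment ids and labels are made up.

import gluonnlp as nlp
from mxnet import gluon

data = [([1, 2, 3], [0, 0, 0], 3, 1),
        ([4, 5], [0, 0], 2, 0),
        ([6, 7, 8, 9, 10], [0, 0, 0, 1, 1], 5, 1)]
dataset = gluon.data.SimpleDataset(data)
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, pad_val=0),    # input ids, padded per batch
    nlp.data.batchify.Pad(axis=0, pad_val=0),    # segment ids
    nlp.data.batchify.Stack(),                   # valid lengths
    nlp.data.batchify.Stack('int32'))            # labels
sampler = nlp.data.sampler.FixedBucketSampler([len(x[0]) for x in data],
                                              batch_size=2, num_buckets=2, shuffle=True)
loader = gluon.data.DataLoader(dataset, batch_sampler=sampler, batchify_fn=batchify_fn)
for ids, segs, lengths, labels in loader:
    print(ids.shape, lengths.asnumpy(), labels.asnumpy())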
-# Get the loader.
-logging.info('processing dataset...')
-train_data, dev_data_list, test_data_list, num_train_examples = preprocess_data(
- bert_tokenizer, task, batch_size, dev_batch_size, args.max_len, vocabulary)
-
-def calibration(net, dev_data_list, num_calib_batches, quantized_dtype, calib_mode):
- """calibration function on the dev dataset."""
- assert len(dev_data_list) == 1, \
- 'Currently, MNLI is not supported.'
- assert ctx == mx.cpu(), \
- 'Currently only supports CPU with MKL-DNN backend.'
- logging.info('Now we are doing calibration on dev with %s.', ctx)
- for _, dev_data in dev_data_list:
- collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=logging)
- num_calib_examples = dev_batch_size * num_calib_batches
- net = mx.contrib.quantization.quantize_net_v2(net, quantized_dtype=quantized_dtype,
- exclude_layers=[],
- quantize_mode='smart',
- quantize_granularity='channel-wise',
- calib_data=dev_data,
- calib_mode=calib_mode,
- num_calib_examples=num_calib_examples,
- ctx=ctx,
- LayerOutputCollector=collector,
- logger=logging)
- # save params
- ckpt_name = 'model_bert_{0}_quantized_{1}'.format(task_name, calib_mode)
- params_saved = os.path.join(output_dir, ckpt_name)
- net.export(params_saved, epoch=0)
- logging.info('Saving quantized model at %s', output_dir)
-
-
-def test(loader_test, segment):
- """Inference function on the test dataset."""
- logging.info('Now we are doing testing on %s with %s.', segment, ctx)
-
- tic = time.time()
- results = []
- for _, seqs in enumerate(loader_test):
- input_ids, segment_ids, valid_length = seqs
- input_ids = input_ids.as_in_context(ctx)
- valid_length = valid_length.as_in_context(ctx).astype('float32')
- if use_roberta:
- out = model(input_ids, valid_length)
- else:
- out = model(input_ids, segment_ids.as_in_context(ctx), valid_length)
- if not task.class_labels:
- # regression task
- for result in out.asnumpy().reshape(-1).tolist():
- results.append('{:.3f}'.format(result))
- else:
- # classification task
- indices = mx.nd.topk(out, k=1, ret_typ='indices', dtype='int32').asnumpy()
- for index in indices:
- results.append(task.class_labels[int(index)])
-
- mx.nd.waitall()
- toc = time.time()
- logging.info('Time cost=%.2fs, throughput=%.2f samples/s', toc - tic,
- dev_batch_size * len(loader_test) / (toc - tic))
- # write result to a file.
- segment = segment.replace('_mismatched', '-mm')
- segment = segment.replace('_matched', '-m')
- segment = segment.replace('SST', 'SST-2')
- filename = args.task_name + segment.replace('test', '') + '.tsv'
- test_path = os.path.join(args.output_dir, filename)
- with io.open(test_path, 'w', encoding='utf-8') as f:
- f.write(u'index\tprediction\n')
- for i, pred in enumerate(results):
- f.write(u'%d\t%s\n' % (i, str(pred)))
-
-
-def log_train(batch_id, batch_num, metric, step_loss, log_interval, epoch_id, learning_rate):
- """Generate and print out the log message for training. """
- metric_nm, metric_val = metric.get()
- if not isinstance(metric_nm, list):
- metric_nm, metric_val = [metric_nm], [metric_val]
-
- train_str = '[Epoch %d Batch %d/%d] loss=%.4f, lr=%.7f, metrics:' + \
- ','.join([i + ':%.4f' for i in metric_nm])
- logging.info(train_str, epoch_id + 1, batch_id + 1, batch_num, step_loss / log_interval,
- learning_rate, *metric_val)
-
-
-def log_eval(batch_id, batch_num, metric, step_loss, log_interval):
- """Generate and print out the log message for inference. """
- metric_nm, metric_val = metric.get()
- if not isinstance(metric_nm, list):
- metric_nm, metric_val = [metric_nm], [metric_val]
-
- eval_str = '[Batch %d/%d] loss=%.4f, metrics:' + \
- ','.join([i + ':%.4f' for i in metric_nm])
- logging.info(eval_str, batch_id + 1, batch_num, step_loss / log_interval, *metric_val)
-
-
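For reference, the learning rate schedule applied step by step inside train() below, factored out as a standalone function (the function name is illustrative only): linear warmup to the base rate, followed by linear decay to zero.

def linear_warmup_then_decay(step_num, base_lr, num_train_steps, warmup_ratio=0.1):
    """Learning rate at a given step: linear warmup, then linear decay to zero."""
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    if step_num < num_warmup_steps:
        return base_lr * step_num / num_warmup_steps
    offset = (step_num - num_warmup_steps) / (num_train_steps - num_warmup_steps)
    return base_lr - offset * base_lr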
-def train(metric):
- """Training function."""
- if not only_inference:
- logging.info('Now we are doing BERT classification training on %s!', ctx)
-
- all_model_params = model.collect_params()
- optimizer_params = {'learning_rate': lr, 'epsilon': epsilon, 'wd': 0.01}
- trainer = gluon.Trainer(all_model_params, args.optimizer, optimizer_params,
- update_on_kvstore=False)
- if args.dtype == 'float16':
- amp.init_trainer(trainer)
-
- epoch_number = args.epochs
- step_size = batch_size * accumulate if accumulate else batch_size
- num_train_steps = int(num_train_examples / step_size * args.epochs)
- if args.training_steps:
- num_train_steps = args.training_steps
- epoch_number = 9999
-
- logging.info('training steps=%d', num_train_steps)
- warmup_ratio = args.warmup_ratio
- num_warmup_steps = int(num_train_steps * warmup_ratio)
- step_num = 0
-
- # Do not apply weight decay on LayerNorm and bias terms
- for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
- v.wd_mult = 0.0
- # Collect differentiable parameters
- params = [p for p in all_model_params.values() if p.grad_req != 'null']
-
- # Set grad_req if gradient accumulation is required
- if accumulate and accumulate > 1:
- for p in params:
- p.grad_req = 'add'
- # track best eval score
- metric_history = []
- best_metric = None
- patience = args.early_stop
-
- tic = time.time()
- finish_flag = False
- for epoch_id in range(epoch_number):
- if args.early_stop and patience == 0:
- logging.info('Early stopping at epoch %d', epoch_id)
- break
- if finish_flag:
- break
- if not only_inference:
- metric.reset()
- step_loss = 0
- tic = time.time()
- all_model_params.zero_grad()
-
- for batch_id, seqs in enumerate(train_data):
- # learning rate schedule
- if step_num < num_warmup_steps:
- new_lr = lr * step_num / num_warmup_steps
- else:
- non_warmup_steps = step_num - num_warmup_steps
- offset = non_warmup_steps / (num_train_steps - num_warmup_steps)
- new_lr = lr - offset * lr
- trainer.set_learning_rate(new_lr)
-
- # forward and backward
- with mx.autograd.record():
- input_ids, segment_ids, valid_length, label = seqs
- input_ids = input_ids.as_in_context(ctx)
- valid_length = valid_length.as_in_context(ctx).astype('float32')
- label = label.as_in_context(ctx)
- if use_roberta:
- out = model(input_ids, valid_length)
- else:
- out = model(input_ids, segment_ids.as_in_context(ctx), valid_length)
- ls = loss_function(out, label).mean()
- if args.dtype == 'float16':
- with amp.scale_loss(ls, trainer) as scaled_loss:
- mx.autograd.backward(scaled_loss)
- else:
- ls.backward()
-
- # update
- if not accumulate or (batch_id + 1) % accumulate == 0:
- trainer.allreduce_grads()
- nlp.utils.clip_grad_global_norm(params, 1)
- trainer.update(accumulate if accumulate else 1)
- step_num += 1
- if accumulate and accumulate > 1:
- # set grad to zero for gradient accumulation
- all_model_params.zero_grad()
-
- step_loss += ls.asscalar()
- if not do_regression:
- label = label.reshape((-1))
- metric.update([label], [out])
- if (batch_id + 1) % (args.log_interval) == 0:
- log_train(batch_id, len(train_data), metric, step_loss, args.log_interval,
- epoch_id, trainer.learning_rate)
- step_loss = 0
- if step_num >= num_train_steps:
- logging.info('Finish training step: %d', step_num)
- finish_flag = True
- break
- mx.nd.waitall()
-
- # inference on dev data
- for segment, dev_data in dev_data_list:
- metric_nm, metric_val = evaluate(dev_data, metric, segment)
- if best_metric is None or metric_val >= best_metric:
- best_metric = metric_val
- patience = args.early_stop
- else:
- if args.early_stop is not None:
- patience -= 1
- metric_history.append((epoch_id, metric_nm, metric_val))
-
- if not only_inference:
- # save params
- ckpt_name = 'model_bert_{0}_{1}.params'.format(task_name, epoch_id)
- params_saved = os.path.join(output_dir, ckpt_name)
-
- nlp.utils.save_parameters(model, params_saved)
- logging.info('params saved in: %s', params_saved)
- toc = time.time()
- logging.info('Time cost=%.2fs', toc - tic)
- tic = toc
-
- if not only_inference:
- # we choose the best model based on metric[0],
- # assuming higher score stands for better model quality
- metric_history.sort(key=lambda x: x[2][0], reverse=True)
- epoch_id, metric_nm, metric_val = metric_history[0]
- ckpt_name = 'model_bert_{0}_{1}.params'.format(task_name, epoch_id)
- params_saved = os.path.join(output_dir, ckpt_name)
- nlp.utils.load_parameters(model, params_saved)
- metric_str = 'Best model at epoch {}. Validation metrics:'.format(epoch_id)
- metric_str += ','.join([i + ':%.4f' for i in metric_nm])
- logging.info(metric_str, *metric_val)
-
- # inference on test data
- for segment, test_data in test_data_list:
- test(test_data, segment)
-
-
-def evaluate(loader_dev, metric, segment):
- """Evaluate the model on validation dataset."""
- logging.info('Now we are doing evaluation on %s with %s.', segment, ctx)
- metric.reset()
- step_loss = 0
- tic = time.time()
- for batch_id, seqs in enumerate(loader_dev):
- input_ids, segment_ids, valid_length, label = seqs
- input_ids = input_ids.as_in_context(ctx)
- valid_length = valid_length.as_in_context(ctx).astype('float32')
- label = label.as_in_context(ctx)
- if use_roberta:
- out = model(input_ids, valid_length)
- else:
- out = model(input_ids, segment_ids.as_in_context(ctx), valid_length)
-
- ls = loss_function(out, label).mean()
- step_loss += ls.asscalar()
- if not do_regression:
- label = label.reshape((-1))
- metric.update([label], [out])
- if (batch_id + 1) % (args.log_interval) == 0:
- log_eval(batch_id, len(loader_dev), metric, step_loss, args.log_interval)
- step_loss = 0
-
- metric_nm, metric_val = metric.get()
- if not isinstance(metric_nm, list):
- metric_nm, metric_val = [metric_nm], [metric_val]
- metric_str = 'validation metrics:' + ','.join([i + ':%.4f' for i in metric_nm])
- logging.info(metric_str, *metric_val)
-
- mx.nd.waitall()
- toc = time.time()
- logging.info('Time cost=%.2fs, throughput=%.2f samples/s', toc - tic,
- dev_batch_size * len(loader_dev) / (toc - tic))
- return metric_nm, metric_val
-
-
-if __name__ == '__main__':
- if only_calibration:
- try:
- calibration(model,
- dev_data_list,
- num_calib_batches,
- quantized_dtype,
- calib_mode)
- except AttributeError:
- nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx)
- warnings.warn('INT8 Quantization for BERT needs mxnet-mkl >= 1.6.0b20200115')
- else:
- train(task.metrics)
diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py
deleted file mode 100644
index b807123cd4..0000000000
--- a/scripts/bert/finetune_squad.py
+++ /dev/null
@@ -1,862 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-"""
-SQuAD with Bidirectional Encoder Representations from Transformers
-==================================================================
-
-This example shows how to fine-tune a model with pre-trained BERT parameters for
-SQuAD, with the Gluon NLP Toolkit.
-
-@article{devlin2018bert,
- title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
- author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
- journal={arXiv preprint arXiv:1810.04805},
- year={2018}
-}
-"""
-
-import argparse
-import collections
-import json
-import logging
-import os
-import io
-import random
-import time
-import warnings
-import itertools
-import pickle
-import multiprocessing as mp
-from functools import partial
-
-import numpy as np
-import mxnet as mx
-
-import gluonnlp as nlp
-from gluonnlp.data import SQuAD
-from gluonnlp.data.bert.glue import concat_sequences
-from gluonnlp.data.bert.squad import improve_answer_span, \
- tokenize_and_align_positions, get_doc_spans, align_position2doc_spans, \
- check_is_max_context, convert_squad_examples
-from gluonnlp.calibration import BertLayerCollector
-from model.qa import BertForQALoss, BertForQA
-from bert_qa_evaluate import get_F1_EM, predict, PredResult
-
-np.random.seed(6)
-random.seed(6)
-mx.random.seed(6)
-
-log = logging.getLogger('gluonnlp')
-log.setLevel(logging.DEBUG)
-formatter = logging.Formatter(
- fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', datefmt='%H:%M:%S')
-
-parser = argparse.ArgumentParser(
- description='BERT QA example. '
- 'We fine-tune the BERT model on the SQuAD dataset.')
-
-parser.add_argument('--only_predict',
- action='store_true',
- help='Whether to predict only.')
-
-parser.add_argument('--model_parameters',
- type=str,
- default=None,
- help='Model parameter file')
-
-parser.add_argument('--bert_model',
- type=str,
- default='bert_12_768_12',
- help='BERT model name. options are bert_12_768_12 and bert_24_1024_16.')
-
-parser.add_argument('--bert_dataset',
- type=str,
- default='book_corpus_wiki_en_uncased',
- help='BERT dataset name. '
- 'Options are book_corpus_wiki_en_uncased and book_corpus_wiki_en_cased.')
-
-parser.add_argument('--pretrained_bert_parameters',
- type=str,
- default=None,
- help='Pre-trained bert model parameter file. default is None')
-
-parser.add_argument('--uncased',
- action='store_false',
- help='if not set, inputs are converted to lower case.')
-
-parser.add_argument('--output_dir',
- type=str,
- default='./output_dir',
- help='The output directory where the model params will be written.'
- ' default is ./output_dir')
-
-parser.add_argument('--epochs',
- type=int,
- default=3,
- help='number of epochs, default is 3')
-parser.add_argument('--training_steps',
- type=int,
- help='number of training steps; epochs will be ignored '
- 'if training_steps is specified.')
-parser.add_argument('--batch_size',
- type=int,
- default=32,
- help='Batch size. Number of examples per gpu in a minibatch. default is 32')
-
-parser.add_argument('--test_batch_size',
- type=int,
- default=24,
- help='Test batch size. default is 24')
-
-parser.add_argument('--optimizer',
- type=str,
- default='bertadam',
- help='optimization algorithm. default is bertadam')
-
-parser.add_argument('--accumulate',
- type=int,
- default=None,
- help='The number of batches for '
- 'gradient accumulation to simulate a large batch size. Default is None')
-
-parser.add_argument('--lr',
- type=float,
- default=5e-5,
- help='Initial learning rate. default is 5e-5')
-
-parser.add_argument('--warmup_ratio',
- type=float,
- default=0.1,
- help='ratio of warmup steps that linearly increase learning rate from '
- '0 to target learning rate. default is 0.1')
-
-parser.add_argument('--log_interval',
- type=int,
- default=50,
- help='report interval. default is 50')
-
-parser.add_argument('--max_seq_length',
- type=int,
- default=384,
- help='The maximum total input sequence length after WordPiece tokenization. '
- 'Sequences longer than this will be truncated, and sequences shorter '
- 'than this will be padded. default is 384')
-
-parser.add_argument(
- '--round_to', type=int, default=None,
- help='The length of padded sequences will be rounded up to be a multiple of this argument. '
- 'When round_to is set to 8, training throughput may increase for mixed precision '
- 'training on GPUs with Tensor Cores.')
-
-parser.add_argument('--doc_stride',
- type=int,
- default=128,
- help='When splitting up a long document into chunks, how much stride to '
- 'take between chunks. default is 128')
-
-parser.add_argument('--max_query_length',
- type=int,
- default=64,
- help='The maximum number of tokens for the question. Questions longer than '
- 'this will be truncated to this length. default is 64')
-
-parser.add_argument('--n_best_size',
- type=int,
- default=20,
- help='The total number of n-best predictions to generate in the '
- 'nbest_predictions.json output file. default is 20')
-
-parser.add_argument('--max_answer_length',
- type=int,
- default=30,
- help='The maximum length of an answer that can be generated. This is needed '
- 'because the start and end predictions are not conditioned on one another.'
- ' default is 30')
-
-parser.add_argument('--version_2',
- action='store_true',
- help='Whether the SQuAD examples contain some that do not have an answer.')
-
-parser.add_argument('--null_score_diff_threshold',
- type=float,
- default=0.0,
- help='If null_score - best_non_null is greater than the threshold, predict null. '
- 'Typical values are between -1.0 and -5.0. default is 0.0')
-
-parser.add_argument('--gpu',
- action='store_true',
- help='use GPU instead of CPU')
-
-parser.add_argument('--sentencepiece',
- type=str,
- default=None,
- help='Path to the sentencepiece .model file for both tokenization and vocab.')
-
-parser.add_argument('--debug',
- action='store_true',
- help='Run the example in test mode for sanity checks')
-
-parser.add_argument('--dtype',
- type=str,
- default='float32',
- help='Data type used for training. Either float32 or float16')
-
-parser.add_argument('--comm_backend',
- type=str,
- default=None,
- help='Communication backend. Set to horovod if horovod is used for '
- 'multi-GPU training')
-
-parser.add_argument('--deploy', action='store_true',
- help='whether to load a static model for deployment')
-
-parser.add_argument('--model_prefix', type=str, required=False,
- help='load static model as hybridblock.')
-
-parser.add_argument('--only_calibration', action='store_true',
- help='quantize model')
-
-parser.add_argument('--num_calib_batches', type=int, default=10,
- help='number of batches for calibration')
-
-parser.add_argument('--quantized_dtype', type=str, default='auto',
- choices=['auto', 'int8', 'uint8'],
- help='quantization destination data type for input data')
-
-parser.add_argument('--calib_mode', type=str, default='customize',
- choices=['none', 'naive', 'entropy', 'customize'],
- help='calibration mode used for generating calibration table '
- 'for the quantized symbol.')
-
-args = parser.parse_args()
-
-output_dir = args.output_dir
-if not os.path.exists(output_dir):
- os.mkdir(output_dir)
-
-fh = logging.FileHandler(os.path.join(args.output_dir, 'finetune_squad.log'),
- mode='w')
-fh.setLevel(logging.INFO)
-fh.setFormatter(formatter)
-console = logging.StreamHandler()
-console.setLevel(logging.INFO)
-console.setFormatter(formatter)
-log.addHandler(console)
-log.addHandler(fh)
-
-log.info(args)
-
-if args.comm_backend == 'horovod':
- import horovod.mxnet as hvd
- hvd.init()
- rank = hvd.rank()
- size = hvd.size()
- local_rank = hvd.local_rank()
-else:
- rank = 0
- size = 1
- local_rank = 0
-
-if args.dtype == 'float16':
- from mxnet.contrib import amp
- amp.init()
-
-model_name = args.bert_model
-dataset_name = args.bert_dataset
-only_predict = args.only_predict
-model_parameters = args.model_parameters
-pretrained_bert_parameters = args.pretrained_bert_parameters
-if pretrained_bert_parameters and model_parameters:
- raise ValueError('Cannot provide both pre-trained BERT parameters and '
- 'BertForQA model parameters.')
-lower = args.uncased
-
-batch_size = args.batch_size
-test_batch_size = args.test_batch_size
-lr = args.lr
-ctx = mx.gpu(local_rank) if args.gpu else mx.cpu()
-
-accumulate = args.accumulate
-log_interval = args.log_interval * accumulate if accumulate else args.log_interval
-if accumulate:
- log.info('Using gradient accumulation. Effective total batch size = {}'.
- format(accumulate*batch_size*size))
-
-optimizer = args.optimizer
-warmup_ratio = args.warmup_ratio
-
-
-version_2 = args.version_2
-null_score_diff_threshold = args.null_score_diff_threshold
-
-max_seq_length = args.max_seq_length
-doc_stride = args.doc_stride
-max_query_length = args.max_query_length
-n_best_size = args.n_best_size
-max_answer_length = args.max_answer_length
-
-if max_seq_length <= max_query_length + 3:
- raise ValueError('The max_seq_length (%d) must be greater than max_query_length '
- '(%d) + 3' % (max_seq_length, max_query_length))
-
-# vocabulary and tokenizer
-if args.sentencepiece:
- logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece)
- if dataset_name:
- warnings.warn('Both --bert_dataset and --sentencepiece are provided. '
- 'The vocabulary will be loaded based on --sentencepiece.')
- vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
- dataset_name = None
-else:
- vocab = None
-
-pretrained = not model_parameters and not pretrained_bert_parameters and not args.sentencepiece
-bert, vocab = nlp.model.get_model(
- name=model_name,
- dataset_name=dataset_name,
- vocab=vocab,
- pretrained=pretrained,
- ctx=ctx,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False)
-
-if args.sentencepiece:
- tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, lower=lower)
-else:
- tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=lower)
-
-batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Stack(),
- nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
- nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
- nlp.data.batchify.Stack('float32'),
- nlp.data.batchify.Stack('float32'),
- nlp.data.batchify.Stack('float32'))
-
-# load symbolic model
-deploy = args.deploy
-model_prefix = args.model_prefix
-
-net = BertForQA(bert=bert)
-if model_parameters:
- # load complete BertForQA parameters
- nlp.utils.load_parameters(net, model_parameters, ctx=ctx, cast_dtype=True)
-elif pretrained_bert_parameters:
- # only load BertModel parameters
- nlp.utils.load_parameters(bert, pretrained_bert_parameters, ctx=ctx,
- ignore_extra=True, cast_dtype=True)
- net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
-elif pretrained:
- # only load BertModel parameters
- net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
-else:
- # no checkpoint is loaded
- net.initialize(init=mx.init.Normal(0.02), ctx=ctx)
-
-net.hybridize(static_alloc=True)
-
-loss_function = BertForQALoss()
-loss_function.hybridize(static_alloc=True)
-
-if deploy:
- logging.info('load symbol file directly as SymbolBlock for model deployment')
- net = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix),
- ['data0', 'data1', 'data2'],
- '{}-0000.params'.format(args.model_prefix))
- net.hybridize(static_alloc=True, static_shape=True)
-
-# calibration config
-only_calibration = args.only_calibration
-num_calib_batches = args.num_calib_batches
-quantized_dtype = args.quantized_dtype
-calib_mode = args.calib_mode
-
-def train():
- """Training function."""
- segment = 'train' #if not args.debug else 'dev'
- log.info('Loading %s data...', segment)
- if version_2:
- train_data = SQuAD(segment, version='2.0')
- else:
- train_data = SQuAD(segment, version='1.1')
- if args.debug:
- sampled_data = [train_data[i] for i in range(0, 10000)]
- train_data = mx.gluon.data.SimpleDataset(sampled_data)
- log.info('Number of records in Train data:{}'.format(len(train_data)))
- train_data_transform = preprocess_dataset(
- tokenizer,
- train_data,
- max_seq_length=max_seq_length,
- doc_stride=doc_stride,
- max_query_length=max_query_length,
- input_features=True)
-
- log.info('The number of examples after preprocessing:{}'.format(
- len(train_data_transform)))
-
- sampler = nlp.data.SplitSampler(len(train_data_transform), num_parts=size,
- part_index=rank, even_size=True)
- num_train_examples = len(sampler)
- train_dataloader = mx.gluon.data.DataLoader(train_data_transform,
- batchify_fn=batchify_fn,
- batch_size=batch_size,
- num_workers=4,
- sampler=sampler)
-
- log.info('Start Training')
-
- optimizer_params = {'learning_rate': lr, 'wd': 0.01}
- param_dict = net.collect_params()
- if args.comm_backend == 'horovod':
- trainer = hvd.DistributedTrainer(param_dict, optimizer, optimizer_params)
- else:
- trainer = mx.gluon.Trainer(param_dict, optimizer, optimizer_params,
- update_on_kvstore=False)
- if args.dtype == 'float16':
- amp.init_trainer(trainer)
-
- step_size = batch_size * accumulate if accumulate else batch_size
- num_train_steps = int(num_train_examples / step_size * args.epochs)
- if args.training_steps:
- num_train_steps = args.training_steps
-
- num_warmup_steps = int(num_train_steps * warmup_ratio)
-
- def set_new_lr(step_num, batch_id):
- """set new learning rate"""
- # set grad to zero for gradient accumulation
- if accumulate:
- if batch_id % accumulate == 0:
- step_num += 1
- else:
- step_num += 1
- # learning rate schedule
- # Linear warmup followed by linear decay: the learning rate increases linearly
- # to `lr` over `num_warmup_steps`, then decays linearly to 0 at `num_train_steps`.
- if step_num < num_warmup_steps:
- new_lr = lr * step_num / num_warmup_steps
- else:
- offset = (step_num - num_warmup_steps) * lr / \
- (num_train_steps - num_warmup_steps)
- new_lr = lr - offset
- trainer.set_learning_rate(new_lr)
- return step_num
-
- # Do not apply weight decay on LayerNorm and bias terms
- for _, v in net.collect_params('.*beta|.*gamma|.*bias').items():
- v.wd_mult = 0.0
- # Collect differentiable parameters
- params = [p for p in param_dict.values() if p.grad_req != 'null']
-
- # Set grad_req if gradient accumulation is required
- if accumulate:
- for p in params:
- p.grad_req = 'add'
- net.collect_params().zero_grad()
-
- epoch_tic = time.time()
-
- total_num = 0
- log_num = 0
- batch_id = 0
- step_loss = 0.0
- tic = time.time()
- step_num = 0
-
- tic = time.time()
- while step_num < num_train_steps:
- for _, data in enumerate(train_dataloader):
- # set new lr
- step_num = set_new_lr(step_num, batch_id)
- # forward and backward
- _, inputs, token_types, valid_length, start_label, end_label = data
- num_labels = len(inputs)
- log_num += num_labels
- total_num += num_labels
-
- with mx.autograd.record():
- out = net(inputs.as_in_context(ctx),
- token_types.as_in_context(ctx),
- valid_length.as_in_context(ctx).astype('float32'))
-
- loss = loss_function(out, [
- start_label.as_in_context(ctx).astype('float32'),
- end_label.as_in_context(ctx).astype('float32')
- ]).sum() / num_labels
-
- if accumulate:
- loss = loss / accumulate
- if args.dtype == 'float16':
- with amp.scale_loss(loss, trainer) as l:
- mx.autograd.backward(l)
- norm_clip = 1.0 * size * trainer._amp_loss_scaler.loss_scale
- else:
- mx.autograd.backward(loss)
- norm_clip = 1.0 * size
-
- # update
- if not accumulate or (batch_id + 1) % accumulate == 0:
- trainer.allreduce_grads()
- nlp.utils.clip_grad_global_norm(params, norm_clip)
- trainer.update(1)
- if accumulate:
- param_dict.zero_grad()
-
- if args.comm_backend == 'horovod':
- step_loss += hvd.allreduce(loss, average=True).asscalar()
- else:
- step_loss += loss.asscalar()
-
- if (batch_id + 1) % log_interval == 0:
- toc = time.time()
- log.info('Batch: {}/{}, Loss={:.4f}, lr={:.7f}, '
- 'Throughput={:.2f} samples/s'
- .format(batch_id % len(train_dataloader),
- len(train_dataloader), step_loss / log_interval,
- trainer.learning_rate, log_num/(toc - tic)))
- tic = time.time()
- step_loss = 0.0
- log_num = 0
-
- if step_num >= num_train_steps:
- break
- batch_id += 1
-
- log.info('Finish training step: %d', step_num)
- epoch_toc = time.time()
- log.info('Time cost={:.2f} s, Throughput={:.2f} samples/s'.format(
- epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))
-
- if rank == 0:
- net.save_parameters(os.path.join(output_dir, 'net.params'))
-
-def calibration(net, num_calib_batches, quantized_dtype, calib_mode):
- """calibration function on the dev dataset."""
- log.info('Loading dev data...')
- if version_2:
- dev_data = SQuAD('dev', version='2.0')
- else:
- dev_data = SQuAD('dev', version='1.1')
- if args.debug:
- sampled_data = [dev_data[0], dev_data[1], dev_data[2]]
- dev_data = mx.gluon.data.SimpleDataset(sampled_data)
- log.info('Number of records in dev data:{}'.format(len(dev_data)))
-
- batchify_fn_calib = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
- nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
- nlp.data.batchify.Stack('float32'),
- nlp.data.batchify.Stack('float32'))
-
- dev_data_transform = preprocess_dataset(tokenizer,
- dev_data,
- max_seq_length=max_seq_length,
- doc_stride=doc_stride,
- max_query_length=max_query_length,
- input_features=True,
- for_calibration=True)
-
- dev_dataloader = mx.gluon.data.DataLoader(
- dev_data_transform,
- batchify_fn=batchify_fn_calib,
- num_workers=4, batch_size=test_batch_size,
- shuffle=False, last_batch='keep')
-
- assert ctx == mx.cpu(), \
- 'Currently only supports CPU with MKL-DNN backend.'
- log.info('Now we are doing calibration on dev with %s.', ctx)
- collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=log)
- num_calib_examples = test_batch_size * num_calib_batches
- net = mx.contrib.quantization.quantize_net_v2(net, quantized_dtype=quantized_dtype,
- exclude_layers=[],
- quantize_mode='smart',
- quantize_granularity='channel-wise',
- calib_data=dev_dataloader,
- calib_mode=calib_mode,
- num_calib_examples=num_calib_examples,
- ctx=ctx,
- LayerOutputCollector=collector,
- logger=log)
- # save params
- ckpt_name = 'model_bert_squad_quantized_{0}'.format(calib_mode)
- params_saved = os.path.join(output_dir, ckpt_name)
- net.export(params_saved, epoch=0)
- log.info('Saving quantized model at %s', output_dir)
-
-def evaluate():
- """Evaluate the model on validation dataset."""
- log.info('Loading dev data...')
- if version_2:
- dev_data = SQuAD('dev', version='2.0')
- else:
- dev_data = SQuAD('dev', version='1.1')
- if args.debug:
- sampled_data = [dev_data[i] for i in range(100)]
- dev_data = mx.gluon.data.SimpleDataset(sampled_data)
- log.info('Number of records in dev data:{}'.format(len(dev_data)))
-
- dev_dataset = preprocess_dataset(tokenizer,
- dev_data,
- max_seq_length=max_seq_length,
- doc_stride=doc_stride,
- max_query_length=max_query_length,
- input_features=False)
-
- dev_data_transform = preprocess_dataset(tokenizer,
- dev_data,
- max_seq_length=max_seq_length,
- doc_stride=doc_stride,
- max_query_length=max_query_length,
- input_features=True)
-
- log.info('The number of examples after preprocessing:{}'.format(
- len(dev_data_transform)))
-
- dev_dataloader = mx.gluon.data.DataLoader(dev_data_transform,
- batchify_fn=batchify_fn,
- num_workers=4,
- batch_size=test_batch_size,
- shuffle=False,
- last_batch='keep')
-
- log.info('start prediction')
-
- all_results = collections.defaultdict(list)
-
- epoch_tic = time.time()
- total_num = 0
- for data in dev_dataloader:
- example_ids, inputs, token_types, valid_length, _, _ = data
- total_num += len(inputs)
- out = net(inputs.as_in_context(ctx),
- token_types.as_in_context(ctx),
- valid_length.as_in_context(ctx).astype('float32'))
-
- output = mx.nd.split(out, axis=2, num_outputs=2)
- example_ids = example_ids.asnumpy().tolist()
- pred_start = output[0].reshape((0, -3)).asnumpy()
- pred_end = output[1].reshape((0, -3)).asnumpy()
-
- for example_id, start, end in zip(example_ids, pred_start, pred_end):
- all_results[example_id].append(PredResult(start=start, end=end))
-
- epoch_toc = time.time()
- log.info('Time cost={:.2f} s, Throughput={:.2f} samples/s'.format(
- epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))
-
- log.info('Get prediction results...')
-
- all_predictions = collections.OrderedDict()
-
- for features in dev_dataset:
- results = all_results[features[0].example_id]
- example_qas_id = features[0].qas_id
-
- prediction, _ = predict(
- features=features,
- results=results,
- tokenizer=nlp.data.BERTBasicTokenizer(lower=lower),
- max_answer_length=max_answer_length,
- null_score_diff_threshold=null_score_diff_threshold,
- n_best_size=n_best_size,
- version_2=version_2)
-
- all_predictions[example_qas_id] = prediction
-
- if version_2:
- log.info('Please run evaluate-v2.0.py to get evaluation results for SQuAD 2.0')
- else:
- F1_EM = get_F1_EM(dev_data, all_predictions)
- log.info(F1_EM)
-
- with io.open(os.path.join(output_dir, 'predictions.json'),
- 'w', encoding='utf-8') as fout:
- data = json.dumps(all_predictions, ensure_ascii=False)
- fout.write(data)
-
-
-
-SquadBERTFeautre = collections.namedtuple('SquadBERTFeautre', [
- 'example_id', 'qas_id', 'doc_tokens', 'valid_length', 'tokens',
- 'token_to_orig_map', 'token_is_max_context', 'input_ids', 'p_mask',
- 'segment_ids', 'start_position', 'end_position', 'is_impossible'
-])
-
-
-def convert_examples_to_features(example,
- tokenizer=None,
- cls_token=None,
- sep_token=None,
- vocab=None,
- max_seq_length=384,
- doc_stride=128,
- max_query_length=64,
- cls_index=0):
- """convert the examples to the BERT features"""
- query_tokenized = [cls_token] + tokenizer(
- example.question_text)[:max_query_length]
- #tokenize paragraph and get start/end position of the answer in tokenized paragraph
- tok_start_position, tok_end_position, all_doc_tokens, _, tok_to_orig_index = \
- tokenize_and_align_positions(example.doc_tokens,
- example.start_position,
- example.end_position,
- tokenizer)
- # get doc spans using sliding window
- doc_spans, doc_spans_indices = get_doc_spans(
- all_doc_tokens, max_seq_length - len(query_tokenized) - 2, doc_stride)
-
- if not example.is_impossible:
- (tok_start_position, tok_end_position) = improve_answer_span(
- all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
- example.orig_answer_text)
- # get the new start/end position
- positions = [
- align_position2doc_spans([tok_start_position, tok_end_position],
- doc_idx,
- offset=len(query_tokenized) + 1,
- default_value=0)
- for doc_idx in doc_spans_indices
- ]
- else:
- # if the question is impossible to answer, set the start/end position to cls index
- positions = [[cls_index, cls_index] for _ in doc_spans_indices]
-
- # record whether the tokens in a docspan have max context
- token_is_max_context = [{
- len(query_tokenized) + p:
- check_is_max_context(doc_spans_indices, i, p + doc_spans_indices[i][0])
- for p in range(len(doc_span))
- } for (i, doc_span) in enumerate(doc_spans)]
-
- token_to_orig_map = [{
- len(query_tokenized) + p + 1:
- tok_to_orig_index[p + doc_spans_indices[i][0]]
- for p in range(len(doc_span))
- } for (i, doc_span) in enumerate(doc_spans)]
-
- #get sequence features: tokens, segment_ids, p_masks
- seq_features = [
- concat_sequences([query_tokenized, doc_span], [[sep_token]] * 2)
- for doc_span in doc_spans
- ]
-
- features = [
- SquadBERTFeautre(example_id=example.example_id,
- qas_id=example.qas_id,
- doc_tokens=example.doc_tokens,
- valid_length=len(tokens),
- tokens=tokens,
- token_to_orig_map=t2o,
- token_is_max_context=is_max,
- input_ids=vocab[tokens],
- p_mask=p_mask,
- segment_ids=segment_ids,
- start_position=start,
- end_position=end,
- is_impossible=example.is_impossible)
- for (tokens, segment_ids, p_mask), (start, end), is_max, t2o in zip(
- seq_features, positions, token_is_max_context, token_to_orig_map)
- ]
- return features
-
-
-def preprocess_dataset(tokenizer,
- dataset,
- vocab=None,
- max_seq_length=384,
- doc_stride=128,
- max_query_length=64,
- input_features=True,
- num_workers=4,
- load_from_pickle=False,
- feature_file=None,
- for_calibration=False):
- """Loads a dataset into features"""
- vocab = tokenizer.vocab if vocab is None else vocab
- trans = partial(convert_examples_to_features,
- tokenizer=tokenizer,
- cls_token=vocab.cls_token,
- sep_token=vocab.sep_token,
- vocab=vocab,
- max_seq_length=max_seq_length,
- doc_stride=doc_stride,
- max_query_length=max_query_length)
- pool = mp.Pool(num_workers)
- start = time.time()
- if not load_from_pickle:
- example_trans = partial(convert_squad_examples,
- is_training=input_features)
- # convert the raw dataset into raw features
- examples = pool.map(example_trans, dataset)
- raw_features = pool.map(trans, examples)
- if feature_file:
- with open(feature_file, 'wb') as file:
- pickle.dump(list(raw_features), file)
- else:
- assert feature_file, 'feature file should be provided.'
- with open(feature_file, 'rb') as file:
- raw_features = pickle.load(file)
-
- if input_features:
- # convert the full features into the training features
- # Note that we will need the full features to make evaluation
- # Due to using sliding windows in data preprocessing,
- # we will have multiple examples for a single entry after processed.
- # Thus we need to flatten it for training.
- data_feature = mx.gluon.data.SimpleDataset(
- list(itertools.chain.from_iterable(raw_features)))
- if for_calibration:
- data_feature = data_feature.transform(lambda *example: (
- example[7], # inputs_id
- example[9], # segment_ids
- example[3], # valid_length,
- example[10])) # start_position,
- else:
- data_feature = data_feature.transform(lambda *example: (
- example[0], # example_id
- example[7], # inputs_id
- example[9], # segment_ids
- example[3], # valid_length,
- example[10], # start_position,
- example[11])) # end_position
- else:
- data_feature = mx.gluon.data.SimpleDataset(list(raw_features))
-
- end = time.time()
- pool.close()
- print('Done! Transforming the dataset took %.2f seconds.' % (end - start))
- return data_feature
-
-
-if __name__ == '__main__':
- if only_calibration:
- try:
- calibration(net,
- num_calib_batches,
- quantized_dtype,
- calib_mode)
- except AttributeError:
- nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx)
- warnings.warn('INT8 Quantization for BERT needs mxnet-mkl >= 1.6.0b20200115')
- elif not only_predict:
- train()
- evaluate()
- elif model_parameters or deploy:
- evaluate()
diff --git a/scripts/bert/fp16_utils.py b/scripts/bert/fp16_utils.py
deleted file mode 100644
index f74c5528b1..0000000000
--- a/scripts/bert/fp16_utils.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Trainer for mixed precision training."""
-import warnings
-import mxnet as mx
-import gluonnlp as nlp
-
-class FP16Trainer:
- """ Trainer for mixed precision training.
-
- Parameters
- ----------
- trainer: gluon.Trainer
- the original gluon Trainer object for fp32 training.
- dynamic_loss_scale: bool. Default is True
- whether to use dynamic loss scaling. This is recommended for optimizing model
- parameters using FP16.
- loss_scaler_params : dict
- Keyword arguments to be passed to the loss scaler constructor. For example,
- `{"init_scale" : 2.**10, "scale_window" : 2000, "tolerance" : 0.05}`
- for `DynamicLossScaler`.
- See each `LossScaler` for a list of supported arguments.
- """
- def __init__(self, trainer, dynamic_loss_scale=True, loss_scaler_params=None):
- if trainer._kvstore_params['update_on_kvstore'] is not False and trainer._kvstore:
- err = 'Only gluon.Trainer created with update_on_kvstore=False is supported.'
- raise NotImplementedError(err)
- self.fp32_trainer = trainer
- loss_scaler_params = loss_scaler_params if loss_scaler_params else {}
- self._scaler = DynamicLossScaler(**loss_scaler_params) if dynamic_loss_scale \
- else StaticLossScaler(**loss_scaler_params)
- # if the optimizer supports NaN check, we can always defer the NaN check to the optimizer
- # TODO(haibin) this should be added via registry
- self._support_nan_check = trainer._optimizer.__class__.__name__ == 'BERTAdam'
-
- def backward(self, loss):
- """backward propagation with loss"""
- with mx.autograd.record():
- if isinstance(loss, (tuple, list)):
- ls = [l * self._scaler.loss_scale for l in loss]
- else:
- ls = loss * self._scaler.loss_scale
- mx.autograd.backward(ls)
-
- def step(self, batch_size, max_norm=None):
- """Makes one step of parameter update. Should be called after
- `fp16_optimizer.backward()`, and outside of `record()` scope.
-
- Parameters
- ----------
- batch_size : int
- Batch size of data processed. Gradient will be normalized by `1/batch_size`.
- Set this to 1 if you normalized loss manually with `loss = mean(loss)`.
- max_norm : NDArray, optional, default is None
- max value for global 2-norm of gradients.
- """
- self.fp32_trainer.allreduce_grads()
- step_size = batch_size * self._scaler.loss_scale
- if max_norm:
- _, ratio, is_finite = nlp.utils.grad_global_norm(self.fp32_trainer._params,
- max_norm * self._scaler.loss_scale)
- step_size = ratio * step_size
- if self._support_nan_check:
- self.fp32_trainer.update(step_size)
- overflow = is_finite.asscalar() < 1
- else:
- overflow = is_finite.asscalar() < 1
- if not overflow:
- step_size = step_size.asscalar()
- self.fp32_trainer.update(step_size)
- else:
- # TODO(haibin) optimize the performance when max_norm is not present
- # sequentially adding isnan/isinf results may be slow
- if self._support_nan_check:
- self.fp32_trainer.update(step_size)
- overflow = self._scaler.has_overflow(self.fp32_trainer._params)
- else:
- overflow = self._scaler.has_overflow(self.fp32_trainer._params)
- if not overflow:
- self.fp32_trainer.update(step_size)
- # update scale based on overflow information
- self._scaler.update_scale(overflow)
-
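-# Illustrative usage sketch, added for documentation only (the fine-tuning scripts
-# above use mxnet.contrib.amp rather than this class; `net`, `data`, `label` and
-# `loss_function` are placeholders). Wrap a gluon Trainer created with
-# update_on_kvstore=False, compute the loss under autograd.record(), then let
-# backward() scale the loss and step() unscale, clip and update:
-#
-#   trainer = mx.gluon.Trainer(net.collect_params(), 'bertadam',
-#                              {'learning_rate': 5e-5}, update_on_kvstore=False)
-#   fp16_trainer = FP16Trainer(trainer)
-#   with mx.autograd.record():
-#       loss = loss_function(net(data), label).mean()
-#   fp16_trainer.backward(loss)
-#   fp16_trainer.step(1, max_norm=1)
-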
-class LossScaler:
- """Abstract loss scaler"""
- def has_overflow(self, params):
- """ detect inf and nan """
- is_not_finite = 0
- for param in params:
- if param.grad_req != 'null':
- grad = param.list_grad()[0]
- is_not_finite += mx.nd.contrib.isnan(grad).sum().astype('float32', copy=False)
- is_not_finite += mx.nd.contrib.isinf(grad).sum().astype('float32', copy=False)
- # NDArray is implicitly converted to bool
- if is_not_finite == 0:
- return False
- else:
- return True
-
- def update_scale(self, overflow):
- raise NotImplementedError()
-
-class StaticLossScaler(LossScaler):
- """Static loss scaler"""
- def __init__(self, init_scale=1):
- self.loss_scale = init_scale
-
- def update_scale(self, overflow):
- """update loss scale"""
-
-class DynamicLossScaler(LossScaler):
- """Class that manages dynamic loss scaling.
-
- There are two problems regarding gradient scale when fp16 is used for training.
- One is overflow: the fp16 gradient is so large that it causes NaN.
- To combat such an issue, we need to scale down the gradient when such an event
- is detected. The other is underflow: the gradient is too small for the
- precision to be maintained, which is hard to detect. What the dynamic loss
- scaler does is start the scale at a relatively large value (e.g. 2**15).
- Every time a NaN is detected in the gradient, the scale is reduced (by default)
- by 2x. On the other hand, if no NaN is detected for a long time
- (e.g. 2000 steps), the scale is increased (by default) by 2x."""
- def __init__(self, init_scale=2.**10, scale_factor=2., scale_window=2000,
- tolerance=0.):
- self.loss_scale = init_scale
- self.scale_factor = scale_factor
- self.scale_window = scale_window
- self.tolerance = tolerance
- self._num_steps = 0
- self._last_overflow_iter = -1
- self._last_rescale_iter = -1
- self._overflows_since_rescale = 0
-
- def update_scale(self, overflow):
- """dynamically update loss scale"""
- iter_since_rescale = self._num_steps - self._last_rescale_iter
- if overflow:
- self._last_overflow_iter = self._num_steps
- self._overflows_since_rescale += 1
- percentage = self._overflows_since_rescale / float(iter_since_rescale)
- # we tolerate a certain amount of NaNs before actually scaling it down
- if percentage >= self.tolerance:
- self.loss_scale /= self.scale_factor
- self._last_rescale_iter = self._num_steps
- self._overflows_since_rescale = 0
- if self.loss_scale < 1:
- warnings.warn('DynamicLossScaler: overflow detected. set loss_scale = %s'%
- self.loss_scale)
- elif (self._num_steps - self._last_overflow_iter) % self.scale_window == 0:
- self.loss_scale *= self.scale_factor
- self._last_rescale_iter = self._num_steps
- self._num_steps += 1
diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst
deleted file mode 100644
index 158da3be52..0000000000
--- a/scripts/bert/index.rst
+++ /dev/null
@@ -1,369 +0,0 @@
-BERT
-----
-
-:download:`Download scripts `
-
-
-Reference: Devlin, Jacob, et al. "`Bert: Pre-training of deep bidirectional transformers for language understanding. `_" arXiv preprint arXiv:1810.04805 (2018).
-
-BERT Model Zoo
-~~~~~~~~~~~~~~
-
-The following pre-trained BERT models are available from the **gluonnlp.model.get_model** API:
-
-+-----------------------------------------+----------------+-----------------+
-| | bert_12_768_12 | bert_24_1024_16 |
-+=========================================+================+=================+
-| book_corpus_wiki_en_uncased | ✓ | ✓ |
-+-----------------------------------------+----------------+-----------------+
-| book_corpus_wiki_en_cased | ✓ | ✓ |
-+-----------------------------------------+----------------+-----------------+
-| openwebtext_book_corpus_wiki_en_uncased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| wiki_multilingual_uncased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| wiki_multilingual_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| wiki_cn_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| scibert_scivocab_uncased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| scibert_scivocab_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| scibert_basevocab_uncased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| scibert_basevocab_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| biobert_v1.0_pmc_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| biobert_v1.0_pubmed_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| biobert_v1.0_pubmed_pmc_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| biobert_v1.1_pubmed_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| clinicalbert_uncased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| kobert_news_wiki_ko_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-
-where **bert_12_768_12** refers to the BERT BASE model, and **bert_24_1024_16** refers to the BERT LARGE model.
-
-.. code-block:: python
-
- import gluonnlp as nlp; import mxnet as mx;
- model, vocab = nlp.model.get_model('bert_12_768_12', dataset_name='book_corpus_wiki_en_uncased', use_classifier=False, use_decoder=False);
- tokenizer = nlp.data.BERTTokenizer(vocab, lower=True);
- transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=512, pair=False, pad=False);
- sample = transform(['Hello world!']);
- words, valid_len, segments = mx.nd.array([sample[0]]), mx.nd.array([sample[1]]), mx.nd.array([sample[2]]);
- seq_encoding, cls_encoding = model(words, segments, valid_len);
-
-
-The pretrained parameters for dataset_name
-'openwebtext_book_corpus_wiki_en_uncased' were obtained by running the GluonNLP
-BERT pre-training script on OpenWebText.
-
-The pretrained parameters for dataset_name 'scibert_scivocab_uncased',
-'scibert_scivocab_cased', 'scibert_basevocab_uncased', 'scibert_basevocab_cased'
-were obtained by converting the parameters published by "Beltagy, I., Cohan, A.,
-& Lo, K. (2019). Scibert: Pretrained contextualized embeddings for scientific
-text. arXiv preprint `arXiv:1903.10676 `_."
-
-The pretrained parameters for dataset_name 'biobert_v1.0_pmc',
-'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', 'biobert_v1.1_pubmed' were
-obtained by converting the parameters published by "Lee, J., Yoon, W., Kim, S.,
-Kim, D., Kim, S., So, C. H., & Kang, J. (2019). Biobert: pre-trained biomedical
-language representation model for biomedical text mining. arXiv preprint
-`arXiv:1901.08746 `_."
-
-The pretrained parameters for dataset_name 'clinicalbert' were obtained by
-converting the parameters published by "Huang, K., Altosaar, J., & Ranganath, R.
-(2019). ClinicalBERT: Modeling Clinical Notes and Predicting Hospital
-Readmission. arXiv preprint `arXiv:1904.05342
-`_."
-
-Additionally, GluonNLP supports the "`RoBERTa `_" model:
-
-+-----------------------------------------+-------------------+--------------------+
-| | roberta_12_768_12 | roberta_24_1024_16 |
-+=========================================+===================+====================+
-| openwebtext_ccnews_stories_books_cased | ✓ | ✓ |
-+-----------------------------------------+-------------------+--------------------+
-
-.. code-block:: python
-
- import gluonnlp as nlp; import mxnet as mx;
- model, vocab = nlp.model.get_model('roberta_12_768_12', dataset_name='openwebtext_ccnews_stories_books_cased', use_decoder=False);
- tokenizer = nlp.data.GPT2BPETokenizer();
- text = [vocab.bos_token] + tokenizer('Hello world!') + [vocab.eos_token];
- seq_encoding = model(mx.nd.array([vocab[text]]))
-
-GluonNLP also supports the "`DistilBERT `_" model:
-
-+-----------------------------------------+----------------------+
-| | distilbert_6_768_12 |
-+=========================================+======================+
-| distil_book_corpus_wiki_en_uncased | ✓ |
-+-----------------------------------------+----------------------+
-
-.. code-block:: python
-
- import gluonnlp as nlp; import mxnet as mx;
- model, vocab = nlp.model.get_model('distilbert_6_768_12', dataset_name='distil_book_corpus_wiki_en_uncased');
- tokenizer = nlp.data.BERTTokenizer(vocab, lower=True);
- transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=512, pair=False, pad=False);
- sample = transform(['Hello world!']);
- words, valid_len = mx.nd.array([sample[0]]), mx.nd.array([sample[1]])
- seq_encoding, cls_encoding = model(words, valid_len);
-
-Finally, GluonNLP also supports the Korean BERT pre-trained model, "`KoBERT `_", trained on a Korean wiki dataset (`kobert_news_wiki_ko_cased`).
-
-.. code-block:: python
-
- import gluonnlp as nlp; import mxnet as mx;
- model, vocab = nlp.model.get_model('bert_12_768_12', dataset_name='kobert_news_wiki_ko_cased',use_decoder=False, use_classifier=False)
- tok = nlp.data.get_tokenizer('bert_12_768_12', 'kobert_news_wiki_ko_cased')
- tok('안녕하세요.')
-
-.. hint::
-
- The pre-training, fine-tuning and export scripts are available `here. `__
-
-
-Sentence Classification
-~~~~~~~~~~~~~~~~~~~~~~~
-
-GluonNLP provides the following example script to fine-tune sentence classification
-models with a pre-trained BERT model.
-
-To enable mixed precision training with float16, set the `--dtype` argument to `float16`.
-
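-For example, the following is an illustrative invocation for the MRPC task with
-mixed precision; other settings such as batch size and learning rate keep the
-defaults defined in the script.
-
-.. code-block:: console
-
-    $ python finetune_classifier.py --task_name MRPC --epochs 3 --optimizer bertadam --dtype float16 --output_dir ./output_dir
-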
-Results using `bert_12_768_12`:
-
-.. editing URL for the following table: https://tinyurl.com/y4n8q84w
-
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-|Task Name |Metrics |Results on Dev Set |log |command |
-+=================+=====================+=======================+============================================================================================================================================+=================================================================================================================================================================+
-| CoLA |Matthew Corr. |60.32 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| SST-2 |Accuracy |93.46 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| MRPC |Accuracy/F1 |88.73/91.96 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| STS-B |Pearson Corr. |90.34 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| QQP |Accuracy |91 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| MNLI |Accuracy(m/mm) |84.29/85.07 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| XNLI (Chinese) |Accuracy |78.43 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| RTE |Accuracy |74 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-
-
-
-Results using `roberta_12_768_12`:
-
-.. editing URL for the following table: https://www.shorturl.at/cjAO7
-
-+---------------------+------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| Dataset | SST-2 | MNLI-M/MM |
-+=====================+======================================================================================================+==================================================================================================================+
-| Validation Accuracy | 95.3% | 87.69%, 87.23% |
-+---------------------+------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| Log | `log `__ | `log `__ |
-+---------------------+------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| Command | `command `__ | `command `__ |
-+---------------------+------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+
-
-.. editing URL for the following table: https://tinyurl.com/y5rrowj3
-
-Question Answering on SQuAD
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-+-----------+-----------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
-| Dataset | SQuAD 1.1 | SQuAD 1.1 | SQuAD 2.0 |
-+===========+=========================================================================================================================================+==========================================================================================================================================+==========================================================================================================================================+
-| Model | bert_12_768_12 | bert_24_1024_16 | bert_24_1024_16 |
-+-----------+-----------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
-| F1 / EM | 88.58 / 81.26 | 90.97 / 84.22 | 81.27 / 78.14 |
-+-----------+-----------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
-| Log | `log `__ | `log `__ | `log `__ |
-+-----------+-----------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
-| Command | `command `__ | `command `__ | `command `__ |
-+-----------+-----------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
-| Prediction| `predictions.json `__ | `predictions.json `__ | `predictions.json `__ |
-+-----------+-----------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
-
-For all the model settings above, we set the learning rate to 3e-5 and use the Adam optimizer.
-
-Note that the BERT model is memory-intensive. If you have limited GPU memory, you can emulate a large batch size through gradient accumulation by setting the *accumulate* and *batch_size* arguments accordingly, as in the following command.
-
-.. code-block:: console
-
- $ python finetune_squad.py --optimizer adam --accumulate 2 --batch_size 6 --lr 3e-5 --epochs 2 --gpu
-
-We support multi-GPU training via horovod:
-
-.. code-block:: console
-
- $ HOROVOD_WITH_MXNET=1 HOROVOD_GPU_ALLREDUCE=NCCL pip install horovod --user --no-cache-dir
- $ horovodrun -np 8 python finetune_squad.py --bert_model bert_24_1024_16 --batch_size 4 --lr 3e-5 --epochs 2 --gpu --dtype float16 --comm_backend horovod
-
-SQuAD 2.0
-+++++++++
-
-For SQuAD 2.0, you need to set the *version_2* flag and specify the *null_score_diff_threshold* parameter; typical values are between -1.0 and -5.0. Use the following command to fine-tune the BERT large model on SQuAD 2.0 and generate predictions.json.
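-
-The flag values below are illustrative rather than prescriptive; adjust *batch_size* and *accumulate* to fit your GPU memory, and tune *null_score_diff_threshold* on the dev set:
-
-.. code-block:: console
-
-    $ python finetune_squad.py --bert_model bert_24_1024_16 --optimizer adam --accumulate 8 --batch_size 4 --lr 3e-5 --epochs 2 --gpu --version_2 --null_score_diff_threshold -2.0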
-
-To get the score on the dev data, download the dev dataset (`dev-v2.0.json `_) and the evaluation script (`evaluate-v2.0.py `_). Then use the following command to compute the score on the dev dataset.
-
-.. code-block:: console
-
- $ python evaluate-v2.0.py dev-v2.0.json predictions.json
-
-BERT INT8 Quantization
-~~~~~~~~~~~~~~~~~~~~~~
-
-GluonNLP provides the following example scripts to quantize fine-tuned
-BERT models into the int8 data type. Note that INT8 quantization requires a nightly
-version of `mxnet-mkl `_.
-
-Sentence Classification
-+++++++++++++++++++++++
-
-+-----------+-------------------+---------------+---------------+---------+---------+------------------------------------------------------------------------------------------------------------------------+
-| Dataset | Model | FP32 Accuracy | INT8 Accuracy | FP32 F1 | INT8 F1 | Command |
-+===========+===================+===============+===============+=========+=========+========================================================================================================================+
-| MRPC | bert_12_768_12 | 87.01 | 87.01 | 90.97 | 90.88 |`command `__ |
-+-----------+-------------------+---------------+---------------+---------+---------+------------------------------------------------------------------------------------------------------------------------+
-| SST-2 | bert_12_768_12 | 93.23 | 93.00 | | |`command `__ |
-+-----------+-------------------+---------------+---------------+---------+---------+------------------------------------------------------------------------------------------------------------------------+
-
-Question Answering
-++++++++++++++++++
-
-+-----------+-------------------+---------+---------+---------+---------+----------------------------------------------------------------------------------------------------------------------------+
-| Dataset | Model | FP32 EM | INT8 EM | FP32 F1 | INT8 F1 | Command |
-+===========+===================+=========+=========+=========+=========+============================================================================================================================+
-| SQuAD 1.1 | bert_12_768_12 | 81.18 | 80.32 | 88.58 | 88.10 |`command `__ |
-+-----------+-------------------+---------+---------+---------+---------+----------------------------------------------------------------------------------------------------------------------------+
-
-For all the model settings above, we use a subset of the evaluation dataset for calibration.
-
-Pre-training from Scratch
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-We also provide scripts for pre-training BERT with masked language modeling and next sentence prediction.
-
-The pre-training data format expects: (1) one sentence per line, ideally actual sentences rather than entire paragraphs or arbitrary spans of text, since the "next sentence prediction" task relies on sentence boundaries; and (2) blank lines between documents. You can find a sample pre-training text with 3 documents `here `__. You can perform sentence segmentation with an off-the-shelf NLP toolkit such as NLTK.
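-
-As a minimal sketch (assuming NLTK and its 'punkt' tokenizer models are installed; the file and variable names are only for illustration), the following converts raw documents into the expected one-sentence-per-line format with blank lines between documents:
-
-.. code-block:: python
-
-    import nltk
-
-    nltk.download('punkt')  # sentence tokenizer models used by sent_tokenize
-
-    documents = [
-        'GluonNLP is a toolkit for natural language processing. It is built on Apache MXNet.',
-        'BERT is pre-trained with masked language modeling. It also uses next sentence prediction.',
-    ]
-
-    with open('pretraining_corpus.txt', 'w') as f:
-        for doc in documents:
-            for sentence in nltk.sent_tokenize(doc):
-                f.write(sentence + '\n')
-            f.write('\n')  # a blank line separates documents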
-
-
-.. hint::
-
-   You can download the pre-processed English Wikipedia dataset `here `__.
-
-
-Prerequisites
-+++++++++++++
-
-We recommend horovod for scalable multi-GPU, multi-machine training.
-
-To install horovod, you need:
-
-- `NCCL `__, and
-- `OpenMPI `__
-
-Then you can install horovod via the following command:
-
-.. code-block:: console
-
- $ HOROVOD_WITH_MXNET=1 HOROVOD_GPU_ALLREDUCE=NCCL pip install horovod==0.16.2 --user --no-cache-dir
-
-Run Pre-training
-++++++++++++++++
-
-You can use the following command to run pre-training with 2 hosts, 8 GPUs each:
-
-.. code-block:: console
-
- $ mpirun -np 16 -H host0_ip:8,host1_ip:8 -mca pml ob1 -mca btl ^openib \
- -mca btl_tcp_if_exclude docker0,lo --map-by ppr:4:socket \
- --mca plm_rsh_agent 'ssh -q -o StrictHostKeyChecking=no' \
- -x NCCL_MIN_NRINGS=8 -x NCCL_DEBUG=INFO -x HOROVOD_HIERARCHICAL_ALLREDUCE=1 \
- -x MXNET_SAFE_ACCUMULATION=1 --tag-output \
- python run_pretraining.py --data='folder1/*.txt,folder2/*.txt,' \
- --data_eval='dev_folder/*.txt,' --num_steps 1000000 \
- --lr 1e-4 --total_batch_size 256 --accumulate 1 --raw --comm_backend horovod
-
-If you see an out-of-memory error, try increasing --accumulate for gradient accumulation.
-
-When multiple hosts are present, please make sure you can ssh to these nodes without a password.
-
-Alternatively, if horovod is not available, you can run pre-training with the MXNet native parameter server by setting --comm_backend and --gpus:
-
-.. code-block:: console
-
- $ MXNET_SAFE_ACCUMULATION=1 python run_pretraining.py --comm_backend device --gpus 0,1,2,3,4,5,6,7 ...
-
-The BERT base model produced by the GluonNLP pre-training script (`log `__), trained on the BooksCorpus and English Wikipedia datasets, achieves 83.6% on MNLI-mm, 93% on SST-2, 87.99% on MRPC, and 80.99/88.60 on the SQuAD 1.1 validation set.
-
-Custom Vocabulary
-+++++++++++++++++
-
-The pre-training script supports subword tokenization with a custom vocabulary using `sentencepiece `__.
-
-To install sentencepiece, run:
-
-.. code-block:: console
-
- $ pip install sentencepiece==0.1.82 --user
-
-You can `train <https://github.com/google/sentencepiece/tree/v0.1.82/python#model-training>`__ a custom sentencepiece vocabulary by specifying the vocabulary size:
-
-.. code-block:: python
-
- import sentencepiece as spm
- spm.SentencePieceTrainer.Train('--input=a.txt,b.txt --unk_id=0 --pad_id=3 --model_prefix=my_vocab --vocab_size=30000 --model_type=BPE')
-
-To use the sentencepiece vocabulary for pre-training, set --sentencepiece=my_vocab.model when running run_pretraining.py, as in the example below.
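-
-For example, a complete invocation with the MXNet native communication backend might look like the following (the data paths and hyperparameter values are illustrative only):
-
-.. code-block:: console
-
-    $ MXNET_SAFE_ACCUMULATION=1 python run_pretraining.py --comm_backend device --gpus 0,1,2,3 \
-          --data='folder1/*.txt,folder2/*.txt,' --data_eval='dev_folder/*.txt,' \
-          --raw --sentencepiece my_vocab.model --num_steps 1000000 \
-          --lr 1e-4 --total_batch_size 256 --accumulate 1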
-
-
-
-Export BERT for Deployment
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The current export.py script supports exporting BERT models. Supported values for the --task argument include classification, regression and question answering.
-
-.. code-block:: console
-
- $ python export.py --task classification --model_parameters /path/to/saved/ckpt.params --output_dir /path/to/output/dir/ --seq_length 128
-
-This exports the BERT model for classification to a symbol.json file saved in the directory specified by --output_dir.
-The --model_parameters argument is optional; if it is not set, the .params file saved in the output directory will contain randomly initialized parameters.
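-
-Once exported, the model can be loaded back without GluonNLP for inference. The sketch below is an assumption-laden illustration: it assumes the exporter writes a <prefix>-symbol.json / <prefix>-0000.params pair and that the symbol expects three inputs named data0, data1 and data2 (token ids, token types, valid length); check the actual file and input names written to your --output_dir.
-
-.. code-block:: python
-
-    import mxnet as mx
-
-    # File names and input names below are assumptions; adjust them to the
-    # artifacts actually produced by export.py in --output_dir.
-    net = mx.gluon.SymbolBlock.imports('output_dir/classification-symbol.json',
-                                       ['data0', 'data1', 'data2'],
-                                       'output_dir/classification-0000.params',
-                                       ctx=mx.cpu())
-
-    seq_length = 128
-    token_ids = mx.nd.zeros((1, seq_length))      # padded token ids
-    token_types = mx.nd.zeros((1, seq_length))    # segment ids
-    valid_length = mx.nd.array([seq_length])      # number of non-padding tokens
-    scores = net(token_ids, token_types, valid_length)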
-
-BERT for Sentence or Token Embeddings
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The goal of this BERT embedding script is to obtain token embeddings from a pre-trained BERT model. Instead of building and fine-tuning an end-to-end NLP model, you can build your own model on top of the token embeddings. You can use the command line interface below; a Python sketch of the same idea follows the example output.
-
-.. code-block:: shell
-
- python embedding.py --sentences "GluonNLP is a toolkit that enables easy text preprocessing, datasets loading and neural models building to help you speed up your Natural Language Processing (NLP) research."
- Text: g ##lu ##on ##nl ##p is a tool ##kit that enables easy text prep ##ro ##ces ##sing , data ##set ##s loading and neural models building to help you speed up your natural language processing ( nl ##p ) research .
- Tokens embedding: [array([-0.11881411, -0.59530115, 0.627092 , ..., 0.00648153,
- -0.03886228, 0.03406909], dtype=float32), array([-0.7995638 , -0.6540758 , -0.00521846, ..., -0.42272145,
- -0.5787281 , 0.7021201 ], dtype=float32), array([-0.7406778 , -0.80276626, 0.3931962 , ..., -0.49068323,
- -0.58128357, 0.6811132 ], dtype=float32), array([-0.43287313, -1.0018158 , 0.79617643, ..., -0.26877284,
- -0.621779 , -0.2731115 ], dtype=float32), array([-0.8515188 , -0.74098676, 0.4427735 , ..., -0.41267148,
- -0.64225197, 0.3949393 ], dtype=float32), array([-0.86652845, -0.27746758, 0.8806506 , ..., -0.87452525,
- -0.9551989 , -0.0786318 ], dtype=float32), array([-1.0987284 , -0.36603633, 0.2826037 , ..., -0.33794224,
- -0.55210876, -0.09221527], dtype=float32), array([-0.3483025 , 0.401534 , 0.9361341 , ..., -0.29747447,
- -0.49559578, -0.08878893], dtype=float32), array([-0.65626 , -0.14857645, 0.29733548, ..., -0.15890433,
- -0.45487815, -0.28494897], dtype=float32), array([-0.1983894 , 0.67196256, 0.7867421 , ..., -0.7990434 ,
- 0.05860569, -0.26884627], dtype=float32), array([-0.3775159 , -0.00590206, 0.5240432 , ..., -0.26754653,
- -0.37806216, 0.23336883], dtype=float32), array([ 0.1876977 , 0.30165672, 0.47167772, ..., -0.43823618,
- -0.42823148, -0.48873612], dtype=float32), array([-0.6576557 , -0.09822252, 0.1121515 , ..., -0.21743725,
- -0.1820574 , -0.16115054], dtype=float32)]
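-
-If you prefer a programmatic interface, the following is a minimal Python sketch of the same idea using the GluonNLP model zoo API directly; it is not the embedding.py implementation itself, and the tokenization settings shown assume the uncased base model:
-
-.. code-block:: python
-
-    import mxnet as mx
-    import gluonnlp as nlp
-
-    # Load a pre-trained BERT base encoder without the pooler/decoder/classifier heads.
-    model, vocab = nlp.model.get_model('bert_12_768_12',
-                                       dataset_name='book_corpus_wiki_en_uncased',
-                                       pretrained=True, use_pooler=False,
-                                       use_decoder=False, use_classifier=False)
-    tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)
-
-    sentence = 'GluonNLP is a toolkit that enables easy text preprocessing.'
-    tokens = [vocab.cls_token] + tokenizer(sentence) + [vocab.sep_token]
-    token_ids = mx.nd.array([vocab.to_indices(tokens)])
-    token_types = mx.nd.zeros_like(token_ids)
-    valid_length = mx.nd.array([len(tokens)])
-
-    # Shape (1, num_tokens, 768): one embedding vector per (sub)word token.
-    token_embeddings = model(token_ids, token_types, valid_length)
-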
diff --git a/scripts/bert/model/__init__.py b/scripts/bert/model/__init__.py
deleted file mode 100644
index e1aae8e5ab..0000000000
--- a/scripts/bert/model/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""BERT model."""
-from . import qa
diff --git a/scripts/bert/model/qa.py b/scripts/bert/model/qa.py
deleted file mode 100644
index 39418bd54b..0000000000
--- a/scripts/bert/model/qa.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""BertForQA models."""
-
-__all__ = ['BertForQA', 'BertForQALoss']
-
-from mxnet.gluon import HybridBlock, loss, nn
-from mxnet.gluon.loss import Loss
-
-
-class BertForQA(HybridBlock):
- """Model for SQuAD task with BERT.
-
-    The model feeds token ids and token type ids into BERT to get the encoded
-    sequence representation, then applies a Dense layer to predict the answer span.
-
- Parameters
- ----------
- bert: BERTModel
- Bidirectional encoder with transformer.
- prefix : str or None
- See document of `mx.gluon.Block`.
- params : ParameterDict or None
- See document of `mx.gluon.Block`.
- """
-
- def __init__(self, bert, prefix=None, params=None):
- super(BertForQA, self).__init__(prefix=prefix, params=params)
- self.bert = bert
- with self.name_scope():
- self.span_classifier = nn.Dense(units=2, flatten=False)
-
- def __call__(self, inputs, token_types, valid_length=None):
- #pylint: disable=arguments-differ, dangerous-default-value
- """Generate the unnormalized score for the given the input sequences."""
- # XXX Temporary hack for hybridization as hybridblock does not support None inputs
- valid_length = [] if valid_length is None else valid_length
- return super(BertForQA, self).__call__(inputs, token_types, valid_length)
-
- def hybrid_forward(self, F, inputs, token_types, valid_length=None):
- # pylint: disable=arguments-differ
- """Generate the unnormalized score for the given the input sequences.
-
- Parameters
- ----------
- inputs : NDArray, shape (batch_size, seq_length)
- Input words for the sequences.
- token_types : NDArray, shape (batch_size, seq_length)
- Token types for the sequences, used to indicate whether the word belongs to the
- first sentence or the second one.
- valid_length : NDArray or None, shape (batch_size,)
- Valid length of the sequence. This is used to mask the padded tokens.
-
- Returns
- -------
- outputs : NDArray
- Shape (batch_size, seq_length, 2)
- """
- # XXX Temporary hack for hybridization as hybridblock does not support None inputs
- if isinstance(valid_length, list) and len(valid_length) == 0:
- valid_length = None
- bert_output = self.bert(inputs, token_types, valid_length)
- output = self.span_classifier(bert_output)
- return output
-
-
-class BertForQALoss(Loss):
- """Loss for SQuAD task with BERT.
-
- """
-
- def __init__(self, weight=None, batch_axis=0, **kwargs): # pylint: disable=unused-argument
- super(BertForQALoss, self).__init__(
- weight=None, batch_axis=0, **kwargs)
- self.loss = loss.SoftmaxCELoss()
-
- def hybrid_forward(self, F, pred, label): # pylint: disable=arguments-differ
- """
- Parameters
- ----------
- pred : NDArray, shape (batch_size, seq_length, 2)
- BERTSquad forward output.
- label : list, length is 2, each shape is (batch_size,1)
- label[0] is the starting position of the answer,
- label[1] is the ending position of the answer.
-
- Returns
- -------
- outputs : NDArray
- Shape (batch_size,)
- """
- pred = F.split(pred, axis=2, num_outputs=2)
- start_pred = pred[0].reshape((0, -3))
- start_label = label[0]
- end_pred = pred[1].reshape((0, -3))
- end_label = label[1]
- return (self.loss(start_pred, start_label) + self.loss(
- end_pred, end_label)) / 2
diff --git a/scripts/bert/pretraining_utils.py b/scripts/bert/pretraining_utils.py
deleted file mode 100644
index 876703240c..0000000000
--- a/scripts/bert/pretraining_utils.py
+++ /dev/null
@@ -1,526 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Utilities for pre-training."""
-import time
-import os
-import sys
-import logging
-import random
-import multiprocessing
-
-import numpy as np
-import mxnet as mx
-import gluonnlp as nlp
-
-from data.create_pretraining_data import create_training_instances
-
-
-__all__ = ['get_model_loss', 'get_pretrain_data_npz', 'get_dummy_dataloader',
- 'save_parameters', 'save_states', 'evaluate', 'split_and_load',
- 'get_pretrain_data_text', 'generate_dev_set', 'profile']
-
-def get_model_loss(ctx, model, pretrained, dataset_name, vocab, dtype,
- ckpt_dir=None, start_step=None):
- """Get model for pre-training.
-
- Parameters
- ----------
- ctx : Context or list of Context
- Contexts to initialize model
- model : str
- The name of the model, 'bert_12_768_12' or 'bert_24_1024_16'.
- pretrained : bool
- Whether to use pre-trained model weights as initialization.
- dataset_name : str
- The name of the dataset, which is used to retrieve the corresponding vocabulary file
- when the vocab argument is not provided. Options include 'book_corpus_wiki_en_uncased',
- 'book_corpus_wiki_en_cased', 'wiki_multilingual_uncased', 'wiki_multilingual_cased',
- 'wiki_cn_cased'.
- vocab : BERTVocab or None
-        The vocabulary for the model. If not provided, the vocabulary will be constructed
- based on dataset_name.
- dtype : float
- Data type of the model for training.
- ckpt_dir : str
- The path to the checkpoint directory.
- start_step : int or None
- If provided, it loads the model from the corresponding checkpoint from the ckpt_dir.
-
- Returns
- -------
- BERTForPretrain : the model for pre-training.
- BERTVocab : the vocabulary.
- """
- # model
- model, vocabulary = nlp.model.get_model(model, dataset_name=dataset_name, vocab=vocab,
- pretrained=pretrained, ctx=ctx,
- hparam_allow_override=True)
-
- if not pretrained:
- model.initialize(init=mx.init.Normal(0.02), ctx=ctx)
- model.cast(dtype)
-
- if ckpt_dir and start_step:
- param_path = os.path.join(ckpt_dir, '%07d.params'%start_step)
- nlp.utils.load_parameters(model, param_path, ctx=ctx, cast_dtype=True)
- logging.info('Loading step %d checkpoints from %s.', start_step, param_path)
-
- model.hybridize(static_alloc=True, static_shape=True)
-
- # losses
- nsp_loss = mx.gluon.loss.SoftmaxCELoss()
- mlm_loss = mx.gluon.loss.SoftmaxCELoss()
- nsp_loss.hybridize(static_alloc=True, static_shape=True)
- mlm_loss.hybridize(static_alloc=True, static_shape=True)
-
- model = BERTForPretrain(model, nsp_loss, mlm_loss, len(vocabulary))
- return model, vocabulary
-
-
-def prepare_pretrain_npz_dataset(filename, allow_pickle=False):
- """Create dataset based on the numpy npz file"""
- if isinstance(filename, (list, tuple)):
- assert len(filename) == 1, \
- 'When .npy/.npz data file is loaded, len(filename) must be 1.' \
- ' Received len(filename)={}.'.format(len(filename))
- filename = filename[0]
- logging.debug('start to load file %s ...', filename)
- return nlp.data.NumpyDataset(filename, allow_pickle=allow_pickle)
-
-
-def prepare_pretrain_text_dataset(filename, tokenizer, max_seq_length, short_seq_prob,
- masked_lm_prob, max_predictions_per_seq, whole_word_mask,
- vocab, num_workers=1, worker_pool=None):
- """Create dataset based on the raw text files"""
- dupe_factor = 1
- if not isinstance(filename, (list, tuple)):
- filename = [filename]
- logging.debug('start to load files %s ...', filename)
- instances = create_training_instances((filename, tokenizer, max_seq_length,
- short_seq_prob, masked_lm_prob,
- max_predictions_per_seq,
- whole_word_mask, vocab,
- dupe_factor, num_workers,
- worker_pool, None))
- return mx.gluon.data.ArrayDataset(*instances)
-
-
-def prepare_pretrain_bucket_sampler(dataset, batch_size, shuffle=False,
- num_ctxes=1, num_buckets=1):
- """Create data sampler based on the dataset"""
- if isinstance(dataset, nlp.data.NumpyDataset):
- lengths = dataset.get_field('valid_lengths')
- else:
- lengths = dataset.transform(lambda input_ids, segment_ids, masked_lm_positions, \
- masked_lm_ids, masked_lm_weights, \
- next_sentence_labels, valid_lengths: \
- valid_lengths, lazy=False)
- # calculate total batch size for all GPUs
- batch_size = batch_size * num_ctxes
- sampler = nlp.data.FixedBucketSampler(lengths,
- batch_size=batch_size,
- num_buckets=num_buckets,
- ratio=0,
- shuffle=shuffle)
- logging.debug('Sampler created for a new dataset:\n%s', sampler.stats())
- return sampler
-
-
-def get_pretrain_data_text(data, batch_size, num_ctxes, shuffle,
- num_buckets, vocab, tokenizer, max_seq_length, short_seq_prob,
- masked_lm_prob, max_predictions_per_seq, whole_word_mask,
- num_parts=1, part_idx=0, num_dataset_workers=1, num_batch_workers=1,
- circle_length=1, repeat=1,
- dataset_cached=False, num_max_dataset_cached=0):
- """Get a data iterator from raw text documents.
-
- Parameters
- ----------
- batch_size : int
- The batch size per GPU.
- num_ctxes : int
- The number of GPUs.
- shuffle : bool
- Whether to shuffle the data.
- num_buckets : int
- The number of buckets for the FixedBucketSampler for training.
- vocab : BERTVocab
- The vocabulary.
- tokenizer : BERTTokenizer or BERTSPTokenizer
- The tokenizer.
- max_seq_length : int
- The hard limit of maximum sequence length of sentence pairs.
- short_seq_prob : float
- The probability of sampling sequences shorter than the max_seq_length.
- masked_lm_prob : float
- The probability of replacing texts with masks/random words/original words.
- max_predictions_per_seq : int
- The hard limit of the number of predictions for masked words
- whole_word_mask : bool
- Whether to use whole word masking.
- num_parts : int
- The number of partitions for the dataset.
- part_idx : int
- The index of the partition to read.
- num_dataset_workers : int
- The number of worker processes for dataset construction.
- num_batch_workers : int
- The number of worker processes for batch construction.
- circle_length : int, default is 1
- The number of files to be read for a single worker at the same time.
- When circle_length is larger than 1, we merge circle_length files.
- repeat : int, default is 1
- The number of times that files are repeated.
- dataset_cached : bool, default is False
-        Whether or not to cache the last processed dataset.
-        Each processed dataset can only be cached once.
-        When there is no newly processed dataset available to fetch,
-        we pop a cached processed dataset.
- num_max_dataset_cached : int, default is 0
- Maximum number of cached datasets. It is valid only if dataset_cached is True
- """
- num_files = len(nlp.utils.glob(data))
- logging.info('%d files are found.', num_files)
- assert num_files >= num_parts, \
- 'The number of text files must be no less than the number of ' \
- 'workers/partitions (%d). Only %d files at %s are found.'%(num_parts, num_files, data)
- dataset_params = {'tokenizer': tokenizer, 'max_seq_length': max_seq_length,
- 'short_seq_prob': short_seq_prob, 'masked_lm_prob': masked_lm_prob,
- 'max_predictions_per_seq': max_predictions_per_seq, 'vocab':vocab,
- 'whole_word_mask': whole_word_mask}
- sampler_params = {'batch_size': batch_size, 'shuffle': shuffle,
- 'num_ctxes': num_ctxes, 'num_buckets': num_buckets}
- dataset_fn = prepare_pretrain_text_dataset
- sampler_fn = prepare_pretrain_bucket_sampler
- pad_val = vocab[vocab.padding_token]
- batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(pad_val=pad_val, round_to=8), # input_id
- nlp.data.batchify.Pad(pad_val=pad_val), # masked_id
- nlp.data.batchify.Pad(pad_val=0), # masked_position
- nlp.data.batchify.Pad(pad_val=0), # masked_weight
- nlp.data.batchify.Stack(), # next_sentence_label
- nlp.data.batchify.Pad(pad_val=0, round_to=8), # segment_id
- nlp.data.batchify.Stack())
- split_sampler = nlp.data.SplitSampler(num_files, num_parts=num_parts,
- part_index=part_idx, repeat=repeat)
- dataloader = nlp.data.DatasetLoader(data,
- file_sampler=split_sampler,
- dataset_fn=dataset_fn,
- batch_sampler_fn=sampler_fn,
- dataset_params=dataset_params,
- batch_sampler_params=sampler_params,
- batchify_fn=batchify_fn,
- num_dataset_workers=num_dataset_workers,
- num_batch_workers=num_batch_workers,
- pin_memory=False,
- circle_length=circle_length,
- dataset_cached=dataset_cached,
- num_max_dataset_cached=num_max_dataset_cached)
- return dataloader
-
-
-def get_pretrain_data_npz(data, batch_size, num_ctxes,
- shuffle, num_buckets,
- vocab, num_parts=1, part_idx=0,
- num_dataset_workers=1, num_batch_workers=1,
- circle_length=1, repeat=1,
- dataset_cached=False, num_max_dataset_cached=0):
- """Get a data iterator from pre-processed npz files.
-
- Parameters
- ----------
- batch_size : int
- The batch size per GPU.
- num_ctxes : int
- The number of GPUs.
- shuffle : bool
- Whether to shuffle the data.
- num_buckets : int
- The number of buckets for the FixedBucketSampler for training.
- vocab : BERTVocab
- The vocabulary.
- num_parts : int
- The number of partitions for the dataset.
- part_idx : int
- The index of the partition to read.
- num_dataset_workers : int
- The number of worker processes for dataset construction.
- num_batch_workers : int
-        The number of worker processes for batch construction.
- circle_length : int, default is 1
- The number of files to be read for a single worker at the same time.
- When circle_length is larger than 1, we merge circle_length files.
- repeat : int, default is 1
- The number of times that files are repeated.
- dataset_cached : bool, default is False
-        Whether or not to cache the last processed dataset.
-        Each processed dataset can only be cached once.
-        When there is no newly processed dataset available to fetch,
-        we pop a cached processed dataset.
- num_max_dataset_cached : int, default is 0
- Maximum number of cached datasets. It is valid only if dataset_cached is True
- """
- num_files = len(nlp.utils.glob(data))
- logging.info('%d files are found.', num_files)
- assert num_files >= num_parts, \
- 'The number of text files must be no less than the number of ' \
- 'workers/partitions (%d). Only %d files at %s are found.'%(num_parts, num_files, data)
- dataset_params = {'allow_pickle': True}
- sampler_params = {'batch_size': batch_size, 'shuffle': shuffle,
- 'num_ctxes': num_ctxes, 'num_buckets': num_buckets}
- dataset_fn = prepare_pretrain_npz_dataset
- sampler_fn = prepare_pretrain_bucket_sampler
- pad_val = vocab[vocab.padding_token]
- batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(pad_val=pad_val, round_to=8), # input_id
- nlp.data.batchify.Pad(pad_val=pad_val), # masked_id
- nlp.data.batchify.Pad(pad_val=0), # masked_position
- nlp.data.batchify.Pad(pad_val=0), # masked_weight
- nlp.data.batchify.Stack(), # next_sentence_label
- nlp.data.batchify.Pad(pad_val=0, round_to=8), # segment_id
- nlp.data.batchify.Stack())
- split_sampler = nlp.data.SplitSampler(num_files, num_parts=num_parts,
- part_index=part_idx, repeat=repeat)
- dataloader = nlp.data.DatasetLoader(data,
- file_sampler=split_sampler,
- dataset_fn=dataset_fn,
- batch_sampler_fn=sampler_fn,
- dataset_params=dataset_params,
- batch_sampler_params=sampler_params,
- batchify_fn=batchify_fn,
- num_dataset_workers=num_dataset_workers,
- num_batch_workers=num_batch_workers,
- pin_memory=False,
- circle_length=circle_length,
- dataset_cached=dataset_cached,
- num_max_dataset_cached=num_max_dataset_cached)
- return dataloader
-
-
-def get_dummy_dataloader(batch_size, seq_len, max_predict):
- """Return a dummy data loader which returns a fixed data batch of target shape"""
- class DummyIter():
- def __init__(self, batch):
- self._batch = batch
-
- def __iter__(self):
- while True:
- yield self._batch
- data_batch = ((mx.nd.zeros((batch_size, seq_len)),
- mx.nd.zeros((batch_size, max_predict)),
- mx.nd.zeros((batch_size, max_predict)),
- mx.nd.zeros((batch_size, max_predict)),
- mx.nd.ones((batch_size,)) * seq_len,
- mx.nd.zeros((batch_size, seq_len)),
- mx.nd.ones((batch_size,)) * seq_len))
- return DummyIter(data_batch)
-
-
-def save_parameters(step_num, model, ckpt_dir):
- """Save the model parameter, marked by step_num."""
- param_path = os.path.join(ckpt_dir, '%07d.params'%step_num)
- logging.info('[step %d] Saving model params to %s.', step_num, param_path)
- nlp.utils.save_parameters(model, param_path)
-
-def save_states(step_num, trainer, ckpt_dir, local_rank=0):
- """Save the trainer states, marked by step_num."""
- trainer_path = os.path.join(ckpt_dir, '%07d.states.%02d'%(step_num, local_rank))
- logging.info('[step %d] Saving trainer states to %s.', step_num, trainer_path)
- nlp.utils.save_states(trainer, trainer_path)
-
-def log_noacc(begin_time, running_num_tks, running_mlm_loss, running_nsp_loss, step_num,
- trainer, log_interval):
- """Log training progress."""
- end_time = time.time()
- duration = end_time - begin_time
- throughput = running_num_tks / duration / 1000.0
- running_mlm_loss = running_mlm_loss / log_interval
- running_nsp_loss = running_nsp_loss / log_interval
- lr = trainer.learning_rate if trainer else 0
- # pylint: disable=line-too-long
- logging.info('[step {}]\tmlm_loss={:7.5f}\tnsp_loss={:5.2f}\tthroughput={:.1f}K tks/s\tlr={:.7f} time={:.2f}, latency={:.1f} ms/step'
- .format(step_num, running_mlm_loss.asscalar(), running_nsp_loss.asscalar(),
- throughput.asscalar(), lr, duration, duration*1000/log_interval))
- # pylint: enable=line-too-long
-
-def log(begin_time, running_num_tks, running_mlm_loss, running_nsp_loss, step_num,
- mlm_metric, nsp_metric, trainer, log_interval):
- """Log training progress."""
- end_time = time.time()
- duration = end_time - begin_time
- throughput = running_num_tks / duration / 1000.0
- running_mlm_loss = running_mlm_loss / log_interval
- running_nsp_loss = running_nsp_loss / log_interval
- lr = trainer.learning_rate if trainer else 0
- # pylint: disable=line-too-long
- logging.info('[step {}]\tmlm_loss={:7.5f}\tmlm_acc={:4.2f}\tnsp_loss={:5.2f}\tnsp_acc={:5.2f}\tthroughput={:.1f}K tks/s\tlr={:.7f} time={:.2f}, latency={:.1f} ms/step'
- .format(step_num, running_mlm_loss.asscalar(), mlm_metric.get()[1] * 100, running_nsp_loss.asscalar(),
- nsp_metric.get()[1] * 100, throughput.asscalar(), lr, duration, duration*1000/log_interval))
- # pylint: enable=line-too-long
-
-
-def split_and_load(arrs, ctx):
- """split and load arrays to a list of contexts"""
- assert isinstance(arrs, (list, tuple))
- # split and load
- loaded_arrs = [mx.gluon.utils.split_and_load(arr, ctx, even_split=False) for arr in arrs]
- return zip(*loaded_arrs)
-
-
-class BERTForPretrain(mx.gluon.Block):
- """Model for pre-training MLM and NSP with BERT.
-
- Parameters
- ----------
- bert: BERTModel
- Bidirectional encoder with transformer.
- mlm_loss : Loss or None
- nsp_loss : Loss or None
- vocab_size : int
- prefix : str or None
- See document of `mx.gluon.Block`.
- params : ParameterDict or None
- See document of `mx.gluon.Block`.
- """
-
- def __init__(self, bert, mlm_loss, nsp_loss, vocab_size, prefix=None, params=None):
- super(BERTForPretrain, self).__init__(prefix=prefix, params=params)
- self.bert = bert
- self.mlm_loss = mlm_loss
- self.nsp_loss = nsp_loss
- self._vocab_size = vocab_size
-
- def forward(self, input_id, masked_id, masked_position, masked_weight,
- next_sentence_label=None, segment_id=None, valid_length=None):
- # pylint: disable=arguments-differ
- """Predict with BERT for MLM and NSP. """
- num_masks = masked_weight.sum() + 1e-8
- valid_length = valid_length.reshape(-1)
- masked_id = masked_id.reshape(-1)
- _, _, classified, decoded = self.bert(input_id, segment_id, valid_length, masked_position)
- decoded = decoded.reshape((-1, self._vocab_size))
- ls1 = self.mlm_loss(decoded.astype('float32', copy=False),
- masked_id, masked_weight.reshape((-1, 1)))
- ls2 = self.nsp_loss(classified.astype('float32', copy=False), next_sentence_label)
- ls1 = ls1.sum() / num_masks
- ls2 = ls2.mean()
- return classified, decoded, ls1, ls2
-
-
-def evaluate(data_eval, model, ctx, log_interval, dtype):
- """Evaluation function."""
- logging.info('Running evaluation ... ')
- mlm_metric = nlp.metric.MaskedAccuracy()
- nsp_metric = nlp.metric.MaskedAccuracy()
- mlm_metric.reset()
- nsp_metric.reset()
-
- eval_begin_time = time.time()
- begin_time = time.time()
- step_num = 0
- running_mlm_loss = running_nsp_loss = 0
- total_mlm_loss = total_nsp_loss = 0
- running_num_tks = 0
- for _, data_batch in enumerate(data_eval):
- step_num += 1
-
- data_list = split_and_load(data_batch, ctx)
- ns_label_list, ns_pred_list = [], []
- mask_label_list, mask_pred_list, mask_weight_list = [], [], []
- for data in data_list:
- (input_id, masked_id, masked_position, masked_weight, \
- next_sentence_label, segment_id, valid_length) = data
- valid_length = valid_length.astype(dtype, copy=False)
- out = model(input_id, masked_id, masked_position, masked_weight, \
- next_sentence_label, segment_id, valid_length)
- classified, decoded, ls1, ls2 = out
- masked_id = masked_id.reshape(-1)
- ns_label_list.append(next_sentence_label)
- ns_pred_list.append(classified)
- mask_label_list.append(masked_id)
- mask_pred_list.append(decoded)
- mask_weight_list.append(masked_weight)
-
- valid_length = valid_length.astype('float32', copy=False)
- running_mlm_loss += ls1.as_in_context(mx.cpu())
- running_nsp_loss += ls2.as_in_context(mx.cpu())
- running_num_tks += valid_length.sum().as_in_context(mx.cpu())
- nsp_metric.update(ns_label_list, ns_pred_list)
- mlm_metric.update(mask_label_list, mask_pred_list, mask_weight_list)
-
- # logging
- if (step_num + 1) % (log_interval) == 0:
- total_mlm_loss += running_mlm_loss
- total_nsp_loss += running_nsp_loss
- log(begin_time, running_num_tks, running_mlm_loss, running_nsp_loss,
- step_num, mlm_metric, nsp_metric, None, log_interval)
- begin_time = time.time()
- running_mlm_loss = running_nsp_loss = running_num_tks = 0
- mlm_metric.reset_local()
- nsp_metric.reset_local()
-
- mx.nd.waitall()
- eval_end_time = time.time()
- # accumulate losses from last few batches, too
- if running_mlm_loss != 0:
- total_mlm_loss += running_mlm_loss
- total_nsp_loss += running_nsp_loss
- total_mlm_loss /= step_num
- total_nsp_loss /= step_num
- logging.info('Eval mlm_loss={:.3f}\tmlm_acc={:.1f}\tnsp_loss={:.3f}\tnsp_acc={:.1f}\t'
- .format(total_mlm_loss.asscalar(), mlm_metric.get_global()[1] * 100,
- total_nsp_loss.asscalar(), nsp_metric.get_global()[1] * 100))
- logging.info('Eval cost={:.1f}s'.format(eval_end_time - eval_begin_time))
-
-
-def generate_dev_set(tokenizer, vocab, cache_file, args):
- """Generate validation set."""
- # set random seed to generate dev data deterministically
- np.random.seed(0)
- random.seed(0)
- mx.random.seed(0)
- worker_pool = multiprocessing.Pool()
- eval_files = nlp.utils.glob(args.data_eval)
- num_files = len(eval_files)
-    assert num_files > 0, 'Number of eval files must be greater than 0. ' \
- 'Only found %d files at %s'%(num_files, args.data_eval)
- logging.info('Generating validation set from %d files on rank 0.', len(eval_files))
- create_training_instances((eval_files, tokenizer, args.max_seq_length,
- args.short_seq_prob, args.masked_lm_prob,
- args.max_predictions_per_seq,
- args.whole_word_mask, vocab,
- 1, args.num_dataset_workers,
- worker_pool, cache_file))
- logging.info('Done generating validation set on rank 0.')
-
-def profile(curr_step, start_step, end_step, profile_name='profile.json',
- early_exit=True):
- """profile the program between [start_step, end_step)."""
- if curr_step == start_step:
- mx.nd.waitall()
- mx.profiler.set_config(profile_memory=False, profile_symbolic=True,
- profile_imperative=True, filename=profile_name,
- aggregate_stats=True)
- mx.profiler.set_state('run')
- elif curr_step == end_step:
- mx.nd.waitall()
- mx.profiler.set_state('stop')
- logging.info(mx.profiler.dumps())
- mx.profiler.dump()
- if early_exit:
- sys.exit(0)
diff --git a/scripts/bert/run_pretraining.py b/scripts/bert/run_pretraining.py
deleted file mode 100644
index 8a5b4bb295..0000000000
--- a/scripts/bert/run_pretraining.py
+++ /dev/null
@@ -1,479 +0,0 @@
-"""
-Pre-training Bidirectional Encoder Representations from Transformers
-=========================================================================================
-This example shows how to pre-train a BERT model with Gluon NLP Toolkit.
-@article{devlin2018bert,
- title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
- author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
- journal={arXiv preprint arXiv:1810.04805},
- year={2018}
-}
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-
-import os
-import sys
-import random
-import warnings
-import logging
-import functools
-import time
-import argparse
-
-import mxnet as mx
-import gluonnlp as nlp
-try:
- import horovod.mxnet as hvd
-except ImportError:
- pass
-
-from fp16_utils import FP16Trainer
-from pretraining_utils import get_model_loss, get_pretrain_data_npz, get_dummy_dataloader
-from pretraining_utils import split_and_load, log, log_noacc, evaluate
-from pretraining_utils import save_parameters, save_states, profile
-from pretraining_utils import get_pretrain_data_text, generate_dev_set
-
-# parser
-parser = argparse.ArgumentParser(description='BERT pretraining example.')
-# logging and serialization
-parser.add_argument('--ckpt_dir', type=str, default='./ckpt_dir',
- help='Path to checkpoint directory')
-parser.add_argument('--log_interval', type=int, default=250, help='Report interval')
-parser.add_argument('--ckpt_interval', type=int, default=25000, help='Checkpoint interval')
-# model
-parser.add_argument('--pretrained', action='store_true',
- help='Initialize the model with pretrained weights')
-parser.add_argument('--model', type=str, default='bert_12_768_12',
- choices=['bert_12_768_12', 'bert_24_1024_16'],
- help='Model to pre-train.')
-parser.add_argument('--dataset_name', type=str, default='book_corpus_wiki_en_uncased',
- choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased',
- 'wiki_multilingual_uncased', 'wiki_multilingual_cased',
- 'wiki_cn_cased'],
- help='The pre-defined dataset from which the vocabulary is created.')
-# training
-parser.add_argument('--data', type=str, default=None,
- help='Path to training data file. File name with wildcard such as '
- 'dir/*.train is accepted.')
-parser.add_argument('--total_batch_size', type=int, default=256,
- help='Global effective batch size. '
- 'total_batch_size = batch_size_per_worker * num_worker * accumulate.')
-parser.add_argument('--accumulate', type=int, default=1,
- help='Number of batches for gradient accumulation. '
- 'total_batch_size = batch_size_per_worker * num_worker * accumulate.')
-parser.add_argument('--num_steps', type=int, default=20, help='Number of optimization steps')
-parser.add_argument('--optimizer', type=str, default='bertadam',
- help='The optimization algorithm')
-parser.add_argument('--start_step', type=int, default=0,
- help='Start optimization step from the checkpoint.')
-parser.add_argument('--lr', type=float, default=1e-4, help='Learning rate')
-parser.add_argument('--warmup_ratio', type=float, default=0.01,
- help='ratio of warmup steps used in NOAM\'s stepsize schedule')
-parser.add_argument('--dtype', type=str, default='float16', help='data dtype')
-parser.add_argument('--no_compute_acc', action='store_true',
- help='skip accuracy metric computation during training')
-# validation
-parser.add_argument('--eval_interval', type=int, default=50000, help='Evaluation interval')
-parser.add_argument('--total_batch_size_eval', type=int, default=256,
- help='Global batch size for evaluation. total_batch_size_eval = '
- 'batch_size_eval_per_worker * num_worker * accumulate.')
-parser.add_argument('--data_eval', type=str, required=True,
- help='Path to evaluation data file. File name with wildcard such as '
- 'dir/*.dev is accepted.')
-parser.add_argument('--eval_use_npz', action='store_true',
- help='Set to True if --data_eval provides npz files instead of raw text files')
-# debugging
-parser.add_argument('--synthetic_data', action='store_true',
- help='If provided, synthetic data is used for training')
-parser.add_argument('--verbose', action='store_true', help='verbose logging')
-parser.add_argument('--profile', type=str, default=None,
- help='output profiling result to the provided file path')
-# data pre-processing
-parser.add_argument('--num_buckets', type=int, default=1,
- help='Number of buckets for variable length sequence sampling')
-parser.add_argument('--raw', action='store_true',
- help='If set, both training and dev samples are generated on-the-fly '
- 'from raw texts instead of pre-processed npz files. ')
-parser.add_argument('--max_seq_length', type=int, default=512,
- help='Maximum input sequence length. Effective only if --raw is set.')
-parser.add_argument('--short_seq_prob', type=float, default=0,
- help='The probability of producing sequences shorter than max_seq_length. '
- 'Effective only if --raw is set.')
-parser.add_argument('--masked_lm_prob', type=float, default=0.15,
- help='Probability for masks. Effective only if --raw is set.')
-parser.add_argument('--max_predictions_per_seq', type=int, default=80,
- help='Maximum number of predictions per sequence. '
- 'Effective only if --raw is set.')
-parser.add_argument('--cased', action='store_true',
- help='Whether to tokenize with cased characters. '
- 'Effective only if --raw is set.')
-parser.add_argument('--whole_word_mask', action='store_true',
- help='Whether to use whole word masking rather than per-subword masking.'
- 'Effective only if --raw is set.')
-parser.add_argument('--sentencepiece', default=None, type=str,
- help='Path to the sentencepiece .model file for both tokenization and vocab. '
- 'Effective only if --raw is set.')
-parser.add_argument('--num_dataset_workers', type=int, default=4,
- help='Number of workers to pre-process dataset.')
-parser.add_argument('--num_batch_workers', type=int, default=2,
- help='Number of workers to pre-process mini-batch.')
-parser.add_argument('--circle_length', type=int, default=2,
- help='Number of files to be read for a single GPU at the same time.')
-parser.add_argument('--repeat', type=int, default=8,
- help='Number of times that files are repeated in each shuffle.')
-parser.add_argument('--dataset_cached', action='store_true',
- help='Whether or not to cache the last processed training dataset.')
-parser.add_argument('--num_max_dataset_cached', type=int, default=0,
- help='Maximum number of cached processed training dataset.')
-# stage 2
-parser.add_argument('--phase2', action='store_true', help='phase 2 training')
-parser.add_argument('--phase1_num_steps', type=int, help='number of steps for phase 1')
-# communication
-parser.add_argument('--comm_backend', type=str, default='device',
- choices=['horovod', 'dist_sync_device', 'device'],
- help='Communication backend.')
-parser.add_argument('--gpus', type=str, default=None,
- help='List of gpus to run when device or dist_sync_device is used for '
- 'communication, e.g. 0 or 0,2,5. empty means using cpu.')
-args = parser.parse_args()
-
-# logging
-nlp.utils.mkdir(args.ckpt_dir)
-level = logging.DEBUG if args.verbose else logging.INFO
-os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
-
-class DataParallelBERT(nlp.utils.Parallelizable):
- """Data parallel BERT model.
-
- Parameters
- ----------
- model : Block
- The BERT model.
- """
- def __init__(self, model, trainer):
- self._model = model
- self._trainer = trainer
-
- def forward_backward(self, x):
- """forward backward implementation"""
- (input_id, masked_id, masked_position, masked_weight, \
- next_sentence_label, segment_id, valid_length) = x
-
- valid_length = valid_length.astype(args.dtype, copy=False)
- with mx.autograd.record():
- out = self._model(input_id, masked_id, masked_position, masked_weight,
- next_sentence_label, segment_id, valid_length)
- classified, decoded, ls1, ls2 = out
- ls = ls1 + ls2
- ls = ls / args.accumulate
- if self._trainer:
- self._trainer.backward(ls)
- else:
- ls.backward()
-
- masked_id = masked_id.reshape(-1)
- valid_length = valid_length.astype('float32', copy=False)
- return next_sentence_label, classified, masked_id, decoded, \
- masked_weight, ls1, ls2, valid_length
-
-def init_comm(backend):
- """Init communication backend"""
- # backend specific implementation
- if backend == 'horovod':
- try:
- import horovod.mxnet as hvd # pylint: disable=import-outside-toplevel
- except ImportError:
- logging.info('horovod must be installed.')
- sys.exit(1)
- hvd.init()
- store = None
- num_workers = hvd.size()
- rank = hvd.rank()
- local_rank = hvd.local_rank()
- is_master_node = rank == local_rank
- ctxs = [mx.gpu(local_rank)]
- else:
- # kvstore
- store = mx.kv.create(backend)
- num_workers = store.num_workers
- rank = store.rank
- local_rank = 0
- is_master_node = rank == local_rank
- ctxs = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
- [mx.gpu(int(x)) for x in args.gpus.split(',')]
- return store, num_workers, rank, local_rank, is_master_node, ctxs
-
-backend = args.comm_backend
-store, num_workers, rank, local_rank, is_master_node, ctxs = init_comm(backend)
-
-filename = os.path.join(args.ckpt_dir,
- ('phase1_log.' if not args.phase2 else 'phase2_log.') + str(rank))
-logging.basicConfig(filename=filename)
-logging.getLogger().setLevel(level)
-logging.info(args)
-logging.info(os.environ)
-
-assert args.total_batch_size % (args.accumulate * num_workers) == 0
-assert args.total_batch_size_eval % (args.accumulate * num_workers) == 0
-batch_size = int(args.total_batch_size / num_workers / args.accumulate)
-batch_size_eval = int(args.total_batch_size_eval / num_workers / args.accumulate)
-assert batch_size > 0
-assert batch_size_eval > 0
-
-def train(data_train, data_eval, model):
- """Training function."""
- # backend specific implementation
- param_dict = model.bert.collect_params()
- if backend == 'horovod':
- hvd.broadcast_parameters(param_dict, root_rank=0)
-
- mlm_metric = nlp.metric.MaskedAccuracy()
- nsp_metric = nlp.metric.MaskedAccuracy()
- mlm_metric.reset()
- nsp_metric.reset()
-
- logging.info('Creating distributed trainer...')
- lr = args.lr
- optim_params = {'learning_rate': lr, 'epsilon': 1e-6, 'wd': 0.01}
- if args.dtype == 'float16':
- optim_params['multi_precision'] = True
-
- dynamic_loss_scale = args.dtype == 'float16'
- if dynamic_loss_scale:
- loss_scale_param = {'scale_window': 2000 / num_workers, 'init_scale': 2**10}
- else:
- loss_scale_param = None
-
- # backend specific implementation
- if backend == 'horovod':
- trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optim_params)
- else:
- trainer = mx.gluon.Trainer(param_dict, args.optimizer, optim_params,
- update_on_kvstore=False)
- fp16_trainer = FP16Trainer(trainer, dynamic_loss_scale=dynamic_loss_scale,
- loss_scaler_params=loss_scale_param)
-
- if args.start_step:
- state_path = os.path.join(args.ckpt_dir, '%07d.states.%02d'%(args.start_step, local_rank))
- logging.info('Loading trainer state from %s', state_path)
- nlp.utils.load_states(trainer, state_path)
-
- accumulate = args.accumulate
- num_train_steps = args.num_steps
- warmup_ratio = args.warmup_ratio
- num_warmup_steps = int(num_train_steps * warmup_ratio)
- params = [p for p in param_dict.values() if p.grad_req != 'null']
-
- # Do not apply weight decay on LayerNorm and bias terms
- for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
- v.wd_mult = 0.0
- if accumulate > 1:
- for p in params:
- p.grad_req = 'add'
-
- train_begin_time = time.time()
- begin_time = time.time()
- running_mlm_loss, running_nsp_loss = 0, 0
- running_num_tks = 0
- batch_num = 0
- step_num = args.start_step
-
- if args.phase2:
- step_num -= args.phase1_num_steps
-
- logging.info('Training started')
-
- # create dummy data loader if needed
- parallel_model = DataParallelBERT(model, trainer=fp16_trainer)
- num_ctxes = len(ctxs)
- parallel = nlp.utils.Parallel(num_ctxes if num_ctxes > 1 else 0, parallel_model)
-
- while step_num < num_train_steps:
-
- data_train_iter = iter(data_train)
- end_of_batch = False
- next_data_batch = next(data_train_iter)
- while not end_of_batch:
- data_batch = next_data_batch
- if step_num >= num_train_steps:
- break
- if batch_num % accumulate == 0:
- step_num += 1
- # update learning rate
- if step_num <= num_warmup_steps:
- new_lr = lr * step_num / num_warmup_steps
- else:
- offset = (num_train_steps - step_num) / (num_train_steps - num_warmup_steps)
- new_lr = lr * max(offset, 0)
- trainer.set_learning_rate(new_lr)
- if args.profile:
- profile(step_num, 10, 14, profile_name=args.profile + str(rank))
-
- # load data
- data_list = list(split_and_load(data_batch, ctxs))
-
- ns_label_list, ns_pred_list = [], []
- mask_label_list, mask_pred_list, mask_weight_list = [], [], []
-
- num_data = len(data_list)
- for i in range(num_data):
- parallel.put(data_list[i])
- for _ in range(num_data):
- (next_sentence_label, classified, masked_id,
- decoded, masked_weight, ls1, ls2, valid_length) = parallel.get()
- ns_label_list.append(next_sentence_label)
- ns_pred_list.append(classified)
- mask_label_list.append(masked_id)
- mask_pred_list.append(decoded)
- mask_weight_list.append(masked_weight)
- running_mlm_loss += ls1.as_in_context(mx.cpu()) / len(ctxs)
- running_nsp_loss += ls2.as_in_context(mx.cpu()) / len(ctxs)
- running_num_tks += valid_length.sum().as_in_context(mx.cpu())
- # pre fetch next batch
- try:
- next_data_batch = next(data_train_iter)
- except StopIteration:
- end_of_batch = True
-
- # update
- if (batch_num + 1) % accumulate == 0:
- fp16_trainer.step(1, max_norm=1.0 * num_workers)
- if accumulate > 1:
- param_dict.zero_grad()
- # update metrics
- if args.no_compute_acc:
- mask_pred_list[0].wait_to_read()
- else:
- nsp_metric.update(ns_label_list, ns_pred_list)
- mlm_metric.update(mask_label_list, mask_pred_list, mask_weight_list)
-
- # logging
- if step_num % (args.log_interval) == 0 and (batch_num + 1) % accumulate == 0:
- if args.no_compute_acc:
- log_noacc(begin_time, running_num_tks, running_mlm_loss / accumulate,
- running_nsp_loss / accumulate, step_num,
- trainer, args.log_interval)
- else:
- log(begin_time, running_num_tks, running_mlm_loss / accumulate,
- running_nsp_loss / accumulate, step_num, mlm_metric, nsp_metric,
- trainer, args.log_interval)
- mlm_metric.reset_local()
- nsp_metric.reset_local()
- begin_time = time.time()
- running_mlm_loss = running_nsp_loss = running_num_tks = 0
-
- # saving checkpoints
- if step_num % args.ckpt_interval == 0 and (batch_num + 1) % accumulate == 0:
- if is_master_node:
- save_states(step_num, trainer, args.ckpt_dir, local_rank)
- if local_rank == 0:
- save_parameters(step_num, model.bert, args.ckpt_dir)
- if step_num % args.eval_interval == 0 and data_eval \
- and (batch_num + 1) % accumulate == 0:
- # eval data is always based on a fixed npz file.
- dataset_eval = get_pretrain_data_npz(data_eval, batch_size_eval,
- 1, False, 1, vocab)
- evaluate(dataset_eval, model, ctxs, args.log_interval, args.dtype)
-
- batch_num += 1
-
- if is_master_node:
- save_states(step_num, trainer, args.ckpt_dir, local_rank)
- if local_rank == 0:
- save_parameters(step_num, model.bert, args.ckpt_dir)
- mx.nd.waitall()
- train_end_time = time.time()
- logging.info('Train cost={:.1f}s'.format(train_end_time - train_begin_time))
-
-if __name__ == '__main__':
- random_seed = random.randint(0, 1000)
-
- dataset_name, vocab = args.dataset_name, None
- if args.sentencepiece:
- logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece)
- if args.dataset_name:
- warnings.warn('Both --dataset_name and --sentencepiece are provided. '
- 'The vocabulary will be loaded based on --sentencepiece')
- dataset_name = None
- vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
-
- model, vocab = get_model_loss(ctxs, args.model, args.pretrained,
- dataset_name, vocab, args.dtype,
- ckpt_dir=args.ckpt_dir,
- start_step=args.start_step)
- logging.info('Model created')
- data_eval = args.data_eval
-
- if args.raw:
- if args.sentencepiece:
- tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab,
- lower=not args.cased)
- else:
- tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=not args.cased)
-
- cache_dir = os.path.join(args.ckpt_dir, 'data_eval_cache')
- cache_file = os.path.join(cache_dir, 'part-000.npz')
- nlp.utils.mkdir(cache_dir)
-
- # generate dev dataset from the raw text if needed
- if not args.eval_use_npz:
- data_eval = cache_file
- if not os.path.isfile(cache_file) and rank == 0:
- generate_dev_set(tokenizer, vocab, cache_file, args)
-
- logging.debug('Random seed set to %d', random_seed)
- mx.random.seed(random_seed)
-
- if args.data:
- if args.raw:
- get_dataset_fn = functools.partial(get_pretrain_data_text,
- max_seq_length=args.max_seq_length,
- short_seq_prob=args.short_seq_prob,
- masked_lm_prob=args.masked_lm_prob,
- max_predictions_per_seq=args.max_predictions_per_seq,
- whole_word_mask=args.whole_word_mask,
- tokenizer=tokenizer,
- circle_length=args.circle_length,
- repeat=args.repeat,
- dataset_cached=args.dataset_cached,
- num_max_dataset_cached=args.num_max_dataset_cached)
- else:
- get_dataset_fn = get_pretrain_data_npz
-
- if args.synthetic_data:
- data_train = get_dummy_dataloader(batch_size, args.max_seq_length,
- args.max_predictions_per_seq)
- else:
- shuffle = True
- logging.info('args.num_buckets: {}, num_workers: {}, rank: {}'.format(args.num_buckets,
- num_workers,
- rank))
- data_train = get_dataset_fn(args.data, batch_size,
- len(ctxs), shuffle, args.num_buckets, vocab,
- num_parts=num_workers, part_idx=rank,
- num_dataset_workers=args.num_dataset_workers,
- num_batch_workers=args.num_batch_workers)
- train(data_train, data_eval, model)
- if data_eval:
- # eval data is always based on a fixed npz file.
- shuffle = False
- dataset_eval = get_pretrain_data_npz(data_eval, batch_size_eval,
- len(ctxs), shuffle, 1, vocab)
- evaluate(dataset_eval, model, ctxs, args.log_interval, args.dtype)
diff --git a/scripts/bert/sample_text.txt b/scripts/bert/sample_text.txt
deleted file mode 100644
index a42812060c..0000000000
--- a/scripts/bert/sample_text.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
-Text should be one-sentence-per-line, with empty lines between documents.
-This sample text is public domain and was randomly selected from Project Guttenberg.
-
-The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
-Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
-Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
-"Cass" Beard had risen early that morning, but not with a view to discovery.
-A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
-The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
-This was nearly opposite.
-Mr. Cassius crossed the highway, and stopped suddenly.
-Something glittered in the nearest red pool before him.
-Gold, surely!
-But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
-Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
-Like most of his fellow gold-seekers, Cass was superstitious.
-
-The fountain of classic wisdom, Hypatia herself.
-As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
-From my youth I felt in me a soul above the matter-entangled herd.
-She revealed to me the glorious fact, that I am a spark of Divinity itself.
-A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
-There is a philosophic pleasure in opening one's treasures to the modest young.
-Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
-Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
-but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
-Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
-His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
-while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
-At last they reached the quay at the opposite end of the street;
-and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
-He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
diff --git a/scripts/bert/utils.py b/scripts/bert/utils.py
deleted file mode 100644
index c2c0b5694d..0000000000
--- a/scripts/bert/utils.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility functions for BERT."""
-
-import sys
-import logging
-import collections
-import hashlib
-import io
-
-import mxnet as mx
-import gluonnlp as nlp
-
-__all__ = ['tf_vocab_to_gluon_vocab', 'load_text_vocab']
-
-
-def tf_vocab_to_gluon_vocab(tf_vocab):
- special_tokens = ['[UNK]', '[PAD]', '[SEP]', '[MASK]', '[CLS]']
- assert all(t in tf_vocab for t in special_tokens)
- counter = nlp.data.count_tokens(tf_vocab.keys())
- vocab = nlp.vocab.BERTVocab(counter, token_to_idx=tf_vocab)
- return vocab
-
-
-def get_hash(filename):
- sha1 = hashlib.sha1()
- with open(filename, 'rb') as f:
- while True:
- data = f.read(1048576)
- if not data:
- break
- sha1.update(data)
- return sha1.hexdigest(), str(sha1.hexdigest())[:8]
-
-
-def read_tf_checkpoint(path):
- """read tensorflow checkpoint"""
- from tensorflow.python import pywrap_tensorflow # pylint: disable=import-outside-toplevel
- tensors = {}
- reader = pywrap_tensorflow.NewCheckpointReader(path)
- var_to_shape_map = reader.get_variable_to_shape_map()
- for key in sorted(var_to_shape_map):
- tensor = reader.get_tensor(key)
- tensors[key] = tensor
- return tensors
-
-def profile(curr_step, start_step, end_step, profile_name='profile.json',
- early_exit=True):
- """profile the program between [start_step, end_step)."""
- if curr_step == start_step:
- mx.nd.waitall()
- mx.profiler.set_config(profile_memory=False, profile_symbolic=True,
- profile_imperative=True, filename=profile_name,
- aggregate_stats=True)
- mx.profiler.set_state('run')
- elif curr_step == end_step:
- mx.nd.waitall()
- mx.profiler.set_state('stop')
- logging.info(mx.profiler.dumps())
- mx.profiler.dump()
- if early_exit:
- sys.exit(0)
-
-def load_text_vocab(vocab_file):
- """Loads a vocabulary file into a dictionary."""
- vocab = collections.OrderedDict()
- index = 0
- with io.open(vocab_file, 'r') as reader:
- while True:
- token = reader.readline()
- if not token:
- break
- token = token.strip()
- vocab[token] = index
- index += 1
- return vocab
diff --git a/scripts/conversion_toolkits/README.md b/scripts/conversion_toolkits/README.md
new file mode 100644
index 0000000000..8437202287
--- /dev/null
+++ b/scripts/conversion_toolkits/README.md
@@ -0,0 +1,77 @@
+# Conversion Scripts
+
+In GluonNLP, we provide scripts to convert model checkpoints from other repositories to GluonNLP.
+
+At this stage, the model needs to be downloaded locally, and the conversion scripts accept only a file directory as the
+argument; URLs are not supported. Both TensorFlow fine-tuned models that can be loaded as TF1 Hub modules and TF2
+SavedModels are accepted, although in most cases TF2 SavedModels do not ship the masked language model parameters,
+so those parameters are not compared after conversion.
+
+The comparison step mentioned above is controlled by the flag `--test`, which checks that the Gluon model with the
+converted weights matches the original TensorFlow model within a maximum tolerance of 1e-3.
+In addition, all conversion scripts can run on a GPU by adding `--gpu 0`.
+
+For the RoBERTa, XLM-R and BART models, we rely on the master version of the [fairseq](https://github.com/pytorch/fairseq#requirements-and-installation) package, installed locally via `pip install git+https://github.com/pytorch/fairseq.git@master`.
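+
+To give a concrete feel for the flags described above, a single conversion with verification on GPU 0 might look
+like the sketch below (the directory name is only an example; the per-model scripts in the following sections show
+the full workflow, including the download step):
+
+```bash
+# Assumes the TF Hub archive has already been downloaded and extracted into google_en_uncased_bert_base/
+python3 convert_tf_hub_model.py --tf_hub_model_path google_en_uncased_bert_base --model_type bert --gpu 0 --test
+```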
+
+## Convert all models
+
+```bash
+bash convert_all.sh
+```
+
+### BERT
+Convert models from the [BERT collection](https://tfhub.dev/google/collections/bert/1) on TF Hub.
+
+You can use the script provided in [convert_bert.sh](convert_bert.sh).
+The following command gives you a rough idea of what it does.
+
+```bash
+bash convert_bert.sh
+```
+
+In the process, the configuration file `bert_config.json` (taken from the [official repo](https://github.com/google-research/bert#pre-trained-models))
+is copied into `${case}_bert_${model}/assets/`.
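+
+For reference, the steps performed for a single model roughly look like the sketch below (shown for the uncased base
+model; [convert_bert.sh](convert_bert.sh) runs the same loop over all variants):
+
+```bash
+hub_directory="google_en_uncased_bert_base"
+mkdir ${hub_directory}
+# download and unpack the TF Hub archive
+wget "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1?tf-hub-format=compressed" -O "${hub_directory}.tar.gz"
+tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
+# place the matching config next to the vocabulary assets, then convert and verify
+cp bert_base_config.json ${hub_directory}/assets/
+python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
+```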
+
+### ALBERT
+You can convert the ALBERT models with the following command:
+```bash
+bash convert_albert.sh
+```
+
+### ELECTRA
+The ELECTRA model is currently not available on TF Hub.
+Thus, you will need to clone the [electra repository](https://github.com/ZheyuYe/electra)
+and download the checkpoints; the parameters are converted from these local checkpoints.
+By running the following command, you can convert and verify the ELECTRA model with both the discriminator and the generator.
+
+Notice: please set `--electra_path` to the cloned repository path if you'd like to use `convert_electra.py` directly.
+
+```bash
+bash convert_electra.sh
+```
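+
+If you prefer to call the Python converter directly on an already downloaded and unzipped checkpoint,
+the invocation used inside [convert_electra.sh](convert_electra.sh) is roughly:
+
+```bash
+# electra_small is the unzipped checkpoint directory; electra is the cloned repository
+python3 convert_electra.py --tf_model_path electra_small --electra_path electra --model_size small --test
+```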
+
+### MobileBert
+```bash
+bash convert_mobilebert.sh
+```
+
+### RoBERTa
+```bash
+bash convert_roberta.sh
+```
+
+### XLM-R
+```bash
+bash convert_xlmr.sh
+```
+
+### BART
+```bash
+bash convert_bart.sh
+```
+
+### GPT-2
+```bash
+bash convert_gpt2.sh
+```
diff --git a/scripts/conversion_toolkits/bert_base_config.json b/scripts/conversion_toolkits/bert_base_config.json
new file mode 100644
index 0000000000..fca794a5f0
--- /dev/null
+++ b/scripts/conversion_toolkits/bert_base_config.json
@@ -0,0 +1,13 @@
+{
+ "attention_probs_dropout_prob": 0.1,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "max_position_embeddings": 512,
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "type_vocab_size": 2,
+ "vocab_size": 30522
+}
diff --git a/scripts/conversion_toolkits/bert_large_config.json b/scripts/conversion_toolkits/bert_large_config.json
new file mode 100644
index 0000000000..a7efa973d7
--- /dev/null
+++ b/scripts/conversion_toolkits/bert_large_config.json
@@ -0,0 +1,13 @@
+{
+ "attention_probs_dropout_prob": 0.1,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "intermediate_size": 4096,
+ "max_position_embeddings": 512,
+ "num_attention_heads": 16,
+ "num_hidden_layers": 24,
+ "type_vocab_size": 2,
+ "vocab_size": 30522
+}
diff --git a/scripts/conversion_toolkits/convert_albert.sh b/scripts/conversion_toolkits/convert_albert.sh
new file mode 100644
index 0000000000..69c37e7bd1
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_albert.sh
@@ -0,0 +1,11 @@
+python3 -m pip install tensorflow==1.15 --upgrade --user
+python3 -m pip install tensorflow_hub --upgrade --user
+export TF_FORCE_GPU_ALLOW_GROWTH="true"
+for model in base large xlarge xxlarge
+do
+ hub_directory="google_albert_${model}_v2"
+ mkdir ${hub_directory}
+ wget "https://tfhub.dev/google/albert_${model}/3?tf-hub-format=compressed" -O "${hub_directory}.tar.gz"
+ tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
+ python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type albert --test
+done
diff --git a/scripts/conversion_toolkits/convert_all.sh b/scripts/conversion_toolkits/convert_all.sh
new file mode 100644
index 0000000000..a38031e9e1
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_all.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+bash convert_bert.sh
+bash convert_albert.sh
+bash convert_electra.sh
+bash convert_mobilebert.sh
+bash convert_roberta.sh
+bash convert_xlmr.sh
+bash convert_bart.sh
+bash convert_gpt2.sh
diff --git a/scripts/conversion_toolkits/convert_bart.sh b/scripts/conversion_toolkits/convert_bart.sh
new file mode 100644
index 0000000000..ee6cd1b3ec
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_bart.sh
@@ -0,0 +1,8 @@
+python3 -m pip install git+https://github.com/pytorch/fairseq.git@master --upgrade --user
+for model in base large
+do
+ mkdir bart_${model}
+ wget "https://dl.fbaipublicfiles.com/fairseq/models/bart.${model}.tar.gz"
+ tar zxf bart.${model}.tar.gz --directory bart_${model}
+ python3 convert_fairseq_bart.py --fairseq_model_path bart_${model}/bart.${model} --test
+done
diff --git a/scripts/conversion_toolkits/convert_bert.sh b/scripts/conversion_toolkits/convert_bert.sh
new file mode 100644
index 0000000000..1fd3432265
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_bert.sh
@@ -0,0 +1,52 @@
+python3 -m pip install tensorflow==2.3.0 --upgrade --user
+python3 -m pip install tensorflow_hub --upgrade --user
+export TF_FORCE_GPU_ALLOW_GROWTH="true"
+
+# Conversion for English Models
+for model in base large
+do
+ for case in cased uncased
+ do
+ hub_directory="google_en_${case}_bert_${model}"
+ mkdir ${hub_directory}
+ if [ ${model} == base ];then
+ url="https://tfhub.dev/google/bert_${case}_L-12_H-768_A-12/1?tf-hub-format=compressed"
+ else
+ url="https://tfhub.dev/google/bert_${case}_L-24_H-1024_A-16/1?tf-hub-format=compressed"
+ fi
+ wget ${url} -O "${hub_directory}.tar.gz"
+ tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
+ cp bert_${model}_config.json ${hub_directory}/assets/
+ python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
+ done
+done
+
+# Conversion for Chinese Models
+url="https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/2?tf-hub-format=compressed"
+hub_directory="google_zh_bert_base"
+mkdir ${hub_directory}
+wget ${url} -O "${hub_directory}.tar.gz"
+tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
+cp bert_base_config.json ${hub_directory}/assets/
+python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
+
+# Conversion for Multi-lingual Models
+url="https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/2?tf-hub-format=compressed"
+hub_directory="google_multi_cased_bert_base"
+mkdir ${hub_directory}
+wget ${url} -O "${hub_directory}.tar.gz"
+tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
+cp bert_base_config.json ${hub_directory}/assets/
+python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
+
+# Conversion for Whole-word-masking Models
+for case in cased uncased
+do
+ hub_directory="google_en_${case}_bert_wwm_large"
+ mkdir ${hub_directory}
+ url="https://tfhub.dev/tensorflow/bert_en_wwm_${case}_L-24_H-1024_A-16/2?tf-hub-format=compressed"
+ wget ${url} -O "${hub_directory}.tar.gz"
+ tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
+ cp bert_large_config.json ${hub_directory}/assets/
+ python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
+done
diff --git a/scripts/conversion_toolkits/convert_electra.py b/scripts/conversion_toolkits/convert_electra.py
new file mode 100644
index 0000000000..6d60f0e37b
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_electra.py
@@ -0,0 +1,439 @@
+import os
+import re
+import sys
+import shutil
+import logging
+import argparse
+
+import mxnet as mx
+import numpy as np
+from numpy.testing import assert_allclose
+
+from gluonnlp.utils.misc import naming_convention, logging_config
+from gluonnlp.data.tokenizers import HuggingFaceWordPieceTokenizer
+from gluonnlp.models.electra import ElectraModel, \
+ ElectraGenerator, ElectraDiscriminator, ElectraForPretrain, get_generator_cfg
+import tensorflow.compat.v1 as tf
+
+tf.disable_eager_execution()
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
+
+mx.npx.set_np()
+np.random.seed(1234)
+mx.npx.random.seed(1234)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Convert the TF Electra Model to Gluon')
+ parser.add_argument('--tf_model_path', type=str,
+ help='Directory of the model downloaded from TF hub.')
+ parser.add_argument('--electra_path', type=str,
+ help='Path to the github repository of electra, you may clone it by '
+ '`git clone https://github.com/ZheyuYe/electra.git`.')
+ parser.add_argument('--model_size', type=str, choices=['small', 'base', 'large'],
+ help='Size of the Electra model')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='directory path to save the converted Electra model.')
+    parser.add_argument('--gpu', type=int, default=None,
+                        help='A single gpu to run mxnet, e.g. 0 or 1. The default device is cpu.')
+ parser.add_argument('--test', action='store_true')
+ args = parser.parse_args()
+ return args
+
+
+def read_tf_checkpoint(path):
+ """read tensorflow checkpoint"""
+ from tensorflow.python import pywrap_tensorflow
+ tensors = {}
+ reader = pywrap_tensorflow.NewCheckpointReader(path)
+ var_to_shape_map = reader.get_variable_to_shape_map()
+ for key in sorted(var_to_shape_map):
+ tensor = reader.get_tensor(key)
+ tensors[key] = tensor
+ return tensors
+
+
+def get_dict_config(model_size, electra_path):
+ sys.path.append(electra_path)
+ electra_dir = os.path.abspath(os.path.join(os.path.dirname(electra_path), os.path.pardir))
+ sys.path.append(electra_dir)
+ from electra.util.training_utils import get_bert_config
+ from electra.configure_pretraining import PretrainingConfig
+
+ config = PretrainingConfig(model_name='', data_dir='', model_size=model_size)
+ bert_config = get_bert_config(config)
+    # We do not store the full configuration of the electra generator, only its scale sizes.
+ config_dict = bert_config.to_dict()
+ config_dict.update(
+ {'embedding_size': config.embedding_size,
+ 'generator_hidden_size': config.generator_hidden_size,
+ 'generator_layers': config.generator_layers,
+ })
+ return config_dict
+
+
+def convert_tf_config(config_dict, vocab_size):
+ """Convert the config file"""
+
+ assert vocab_size == config_dict['vocab_size']
+ cfg = ElectraModel.get_cfg().clone()
+ cfg.defrost()
+ cfg.MODEL.vocab_size = vocab_size
+ cfg.MODEL.units = config_dict['hidden_size']
+ cfg.MODEL.embed_size = config_dict['embedding_size']
+ cfg.MODEL.hidden_size = config_dict['intermediate_size']
+ cfg.MODEL.max_length = config_dict['max_position_embeddings']
+ cfg.MODEL.num_heads = config_dict['num_attention_heads']
+ cfg.MODEL.num_layers = config_dict['num_hidden_layers']
+ cfg.MODEL.pos_embed_type = 'learned'
+ cfg.MODEL.activation = config_dict['hidden_act']
+ cfg.MODEL.layer_norm_eps = 1E-12
+ cfg.MODEL.num_token_types = config_dict['type_vocab_size']
+ cfg.MODEL.hidden_dropout_prob = float(config_dict['hidden_dropout_prob'])
+ cfg.MODEL.attention_dropout_prob = float(config_dict['attention_probs_dropout_prob'])
+ cfg.MODEL.dtype = 'float32'
+ cfg.MODEL.generator_layers_scale = config_dict['generator_layers']
+ cfg.MODEL.generator_units_scale = config_dict['generator_hidden_size']
+ cfg.INITIALIZER.weight = ['truncnorm', 0,
+ config_dict['initializer_range']] # TruncNorm(0, 0.02)
+ cfg.INITIALIZER.bias = ['zeros']
+ cfg.VERSION = 1
+ cfg.freeze()
+ return cfg
+
+
+def convert_tf_assets(tf_assets_dir, model_size, electra_path):
+ """Convert the assets file including config, vocab and tokenizer model"""
+ file_names = os.listdir(tf_assets_dir)
+ vocab_path = None
+ for ele in file_names:
+ if ele.endswith('.txt'):
+ assert vocab_path is None
+ vocab_path = ele
+ assert vocab_path is not None
+
+ if vocab_path:
+ vocab_path = os.path.join(tf_assets_dir, vocab_path)
+ vocab_size = len(open(vocab_path, 'r', encoding='utf-8').readlines())
+ config_dict = get_dict_config(model_size, electra_path)
+ cfg = convert_tf_config(config_dict, vocab_size)
+ return cfg, vocab_path
+
+
+CONVERT_MAP = [
+ ('backbone_model.discriminator_predictions/dense_1', 'rtd_encoder.2'),
+ ('backbone_model.discriminator_predictions/dense', 'rtd_encoder.0'),
+ ('backbone_model.generator_predictions/dense', 'mlm_decoder.0'),
+ ('backbone_model.generator_predictions/LayerNorm', 'mlm_decoder.2'),
+ ('backbone_model.generator_predictions/output_bias', 'mlm_decoder.3.bias'),
+ ('electra/', ''),
+ ('generator/', ''),
+ ('embeddings_project', 'embed_factorized_proj'),
+ ('embeddings/word_embeddings', 'word_embed.weight'),
+ ('embeddings/token_type_embeddings', 'token_type_embed.weight'),
+ ('embeddings/position_embeddings', 'token_pos_embed._embed.weight'),
+ ('layer_', 'all_encoder_layers.'),
+ ('embeddings/LayerNorm', 'embed_layer_norm'),
+ ('attention/output/LayerNorm', 'layer_norm'),
+ ('attention/output/dense', 'attention_proj'),
+ ('output/LayerNorm', 'ffn.layer_norm'),
+ ('LayerNorm', 'layer_norm'),
+ ('intermediate/dense', 'ffn.ffn_1'),
+ ('output/dense', 'ffn.ffn_2'),
+ ('output/', ''),
+ ('kernel', 'weight'),
+ ('/', '.'),
+]
+
+
+def get_name_map(tf_names, convert_type='backbone'):
+ """
+    Get the mapping between tensorflow tensor names and mxnet parameter names.
+    The mapping CONVERT_MAP above is suitable for Bert- and Albert-style models,
+    but there is no guarantee that it matches other tf models that use
+    special variable_scope names (tensorflow) or prefixes (mxnet).
+
+    Redefining the mapping is encouraged when adapting customized models.
+
+ Parameters
+ ----------
+ tf_names
+ the parameters names of tensorflow model
+ convert_type
+ choices=['backbone', 'disc', 'gen']
+ Returns
+ -------
+ A dictionary with the following format:
+ {tf_names : mx_names}
+ """
+ name_map = {}
+ for source_name in tf_names:
+ target_name = source_name
+ if convert_type == 'backbone':
+ if 'electra' not in source_name:
+ continue
+ elif convert_type == 'disc':
+ target_name = 'backbone_model.' + target_name
+ if 'generator' in source_name:
+ continue
+ elif convert_type == 'gen':
+ target_name = 'backbone_model.' + target_name
+ if 'generator' not in source_name:
+ continue
+ else:
+ raise NotImplementedError
+ # skip the qkv weights
+ if 'self/' in source_name:
+ name_map[source_name] = None
+ continue
+ for old, new in CONVERT_MAP:
+ target_name = target_name.replace(old, new)
+ name_map[source_name] = target_name
+ return name_map
+
+
+def convert_tf_model(model_dir, save_dir, test_conversion, model_size, gpu, electra_path):
+ ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir)
+
+ cfg, vocab_path = convert_tf_assets(model_dir, model_size, electra_path)
+ with open(os.path.join(save_dir, 'model.yml'), 'w') as of:
+ of.write(cfg.dump())
+ new_vocab = HuggingFaceWordPieceTokenizer(
+ vocab_file=vocab_path,
+ unk_token='[UNK]',
+ pad_token='[PAD]',
+ cls_token='[CLS]',
+ sep_token='[SEP]',
+ mask_token='[MASK]',
+ lowercase=True).vocab
+ new_vocab.save(os.path.join(save_dir, 'vocab.json'))
+
+ # test input data
+ batch_size = 3
+ seq_length = 32
+ num_mask = 5
+ input_ids = np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length))
+ valid_length = np.random.randint(seq_length // 2, seq_length, (batch_size,))
+ input_mask = np.broadcast_to(np.arange(seq_length).reshape(1, -1), (batch_size, seq_length)) \
+ < np.expand_dims(valid_length, 1)
+ segment_ids = np.random.randint(0, 2, (batch_size, seq_length))
+ mlm_positions = np.random.randint(0, seq_length // 2, (batch_size, num_mask))
+
+ tf_input_ids = tf.constant(input_ids, dtype=np.int32)
+ tf_input_mask = tf.constant(input_mask, dtype=np.int32)
+ tf_segment_ids = tf.constant(segment_ids, dtype=np.int32)
+
+ init_checkpoint = os.path.join(model_dir, 'electra_{}'.format(model_size))
+ tf_params = read_tf_checkpoint(init_checkpoint)
+ # get parameter names for tensorflow with unused parameters filtered out.
+ tf_names = sorted(tf_params.keys())
+ tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names)
+ tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names)
+ tf_names = filter(lambda name: name != 'global_step', tf_names)
+ tf_names = filter(lambda name: name != 'generator_predictions/temperature', tf_names)
+ tf_names = list(tf_names)
+
+ # reload the electra module for this local scope
+ sys.path.append(electra_path)
+ electra_dir = os.path.abspath(os.path.join(os.path.dirname(electra_path), os.path.pardir))
+ sys.path.append(electra_dir)
+ from electra.util.training_utils import get_bert_config
+ from electra.configure_pretraining import PretrainingConfig
+ from electra.model import modeling
+
+ config = PretrainingConfig(model_name='', data_dir='', model_size=model_size)
+ bert_config = get_bert_config(config)
+ bert_model = modeling.BertModel(
+ bert_config=bert_config,
+ is_training=False,
+ input_ids=tf_input_ids,
+ input_mask=tf_input_mask,
+ token_type_ids=tf_segment_ids,
+ use_one_hot_embeddings=False,
+ embedding_size=cfg.MODEL.embed_size)
+ tvars = tf.trainable_variables()
+ assignment_map, _ = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+ tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+ with tf.Session() as sess:
+ sess.run(tf.global_variables_initializer())
+        # the parameter names end with ':0', e.g.
+        # 'electra/embeddings/word_embeddings:0'
+ backbone_params = {v.name.split(":")[0]: v.read_value() for v in tvars}
+ backbone_params = sess.run(backbone_params)
+ tf_token_outputs_np = {
+ 'pooled_output': sess.run(bert_model.get_pooled_output()),
+ 'sequence_output': sess.run(bert_model.get_sequence_output()),
+ }
+
+    # The following part only ensures that the parameters in the backbone model are valid
+ for k in backbone_params:
+ assert_allclose(tf_params[k], backbone_params[k])
+
+ # Build gluon model and initialize
+ gluon_model = ElectraModel.from_cfg(cfg)
+ gluon_model.initialize(ctx=ctx)
+ gluon_model.hybridize()
+
+ gluon_disc_model = ElectraDiscriminator(cfg)
+ gluon_disc_model.initialize(ctx=ctx)
+ gluon_disc_model.hybridize()
+
+ gen_cfg = get_generator_cfg(cfg)
+ disc_backbone = gluon_disc_model.backbone_model
+ gluon_gen_model = ElectraGenerator(gen_cfg)
+ gluon_gen_model.tie_embeddings(disc_backbone.word_embed.collect_params(),
+ disc_backbone.token_type_embed.collect_params(),
+ disc_backbone.token_pos_embed.collect_params(),
+ disc_backbone.embed_layer_norm.collect_params())
+ gluon_gen_model.initialize(ctx=ctx)
+ gluon_gen_model.hybridize()
+
+    # prepare test data
+ mx_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
+ mx_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
+ mx_token_types = mx.np.array(segment_ids, dtype=np.int32, ctx=ctx)
+ mx_masked_positions = mx.np.array(mlm_positions, dtype=np.int32, ctx=ctx)
+
+ for convert_type in ['backbone', 'disc', 'gen']:
+ name_map = get_name_map(tf_names, convert_type=convert_type)
+ # go through the gluon model to infer the shape of parameters
+
+ if convert_type == 'backbone':
+ model = gluon_model
+ contextual_embedding, pooled_output = model(
+ mx_input_ids, mx_token_types, mx_valid_length)
+ elif convert_type == 'disc':
+ model = gluon_disc_model
+ contextual_embedding, pooled_output, rtd_scores = \
+ model(mx_input_ids, mx_token_types, mx_valid_length)
+ elif convert_type == 'gen':
+ model = gluon_gen_model
+ contextual_embedding, pooled_output, mlm_scores = \
+ model(mx_input_ids, mx_token_types, mx_valid_length, mx_masked_positions)
+
+ # replace tensorflow parameter names with gluon parameter names
+ mx_params = model.collect_params()
+ all_keys = set(mx_params.keys())
+ for (src_name, dst_name) in name_map.items():
+ tf_param_val = tf_params[src_name]
+ if dst_name is None:
+ continue
+ all_keys.remove(dst_name)
+ if src_name.endswith('kernel'):
+ mx_params[dst_name].set_data(tf_param_val.T)
+ else:
+ mx_params[dst_name].set_data(tf_param_val)
+
+        # Merge query/kernel, key/kernel, value/kernel into the attn_qkv.weight of each layer
+ def convert_qkv_weights(tf_prefix, mx_prefix):
+ """
+ To convert the qkv weights with different prefix.
+
+ In tensorflow framework, the prefix of query/key/value for the albert model is
+ 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/kernel',
+ and that for the bert model is 'bert/encoder/layer_{}/attention/self/key/bias'.
+ In gluonnlp framework, the prefix is slightly different as
+ 'encoder.all_encoder_groups.0.attn_qkv.weight' for albert model and
+ 'encoder.all_layers.{}.attn_qkv.weight' for bert model, as the
+ curly braces {} can be filled with the layer number.
+ """
+ # Merge query_weight, key_weight, value_weight to mx_params
+ query_weight = tf_params[
+ '{}/query/kernel'.format(tf_prefix)]
+ key_weight = tf_params[
+ '{}/key/kernel'.format(tf_prefix)]
+ value_weight = tf_params[
+ '{}/value/kernel'.format(tf_prefix)]
+ mx_params['{}.attn_qkv.weight'.format(mx_prefix)].set_data(
+ np.concatenate([query_weight, key_weight, value_weight], axis=1).T)
+ # Merge query_bias, key_bias, value_bias to mx_params
+ query_bias = tf_params[
+ '{}/query/bias'.format(tf_prefix)]
+ key_bias = tf_params[
+ '{}/key/bias'.format(tf_prefix)]
+ value_bias = tf_params[
+ '{}/value/bias'.format(tf_prefix)]
+ mx_params['{}.attn_qkv.bias'.format(mx_prefix)].set_data(
+ np.concatenate([query_bias, key_bias, value_bias], axis=0))
+
+ # The below parameters of the generator are already initialized in the
+ # discriminator, no need to reload.
+ disc_embed_params = set(['backbone_model.embed_layer_norm.beta',
+ 'backbone_model.embed_layer_norm.gamma',
+ 'backbone_model.token_pos_embed._embed.weight',
+ 'backbone_model.token_type_embed.weight',
+ 'mlm_decoder.3.weight',
+ 'backbone_model.word_embed.weight'])
+
+ for key in all_keys:
+ if convert_type == 'gen' and key in disc_embed_params:
+ continue
+ assert re.match(r'^(backbone_model\.){0,1}encoder\.all_encoder_layers\.[\d]+\.attn_qkv\.(weight|bias)$',
+ key) is not None, 'Parameter key {} mismatch'.format(key)
+
+ tf_prefix = None
+ for layer_id in range(cfg.MODEL.num_layers):
+ mx_prefix = 'encoder.all_encoder_layers.{}'.format(layer_id)
+ if convert_type == 'gen':
+ mx_prefix = 'backbone_model.' + mx_prefix
+ tf_prefix = 'generator/encoder/layer_{}/attention/self'.format(layer_id)
+ elif convert_type == 'disc':
+ mx_prefix = 'backbone_model.' + mx_prefix
+ tf_prefix = 'electra/encoder/layer_{}/attention/self'.format(layer_id)
+ else:
+ tf_prefix = 'electra/encoder/layer_{}/attention/self'.format(layer_id)
+
+ convert_qkv_weights(tf_prefix, mx_prefix)
+
+ if convert_type == 'backbone':
+ # test conversion results for backbone model
+ if test_conversion:
+ tf_contextual_embedding = tf_token_outputs_np['sequence_output']
+ tf_pooled_output = tf_token_outputs_np['pooled_output']
+ contextual_embedding, pooled_output = model(
+ mx_input_ids, mx_token_types, mx_valid_length)
+ assert_allclose(pooled_output.asnumpy(), tf_pooled_output, 1E-3, 1E-3)
+ for i in range(batch_size):
+ ele_valid_length = valid_length[i]
+ assert_allclose(contextual_embedding[i, :ele_valid_length, :].asnumpy(),
+ tf_contextual_embedding[i, :ele_valid_length, :], 1E-3, 1E-3)
+ model.save_parameters(os.path.join(save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the backbone model in {} to {}/{}'.format(model_dir,
+ save_dir, 'model.params'))
+ elif convert_type == 'disc':
+ model.save_parameters(os.path.join(save_dir, 'disc_model.params'), deduplicate=True)
+ logging.info(
+ 'Convert the discriminator model in {} to {}/{}'.format(model_dir, save_dir, 'disc_model.params'))
+ elif convert_type == 'gen':
+ model.save_parameters(os.path.join(save_dir, 'gen_model.params'), deduplicate=True)
+ logging.info('Convert the generator model in {} to {}/{}'.format(model_dir,
+ save_dir, 'gen_model.params'))
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
+
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(save_dir, old_name)
+ old_path = os.path.join(save_dir, old_name)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(save_dir, new_name, long_hash, file_size))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ save_dir = args.save_dir if args.save_dir is not None else os.path.basename(
+ args.tf_model_path) + '_gluon'
+ convert_tf_model(
+ args.tf_model_path,
+ save_dir,
+ args.test,
+ args.model_size,
+ args.gpu,
+ args.electra_path)
diff --git a/scripts/conversion_toolkits/convert_electra.sh b/scripts/conversion_toolkits/convert_electra.sh
new file mode 100644
index 0000000000..93c452329c
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_electra.sh
@@ -0,0 +1,12 @@
+python3 -m pip install tensorflow==1.15 --upgrade --user
+export TF_FORCE_GPU_ALLOW_GROWTH="true"
+git clone https://github.com/ZheyuYe/electra.git
+cd electra
+git checkout 923179410471f9e1820b3f0771c239e1752e4e18
+cd ..
+for model in small base large
+do
+ wget https://storage.googleapis.com/electra-data/electra_${model}.zip
+ unzip electra_${model}.zip
+ python3 convert_electra.py --tf_model_path electra_${model} --electra_path electra --model_size ${model} --test
+done
diff --git a/scripts/conversion_toolkits/convert_fairseq_bart.py b/scripts/conversion_toolkits/convert_fairseq_bart.py
new file mode 100644
index 0000000000..4c78fff23c
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_fairseq_bart.py
@@ -0,0 +1,321 @@
+import os
+import shutil
+import logging
+import argparse
+
+import mxnet as mx
+import numpy as np
+from numpy.testing import assert_allclose
+
+import torch
+from fairseq.models.bart import BARTModel as fairseq_BARTModel
+from gluonnlp.utils.misc import sha1sum, logging_config, naming_convention
+from gluonnlp.models.bart import BartModel
+from convert_fairseq_roberta import convert_vocab
+
+mx.npx.set_np()
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Convert the fairseq BART Model to Gluon.')
+ parser.add_argument('--fairseq_model_path', type=str, required=True,
+ help='Directory of the fairseq BART model.')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='Directory path to save the converted BART model.')
+    parser.add_argument('--gpu', type=int, default=None,
+                        help='The single gpu to run mxnet (e.g. --gpu 0); the default device is cpu.')
+ parser.add_argument('--test', action='store_true',
+ help='Whether to test the conversion.')
+ return parser.parse_args()
+
+
+def convert_config(fairseq_cfg, vocab_size, cfg):
+ print('converting config')
+ cfg.defrost()
+ # Config for the bart base model
+ cfg.MODEL.vocab_size = vocab_size
+ cfg.MODEL.max_src_length = fairseq_cfg.max_source_positions
+ cfg.MODEL.max_tgt_length = fairseq_cfg.max_target_positions
+ cfg.MODEL.pos_embed_type = 'learned'
+ cfg.MODEL.shared_embed = fairseq_cfg.share_all_embeddings
+ cfg.MODEL.scale_embed = not fairseq_cfg.no_scale_embedding
+ cfg.MODEL.tie_weights = fairseq_cfg.share_decoder_input_output_embed
+ cfg.MODEL.data_norm = fairseq_cfg.layernorm_embedding
+ cfg.MODEL.pooler_activation = fairseq_cfg.pooler_activation_fn
+ cfg.MODEL.layer_norm_eps = 1E-5
+ cfg.MODEL.dropout = fairseq_cfg.dropout
+ cfg.MODEL.activation_dropout = fairseq_cfg.activation_dropout
+ cfg.MODEL.attention_dropout = fairseq_cfg.attention_dropout
+ cfg.MODEL.dtype = 'float32'
+
+ # Parameters for the encoder
+ cfg.MODEL.ENCODER.pre_norm = fairseq_cfg.encoder_normalize_before
+ cfg.MODEL.ENCODER.num_layers = fairseq_cfg.encoder_layers
+ cfg.MODEL.ENCODER.units = fairseq_cfg.encoder_embed_dim
+ cfg.MODEL.ENCODER.num_heads = fairseq_cfg.encoder_attention_heads
+ cfg.MODEL.ENCODER.hidden_size = fairseq_cfg.encoder_ffn_embed_dim
+ cfg.MODEL.ENCODER.activation = fairseq_cfg.activation_fn
+
+ # Parameters for the decoder
+ cfg.MODEL.DECODER.pre_norm = fairseq_cfg.decoder_normalize_before
+ cfg.MODEL.DECODER.num_layers = fairseq_cfg.decoder_layers
+ cfg.MODEL.DECODER.units = fairseq_cfg.decoder_embed_dim
+ cfg.MODEL.DECODER.num_heads = fairseq_cfg.decoder_attention_heads
+ cfg.MODEL.DECODER.hidden_size = fairseq_cfg.decoder_ffn_embed_dim
+ cfg.MODEL.DECODER.activation = fairseq_cfg.activation_fn
+
+ cfg.INITIALIZER.embed = ['xavier', 'gaussian', 'in', 1.0]
+ cfg.INITIALIZER.weight = ['xavier', 'uniform', 'avg', 1.0]
+ cfg.INITIALIZER.bias = ['zeros']
+ cfg.VERSION = 1
+ cfg.freeze()
+ return cfg
+
+
+def convert_params(fairseq_model,
+ gluon_cfg,
+ ctx):
+ fairseq_params = fairseq_model.state_dict()
+    # build the gluon BART model whose parameters will be populated from the fairseq checkpoint
+ gluon_model = BartModel.from_cfg(gluon_cfg, use_pooler=False)
+ gluon_model.initialize(ctx=ctx)
+ gluon_model.hybridize()
+ gluon_params = gluon_model.collect_params()
+ all_keys = set(gluon_params.keys())
+
+ def convert_attention(num_layers,
+ fairseq_prefix,
+ gluon_prefix,
+ fairseq_attn_prefix='self_attn',
+ gluon_attn_prefix='attn_qkv'):
+ for layer_id in range(num_layers):
+ fs_atten_prefix = \
+ '{}.layers.{}.{}.' \
+ .format(fairseq_prefix, layer_id, fairseq_attn_prefix)
+ fs_q_weight = fairseq_params[fs_atten_prefix + 'q_proj.weight'].cpu().numpy()
+ fs_k_weight = fairseq_params[fs_atten_prefix + 'k_proj.weight'].cpu().numpy()
+ fs_v_weight = fairseq_params[fs_atten_prefix + 'v_proj.weight'].cpu().numpy()
+ fs_q_bias = fairseq_params[fs_atten_prefix + 'q_proj.bias'].cpu().numpy()
+ fs_k_bias = fairseq_params[fs_atten_prefix + 'k_proj.bias'].cpu().numpy()
+ fs_v_bias = fairseq_params[fs_atten_prefix + 'v_proj.bias'].cpu().numpy()
+ gl_qkv_prefix = \
+ '{}.layers.{}.{}.' \
+ .format(gluon_prefix, layer_id, gluon_attn_prefix)
+ gl_qkv_weight = gluon_params[gl_qkv_prefix + 'weight']
+ gl_qkv_bias = gluon_params[gl_qkv_prefix + 'bias']
+ all_keys.remove(gl_qkv_prefix + 'weight')
+ all_keys.remove(gl_qkv_prefix + 'bias')
+ gl_qkv_weight.set_data(
+ np.concatenate([fs_q_weight, fs_k_weight, fs_v_weight], axis=0))
+ gl_qkv_bias.set_data(
+ np.concatenate([fs_q_bias, fs_k_bias, fs_v_bias], axis=0))
+
+ def convert_ffn(num_layers, fairseq_prefix, gluon_prefix):
+        # convert the feed-forward layers for the given encoder/decoder prefix
+ for layer_id in range(num_layers):
+ for k, v in [
+ ('fc1.weight', 'ffn.ffn_1.weight'),
+ ('fc1.bias', 'ffn.ffn_1.bias'),
+ ('fc2.weight', 'ffn.ffn_2.weight'),
+ ('fc2.bias', 'ffn.ffn_2.bias'),
+ ('final_layer_norm.weight', 'ffn.layer_norm.gamma'),
+ ('final_layer_norm.bias', 'ffn.layer_norm.beta')
+ ]:
+ fs_name = '{}.layers.{}.{}' \
+ .format(fairseq_prefix, layer_id, k)
+ gl_name = '{}.layers.{}.{}' \
+ .format(gluon_prefix, layer_id, v)
+ all_keys.remove(gl_name)
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+
+ print('converting embedding params')
+ padding_idx = fairseq_model.task.dictionary.pad_index
+ for fs_name, gl_name in [
+ ('model.encoder.embed_tokens.weight', 'src_embed_layer.weight'),
+ ('model.encoder.embed_positions.weight', 'src_pos_embed_layer._embed.weight'),
+ ('model.encoder.layernorm_embedding.weight', 'encoder.ln_data.gamma'),
+ ('model.encoder.layernorm_embedding.bias', 'encoder.ln_data.beta'),
+ ('model.decoder.embed_tokens.weight', 'tgt_embed_layer.weight'),
+ ('model.decoder.embed_positions.weight', 'tgt_pos_embed_layer._embed.weight'),
+ ('model.decoder.layernorm_embedding.weight', 'decoder.ln_data.gamma'),
+ ('model.decoder.layernorm_embedding.bias', 'decoder.ln_data.beta'),
+ # final projection in decoder
+ ('model.decoder.output_projection.weight', 'tgt_final_layer.weight'),
+ ]:
+ all_keys.remove(gl_name)
+ if 'embed_positions' in fs_name:
+ # position embed weight
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy()[padding_idx + 1:, :])
+ else:
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+
+ print('converting encoder params')
+ encoder_num_layers = gluon_cfg.MODEL.ENCODER.num_layers
+ convert_attention(encoder_num_layers, 'model.encoder', 'encoder')
+ convert_ffn(encoder_num_layers, 'model.encoder', 'encoder')
+ for layer_id in range(encoder_num_layers):
+ for k, v in [
+ ('self_attn.out_proj.weight', 'attention_proj.weight'),
+ ('self_attn.out_proj.bias', 'attention_proj.bias'),
+ ('self_attn_layer_norm.weight', 'layer_norm.gamma'),
+ ('self_attn_layer_norm.bias', 'layer_norm.beta'),
+ ]:
+ fs_name = 'model.encoder.layers.{}.{}' \
+ .format(layer_id, k)
+ gl_name = 'encoder.layers.{}.{}' \
+ .format(layer_id, v)
+ all_keys.remove(gl_name)
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+
+ print('converting decoder params')
+ decoder_num_layers = gluon_cfg.MODEL.DECODER.num_layers
+ convert_attention(decoder_num_layers, 'model.decoder', 'decoder',
+ gluon_attn_prefix='attn_in_qkv')
+ convert_ffn(decoder_num_layers, 'model.decoder', 'decoder')
+
+ for layer_id in range(decoder_num_layers):
+ for k, v in [
+ ('self_attn.out_proj.weight', 'proj_in.weight'),
+ ('self_attn.out_proj.bias', 'proj_in.bias'),
+ ('self_attn_layer_norm.weight', 'ln_in.gamma'),
+ ('self_attn_layer_norm.bias', 'ln_in.beta'),
+ ('encoder_attn.out_proj.weight', 'proj_inter.weight'),
+ ('encoder_attn.out_proj.bias', 'proj_inter.bias'),
+ ('encoder_attn_layer_norm.weight', 'ln_inter.gamma'),
+ ('encoder_attn_layer_norm.bias', 'ln_inter.beta'),
+ ('encoder_attn.q_proj.weight', 'attn_inter_q.weight'),
+ ('encoder_attn.q_proj.bias', 'attn_inter_q.bias'),
+ ('encoder_attn.k_proj.weight', 'attn_inter_k.weight'),
+ ('encoder_attn.k_proj.bias', 'attn_inter_k.bias'),
+ ('encoder_attn.v_proj.weight', 'attn_inter_v.weight'),
+ ('encoder_attn.v_proj.bias', 'attn_inter_v.bias'),
+
+ ]:
+ fs_name = 'model.decoder.layers.{}.{}' \
+ .format(layer_id, k)
+ gl_name = 'decoder.layers.{}.{}' \
+ .format(layer_id, v)
+ all_keys.remove(gl_name)
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+
+    assert len(all_keys) == 0, 'parameters missing from the fairseq checkpoint'
+
+    # check parameter sharing when share_decoder_input_output_embed is true
+ assert np.array_equal(
+ fairseq_params['model.decoder.embed_tokens.weight'].cpu().numpy(),
+ fairseq_params['model.decoder.output_projection.weight'].cpu().numpy()
+ )
+ return gluon_model
+
+
+def test_model(fairseq_model, gluon_model, gpu):
+ print('testing model')
+ ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
+ batch_size = 3
+ seq_length = 32
+ vocab_size = len(fairseq_model.task.dictionary)
+ padding_id = fairseq_model.model.decoder.padding_idx
+ input_ids = np.random.randint( # skip padding_id
+ padding_id + 1,
+ vocab_size,
+ (batch_size, seq_length)
+ )
+ valid_length = np.random.randint(
+ seq_length // 2,
+ seq_length,
+ (batch_size,)
+ )
+
+ for i in range(batch_size): # add padding, for fairseq padding mask
+ input_ids[i, valid_length[i]:] = padding_id
+
+ gl_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
+ gl_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
+ gl_dec_out = \
+ gluon_model(gl_input_ids, gl_valid_length, gl_input_ids, gl_valid_length)
+
+ fs_input_ids = torch.from_numpy(input_ids).cuda(gpu)
+ fairseq_model.model.eval()
+ fs_dec_out, fs_extra = \
+ fairseq_model.model.cuda(gpu)(
+ fs_input_ids,
+ valid_length,
+ fs_input_ids,
+ return_all_hiddens=True)
+
+ # checking decoder output
+ gl_dec_out = gl_dec_out.asnumpy()
+ fs_dec_out = fs_dec_out.detach().cpu().numpy()
+ for j in range(batch_size):
+ assert_allclose(
+ gl_dec_out[j, :valid_length[j], :],
+ fs_dec_out[j, :valid_length[j], :],
+ 1E-3,
+ 1E-3
+ )
+
+
+def rename(save_dir):
+ """Rename converted files with hash"""
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ old_path = os.path.join(save_dir, old_name)
+ long_hash = sha1sum(old_path)
+ file_prefix, file_sufix = old_name.split('.')
+ new_name = '{file_prefix}-{short_hash}.{file_sufix}'.format(
+ file_prefix=file_prefix,
+ short_hash=long_hash[:8],
+ file_sufix=file_sufix)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{} {} {}'.format(new_path, long_hash, file_size))
+
+
+def convert_fairseq_model(args):
+ if not args.save_dir:
+ args.save_dir = os.path.basename(args.fairseq_model_path) + '_gluon'
+ if not os.path.exists(args.save_dir):
+ os.makedirs(args.save_dir)
+
+ fairseq_bart = fairseq_BARTModel.from_pretrained(args.fairseq_model_path,
+ checkpoint_file='model.pt')
+ vocab_size = convert_vocab(args, fairseq_bart)
+ gluon_cfg = convert_config(fairseq_bart.args, vocab_size,
+ BartModel.get_cfg().clone())
+ with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
+ of.write(gluon_cfg.dump())
+
+ ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
+ gluon_bart = convert_params(fairseq_bart,
+ gluon_cfg,
+ ctx)
+ if args.test:
+ test_model(fairseq_bart, gluon_bart, args.gpu)
+
+ gluon_bart.save_parameters(os.path.join(args.save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the BART MLM model in {} to {}'.
+ format(os.path.join(args.fairseq_model_path, 'model.pt'),
+ os.path.join(args.save_dir, 'model.params')))
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
+ old_names = os.listdir(args.save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(args.save_dir, old_name)
+ old_path = os.path.join(args.save_dir, old_name)
+ new_path = os.path.join(args.save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(args.save_dir, new_name, long_hash, file_size))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ convert_fairseq_model(args)
diff --git a/scripts/conversion_toolkits/convert_fairseq_roberta.py b/scripts/conversion_toolkits/convert_fairseq_roberta.py
new file mode 100644
index 0000000000..bcdac44436
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_fairseq_roberta.py
@@ -0,0 +1,387 @@
+import os
+import re
+import sys
+import json
+import shutil
+import logging
+import argparse
+
+import mxnet as mx
+import numpy as np
+from numpy.testing import assert_allclose
+
+import torch
+from gluonnlp.data.vocab import Vocab as gluon_Vocab
+from gluonnlp.utils.misc import sha1sum, logging_config, naming_convention
+from fairseq.models.roberta import RobertaModel as fairseq_RobertaModel
+from gluonnlp.models.roberta import RobertaModel, RobertaForMLM
+from gluonnlp.data.tokenizers import HuggingFaceByteBPETokenizer
+
+mx.npx.set_np()
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Convert the fairseq RoBERTa Model to Gluon.')
+ parser.add_argument('--fairseq_model_path', type=str, required=True,
+ help='Directory of the fairseq RoBERTa model.')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='Directory path to save the converted RoBERTa model.')
+    parser.add_argument('--gpu', type=int, default=None,
+                        help='The single gpu to run mxnet (e.g. --gpu 0); the default device is cpu.')
+ parser.add_argument('--test', action='store_true',
+ help='Whether to test the conversion.')
+ return parser.parse_args()
+
+
+def convert_vocab(args, fairseq_model):
+ print('converting vocab')
+ fairseq_dict_path = os.path.join(args.fairseq_model_path, 'dict.txt')
+ merges_save_path = os.path.join(args.save_dir, 'gpt2.merges')
+ vocab_save_path = os.path.join(args.save_dir, 'gpt2.vocab')
+ fairseq_vocab = fairseq_model.task.dictionary
+ # bos_word attr missing in fairseq_vocab
+ fairseq_vocab.bos_word = fairseq_vocab[fairseq_vocab.bos_index]
+
+ assert os.path.exists(fairseq_dict_path), \
+ '{} not found'.format(fairseq_dict_path)
+ from mxnet.gluon.utils import download
+ temp_vocab_file = download(
+ 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json')
+ temp_merges_file = download(
+ 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe')
+    # copy the merges file directly
+ shutil.copy(temp_merges_file, merges_save_path)
+
+ # build vocab
+ transfer_dict = []
+ with open(fairseq_dict_path, 'r', encoding='utf-8') as f_dict:
+ for line in f_dict:
+ word_id, count = line.split(' ', 1)
+ transfer_dict.append(word_id)
+ transfer_dict = {transfer_dict[i]: i for i in range(len(transfer_dict))}
+ with open(temp_vocab_file, 'r', encoding='utf-8') as f_v:
+ inter_vocab = json.load(f_v)
+ # transfer by dict
+ for k in inter_vocab:
+ inter_vocab[k] = transfer_dict[str(inter_vocab[k])]
+ inter_vocab = list(inter_vocab.items())
+ inter_vocab = sorted(inter_vocab, key=lambda x: x[1])
+ tokens = [e[0] for e in inter_vocab]
+
+ tail = [
+ vocab for vocab in fairseq_vocab.indices.keys() if re.match(
+ r'^madeupword[\d]{4}$',
+ vocab) is not None]
+    all_tokens = ['<s>', '<pad>', '</s>', '<unk>'] + \
+        tokens + tail + ['<mask>']
+
+ gluon_vocab = gluon_Vocab(all_tokens,
+ unk_token=fairseq_vocab.unk_word,
+ pad_token=fairseq_vocab.pad_word,
+ eos_token=fairseq_vocab.eos_word,
+ bos_token=fairseq_vocab.bos_word,
+ mask_token=fairseq_vocab[-1])
+ gluon_vocab.save(vocab_save_path)
+ os.remove(temp_vocab_file)
+ os.remove(temp_merges_file)
+
+ gluon_tokenizer = HuggingFaceByteBPETokenizer(
+ merges_save_path,
+ vocab_save_path
+ )
+
+ if args.test:
+ test_vocab(fairseq_model, gluon_tokenizer)
+
+ vocab_size = len(fairseq_vocab)
+ print('| converted dictionary: {} types'.format(vocab_size))
+ return vocab_size
+
+
+def test_vocab(fairseq_model, gluon_tokenizer, check_all_tokens=False):
+ print('testing vocab')
+ fairseq_vocab = fairseq_model.task.dictionary
+ gluon_vocab = gluon_tokenizer.vocab
+ assert len(fairseq_vocab) == \
+ len(gluon_vocab)
+
+ # assert all_tokens
+    # roberta with the gpt2 byte-level bpe does not provide all tokens directly
+ if check_all_tokens:
+ for i in range(len(fairseq_vocab)):
+ assert fairseq_vocab[i] == gluon_vocab.all_tokens[i], \
+ '{}, {}, {}'.format(i, fairseq_vocab[i], gluon_vocab.all_tokens[i])
+
+ # assert special tokens
+ for special_tokens in ['unk', 'pad', 'eos', 'bos']:
+ assert getattr(fairseq_vocab, special_tokens + '_index') == \
+ getattr(gluon_vocab, special_tokens + '_id')
+ assert getattr(fairseq_vocab, special_tokens + '_word') == \
+ getattr(gluon_vocab, special_tokens + '_token')
+    # <mask> is the last token
+    assert fairseq_vocab[-1] == \
+        gluon_vocab.all_tokens[-1] == \
+        '<mask>'
+
+ sentence = "Hello, y'all! How are you Ⅷ 😁 😁 😁 ?" + \
+ 'GluonNLP is great!!!!!!' + \
+ "GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:!@# 'abc'"
+ # assert encode
+ fs_tokens = fairseq_model.encode(sentence)
+ gl_tokens = gluon_tokenizer.encode(sentence, int)
+ # Notice: we may append bos and eos
+    # manually after tokenizing sentences
+ assert fs_tokens.numpy().tolist()[1:-1] == gl_tokens
+
+ # assert decode
+ fs_sentence = fairseq_model.decode(fs_tokens)
+ gl_sentence = gluon_tokenizer.decode(gl_tokens)
+ assert fs_sentence == gl_sentence
+
+
+def convert_config(fairseq_cfg, vocab_size, cfg):
+ print('converting config')
+ cfg.defrost()
+ cfg.MODEL.vocab_size = vocab_size
+ cfg.MODEL.units = fairseq_cfg.encoder_embed_dim
+ cfg.MODEL.hidden_size = fairseq_cfg.encoder_ffn_embed_dim
+ cfg.MODEL.max_length = fairseq_cfg.max_positions
+ cfg.MODEL.num_heads = fairseq_cfg.encoder_attention_heads
+ cfg.MODEL.num_layers = fairseq_cfg.encoder_layers
+ cfg.MODEL.pos_embed_type = 'learned'
+ cfg.MODEL.activation = fairseq_cfg.activation_fn
+ cfg.MODEL.pooler_activation = fairseq_cfg.pooler_activation_fn
+ cfg.MODEL.layer_norm_eps = 1E-5
+ cfg.MODEL.hidden_dropout_prob = fairseq_cfg.dropout
+ cfg.MODEL.attention_dropout_prob = fairseq_cfg.attention_dropout
+ cfg.MODEL.dtype = 'float32'
+ cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02]
+ cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02]
+ cfg.INITIALIZER.bias = ['zeros']
+ cfg.VERSION = 1
+ cfg.freeze()
+ return cfg
+
+
+def convert_params(fairseq_model,
+ gluon_cfg,
+ ctx):
+ fairseq_params = fairseq_model.state_dict()
+ fairseq_prefix = 'model.encoder.'
+ gluon_prefix = 'backbone_model.'
+ print('converting {} params'.format(gluon_prefix))
+
+ gluon_model = RobertaForMLM(backbone_cfg=gluon_cfg)
+ # output all hidden states for testing
+ gluon_model.backbone_model._output_all_encodings = True
+ gluon_model.backbone_model.encoder._output_all_encodings = True
+
+ gluon_model.initialize(ctx=ctx)
+ gluon_model.hybridize()
+ gluon_params = gluon_model.collect_params()
+ num_layers = gluon_cfg.MODEL.num_layers
+ for layer_id in range(num_layers):
+ fs_atten_prefix = \
+ '{}sentence_encoder.layers.{}.self_attn.' \
+ .format(fairseq_prefix, layer_id)
+ fs_q_weight = fairseq_params[fs_atten_prefix + 'q_proj.weight'].cpu().numpy()
+ fs_k_weight = fairseq_params[fs_atten_prefix + 'k_proj.weight'].cpu().numpy()
+ fs_v_weight = fairseq_params[fs_atten_prefix + 'v_proj.weight'].cpu().numpy()
+ fs_q_bias = fairseq_params[fs_atten_prefix + 'q_proj.bias'].cpu().numpy()
+ fs_k_bias = fairseq_params[fs_atten_prefix + 'k_proj.bias'].cpu().numpy()
+ fs_v_bias = fairseq_params[fs_atten_prefix + 'v_proj.bias'].cpu().numpy()
+ gl_qkv_prefix = \
+ '{}encoder.all_layers.{}.attn_qkv.' \
+ .format(gluon_prefix, layer_id)
+ gl_qkv_weight = gluon_params[gl_qkv_prefix + 'weight']
+ gl_qkv_bias = gluon_params[gl_qkv_prefix + 'bias']
+ gl_qkv_weight.set_data(
+ np.concatenate([fs_q_weight, fs_k_weight, fs_v_weight], axis=0))
+ gl_qkv_bias.set_data(
+ np.concatenate([fs_q_bias, fs_k_bias, fs_v_bias], axis=0))
+
+ for k, v in [
+ ('self_attn.out_proj.weight', 'attention_proj.weight'),
+ ('self_attn.out_proj.bias', 'attention_proj.bias'),
+ ('self_attn_layer_norm.weight', 'layer_norm.gamma'),
+ ('self_attn_layer_norm.bias', 'layer_norm.beta'),
+ ('fc1.weight', 'ffn.ffn_1.weight'),
+ ('fc1.bias', 'ffn.ffn_1.bias'),
+ ('fc2.weight', 'ffn.ffn_2.weight'),
+ ('fc2.bias', 'ffn.ffn_2.bias'),
+ ('final_layer_norm.weight', 'ffn.layer_norm.gamma'),
+ ('final_layer_norm.bias', 'ffn.layer_norm.beta')
+ ]:
+ fs_name = '{}sentence_encoder.layers.{}.{}' \
+ .format(fairseq_prefix, layer_id, k)
+ gl_name = '{}encoder.all_layers.{}.{}' \
+ .format(gluon_prefix, layer_id, v)
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+
+ for k, v in [
+ ('sentence_encoder.embed_tokens.weight', 'word_embed.weight'),
+ ('sentence_encoder.emb_layer_norm.weight', 'embed_ln.gamma'),
+ ('sentence_encoder.emb_layer_norm.bias', 'embed_ln.beta'),
+ ]:
+ fs_name = fairseq_prefix + k
+ gl_name = gluon_prefix + v
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+
+ # position embed weight
+ padding_idx = fairseq_model.task.dictionary.pad_index
+ fs_pos_embed_name = fairseq_prefix + 'sentence_encoder.embed_positions.weight'
+ gl_pos_embed_name = gluon_prefix + 'pos_embed._embed.weight'
+ gluon_params[gl_pos_embed_name].set_data(
+ fairseq_params[fs_pos_embed_name].cpu().numpy()[padding_idx + 1:, :])
+
+ for k, v in [
+ ('lm_head.dense.weight', 'mlm_decoder.0.weight'),
+ ('lm_head.dense.bias', 'mlm_decoder.0.bias'),
+ ('lm_head.layer_norm.weight', 'mlm_decoder.2.gamma'),
+ ('lm_head.layer_norm.bias', 'mlm_decoder.2.beta'),
+ ('lm_head.bias', 'mlm_decoder.3.bias')
+ ]:
+ fs_name = fairseq_prefix + k
+ gluon_params[v].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+ # assert untie=False
+ assert np.array_equal(
+ fairseq_params[fairseq_prefix + 'sentence_encoder.embed_tokens.weight'].cpu().numpy(),
+ fairseq_params[fairseq_prefix + 'lm_head.weight'].cpu().numpy()
+ )
+ return gluon_model
+
+
+def test_model(fairseq_model, gluon_model, gpu):
+ print('testing model')
+ ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
+ batch_size = 3
+ seq_length = 32
+ vocab_size = len(fairseq_model.task.dictionary)
+ padding_id = fairseq_model.model.encoder.sentence_encoder.padding_idx
+ input_ids = np.random.randint( # skip padding_id
+ padding_id + 1,
+ vocab_size,
+ (batch_size, seq_length)
+ )
+ valid_length = np.random.randint(
+ seq_length // 2,
+ seq_length,
+ (batch_size,)
+ )
+
+ for i in range(batch_size): # add padding, for fairseq padding mask
+ input_ids[i, valid_length[i]:] = padding_id
+
+ gl_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
+ gl_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
+    # use every position as a masked position so that all tokens are projected by the MLM head
+ gl_masked_positions = mx.npx.arange_like(gl_input_ids, axis=1)
+ gl_masked_positions = gl_masked_positions + mx.np.zeros_like(gl_input_ids)
+
+ fs_input_ids = torch.from_numpy(input_ids).cuda(gpu)
+
+ fairseq_model.model.eval()
+
+ gl_all_hiddens, gl_pooled, gl_mlm_scores = \
+ gluon_model(gl_input_ids, gl_valid_length, gl_masked_positions)
+
+ fs_mlm_scores, fs_extra = \
+ fairseq_model.model.cuda(gpu)(
+ fs_input_ids,
+ return_all_hiddens=True)
+ fs_all_hiddens = fs_extra['inner_states']
+
+ # checking all_encodings_outputs
+ num_layers = fairseq_model.args.encoder_layers
+ for i in range(num_layers + 1):
+ gl_hidden = gl_all_hiddens[i].asnumpy()
+ fs_hidden = fs_all_hiddens[i]
+ fs_hidden = fs_hidden.transpose(0, 1)
+ fs_hidden = fs_hidden.detach().cpu().numpy()
+ for j in range(batch_size):
+ assert_allclose(
+ gl_hidden[j, :valid_length[j], :],
+ fs_hidden[j, :valid_length[j], :],
+ 1E-3,
+ 1E-3
+ )
+ # checking masked_language_scores
+ gl_mlm_scores = gl_mlm_scores.asnumpy()
+ fs_mlm_scores = fs_mlm_scores.detach().cpu().numpy()
+ for j in range(batch_size):
+ assert_allclose(
+ gl_mlm_scores[j, :valid_length[j], :],
+ fs_mlm_scores[j, :valid_length[j], :],
+ 1E-3,
+ 1E-3
+ )
+
+
+def rename(save_dir):
+ """Rename converted files with hash"""
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ old_path = os.path.join(save_dir, old_name)
+ long_hash = sha1sum(old_path)
+ file_prefix, file_sufix = old_name.split('.')
+ new_name = '{file_prefix}-{short_hash}.{file_sufix}'.format(
+ file_prefix=file_prefix,
+ short_hash=long_hash[:8],
+ file_sufix=file_sufix)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{} {} {}'.format(new_path, long_hash, file_size))
+
+
+def convert_fairseq_model(args):
+ if not args.save_dir:
+ args.save_dir = os.path.basename(args.fairseq_model_path) + '_gluon'
+ if not os.path.exists(args.save_dir):
+ os.makedirs(args.save_dir)
+
+ fairseq_roberta = fairseq_RobertaModel.from_pretrained(args.fairseq_model_path,
+ checkpoint_file='model.pt')
+ vocab_size = convert_vocab(args, fairseq_roberta)
+
+ gluon_cfg = convert_config(fairseq_roberta.args, vocab_size,
+ RobertaModel.get_cfg().clone())
+ with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
+ of.write(gluon_cfg.dump())
+
+ ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
+ gluon_roberta = convert_params(fairseq_roberta,
+ gluon_cfg,
+ ctx)
+ if args.test:
+ test_model(fairseq_roberta, gluon_roberta, args.gpu)
+
+ gluon_roberta.save_parameters(os.path.join(args.save_dir, 'model_mlm.params'), deduplicate=True)
+ logging.info('Convert the RoBERTa MLM model in {} to {}'.
+ format(os.path.join(args.fairseq_model_path, 'model.pt'),
+ os.path.join(args.save_dir, 'model_mlm.params')))
+ gluon_roberta.backbone_model.save_parameters(
+ os.path.join(args.save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the RoBERTa backbone model in {} to {}'.
+ format(os.path.join(args.fairseq_model_path, 'model.pt'),
+ os.path.join(args.save_dir, 'model.params')))
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
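+    # rename each produced file via naming_convention (which embeds a short content hash)
+    # and log its sha1 and size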
+ old_names = os.listdir(args.save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(args.save_dir, old_name)
+ old_path = os.path.join(args.save_dir, old_name)
+ new_path = os.path.join(args.save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(args.save_dir, new_name, long_hash, file_size))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ convert_fairseq_model(args)
diff --git a/scripts/conversion_toolkits/convert_fairseq_xlmr.py b/scripts/conversion_toolkits/convert_fairseq_xlmr.py
new file mode 100644
index 0000000000..4b3ec74da6
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_fairseq_xlmr.py
@@ -0,0 +1,120 @@
+import os
+import copy
+import logging
+import argparse
+
+import mxnet as mx
+
+from gluonnlp.utils.misc import logging_config
+from gluonnlp.models.xlmr import XLMRModel, XLMRForMLM
+from gluonnlp.third_party import sentencepiece_model_pb2
+from fairseq.models.roberta import XLMRModel as fairseq_XLMRModel
+from convert_fairseq_roberta import rename, test_model, test_vocab, convert_config, convert_params
+from gluonnlp.data.tokenizers import SentencepieceTokenizer
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Convert the fairseq XLM-R Model to Gluon.')
+ parser.add_argument('--fairseq_model_path', type=str, required=True,
+ help='Directory of the fairseq XLM-R model.')
+ parser.add_argument('--model_size', type=str, choices=['base', 'large'], default='base',
+ help='Size of XLM-R model.')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='Directory path to save the converted XLM-R model.')
+ parser.add_argument('--gpu', type=int, default=None,
+                        help='The GPU to run MXNet on, e.g. --gpu 0. '
+                             'The default device is CPU.')
+ parser.add_argument('--test', action='store_true',
+ help='Whether to test the conversion.')
+ return parser.parse_args()
+
+def convert_vocab(args, fairseq_model):
+ print('converting vocab')
+ origin_spm_path = os.path.join(args.fairseq_model_path, 'sentencepiece.bpe.model')
+ assert os.path.exists(origin_spm_path)
+ new_spm_path = os.path.join(args.save_dir, 'sentencepiece.model')
+ fairseq_vocab = fairseq_model.task.dictionary
+ # bos_word attr missing in fairseq_vocab
+ fairseq_vocab.bos_word = fairseq_vocab[fairseq_vocab.bos_index]
+
+    # reorder the special pieces so the sentencepiece model matches the fairseq dictionary:
+    # model.pieces: <unk>, <s>, </s>, other_tokens ->
+    # model.pieces: <s>, <pad>, </s>, <unk>, other_tokens, <mask>
+ model = sentencepiece_model_pb2.ModelProto()
+ with open(origin_spm_path, 'rb') as f_m:
+ model.ParseFromString(f_m.read())
+ p0 = model.pieces[0]
+ p1 = model.pieces[1]
+ p2 = model.pieces[2]
+
+ pad_piece = copy.deepcopy(p0)
+ pad_piece.piece = fairseq_vocab.pad_word
+ pad_piece.type = pad_piece.CONTROL
+ mask_piece = copy.deepcopy(p0)
+    mask_piece.piece = '<mask>'
+ mask_piece.type = mask_piece.CONTROL
+
+ p0.type = p0.CONTROL
+ p0.piece = fairseq_vocab.bos_word
+ p1.type = p1.CONTROL
+ p1.piece = fairseq_vocab.eos_word
+ p2.type = p2.UNKNOWN
+ p2.piece = fairseq_vocab.unk_word
+ model.pieces.insert(fairseq_vocab.pad_index, pad_piece)
+ model.pieces.append(mask_piece)
+
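+    # record the fairseq special token ids in the trainer_spec so the converted
+    # sentencepiece model stays consistent with the fairseq dictionary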
+ model.trainer_spec.vocab_size = len(fairseq_vocab)
+ model.trainer_spec.unk_id = fairseq_vocab.unk_index
+ model.trainer_spec.bos_id = fairseq_vocab.bos_index
+ model.trainer_spec.eos_id = fairseq_vocab.eos_index
+ model.trainer_spec.pad_id = fairseq_vocab.pad_index
+
+ with open(new_spm_path, 'wb') as f:
+ f.write(model.SerializeToString())
+
+ gluon_tokenizer = SentencepieceTokenizer(new_spm_path)
+ if args.test:
+ test_vocab(fairseq_model, gluon_tokenizer, check_all_tokens=True)
+
+ vocab_size = len(fairseq_model.task.dictionary)
+ print('| converted dictionary: {} types'.format(vocab_size))
+ return vocab_size
+
+def convert_fairseq_model(args):
+ if not args.save_dir:
+ args.save_dir = os.path.basename(args.fairseq_model_path) + '_gluon'
+ if not os.path.exists(args.save_dir):
+ os.makedirs(args.save_dir)
+ fairseq_xlmr = fairseq_XLMRModel.from_pretrained(args.fairseq_model_path,
+ checkpoint_file='model.pt')
+ vocab_size = convert_vocab(args, fairseq_xlmr)
+
+ gluon_cfg = convert_config(fairseq_xlmr.args, vocab_size,
+ XLMRModel.get_cfg().clone())
+ with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
+ of.write(gluon_cfg.dump())
+
+ ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
+
+ gluon_xlmr = convert_params(fairseq_xlmr,
+ gluon_cfg,
+ ctx)
+ if args.test:
+ test_model(fairseq_xlmr, gluon_xlmr, args.gpu)
+
+ gluon_xlmr.save_parameters(os.path.join(args.save_dir, 'model_mlm.params'), deduplicate=True)
+    logging.info('Convert the XLM-R MLM model in {} to {}'.
+                 format(os.path.join(args.fairseq_model_path, 'model.pt'),
+                        os.path.join(args.save_dir, 'model_mlm.params')))
+ gluon_xlmr.backbone_model.save_parameters(
+ os.path.join(args.save_dir, 'model.params'), deduplicate=True)
+    logging.info('Convert the XLM-R backbone model in {} to {}'.
+                 format(os.path.join(args.fairseq_model_path, 'model.pt'),
+                        os.path.join(args.save_dir, 'model.params')))
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
+ rename(args.save_dir)
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ convert_fairseq_model(args)
diff --git a/scripts/conversion_toolkits/convert_gpt2.py b/scripts/conversion_toolkits/convert_gpt2.py
new file mode 100644
index 0000000000..7efe720922
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_gpt2.py
@@ -0,0 +1,257 @@
+import os
+import re
+import json
+import shutil
+import logging
+import argparse
+
+import tensorflow as tf
+from tensorflow.contrib.training import HParams
+from gpt_2.src import model
+
+import mxnet as mx
+import numpy as np
+from numpy.testing import assert_allclose
+
+from gluonnlp.data.vocab import Vocab
+from gluonnlp.utils.misc import sha1sum, logging_config, naming_convention
+from gluonnlp.models.gpt2 import GPT2Model, GPT2ForLM
+
+mx.npx.set_np()
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Convert the tf GPT-2 Model to Gluon.')
+ parser.add_argument('--tf_model_path', type=str, required=True,
+ help='Directory of the tf GPT-2 model.')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='Directory path to save the converted GPT-2 model.')
+ parser.add_argument('--test', action='store_true',
+ help='Whether to test the conversion.')
+ return parser.parse_args()
+
+def convert_vocab(args):
+ print('converting vocab')
+ merges_path = os.path.join(args.tf_model_path, 'vocab.bpe')
+ vocab_path = os.path.join(args.tf_model_path, 'encoder.json')
+ gluon_merges_path = os.path.join(args.save_dir, 'gpt2.merges')
+ gluon_vocab_path = os.path.join(args.save_dir, 'gpt2.vocab')
+
+ shutil.copy(merges_path, gluon_merges_path)
+ with open(vocab_path, 'r', encoding='utf-8') as f_v:
+ tf_vocab = json.load(f_v)
+ tf_vocab = list(tf_vocab.items())
+ tf_vocab = sorted(tf_vocab, key=lambda x: x[1])
+ all_tokens = [e[0] for e in tf_vocab]
+ eos_token = all_tokens[-1]
+ assert eos_token == '<|endoftext|>'
+ gluon_vocab = Vocab(all_tokens,
+ unk_token=None,
+ eos_token=eos_token)
+ gluon_vocab.save(gluon_vocab_path)
+
+ vocab_size = len(gluon_vocab)
+ print('| converted dictionary: {} types'.format(vocab_size))
+ return vocab_size
+
+
+def convert_config(tf_cfg, vocab_size):
+ print('converting config')
+ cfg = GPT2Model.get_cfg().clone()
+ cfg.defrost()
+ cfg.MODEL.vocab_size = tf_cfg['n_vocab']
+ cfg.MODEL.units = tf_cfg['n_embd']
+ cfg.MODEL.max_length = tf_cfg['n_ctx']
+ cfg.MODEL.num_heads = tf_cfg['n_head']
+ cfg.MODEL.num_layers = tf_cfg['n_layer']
+ cfg.VERSION = 1
+ cfg.freeze()
+ return cfg
+
+
+def read_tf_ckpt(path):
+ from tensorflow.python import pywrap_tensorflow
+ tensors = {}
+ reader = pywrap_tensorflow.NewCheckpointReader(path)
+ var_to_shape_map = reader.get_variable_to_shape_map()
+ for key in sorted(var_to_shape_map):
+ tensor = reader.get_tensor(key)
+ tensors[key] = tensor
+ return tensors
+
+
+def convert_backbone_params(tf_params, gluon_backbone_model):
+ TF_GLUON_NAME_MAP = {
+ 'model/wte' : '_embed.weight',
+ 'model/wpe' : '_pos_embed._embed.weight',
+        r'model/h(\d+)/ln_1/b' : '_layers.{}.atten.ln.beta',
+        r'model/h(\d+)/ln_1/g' : '_layers.{}.atten.ln.gamma',
+        r'model/h(\d+)/ln_2/b' : '_layers.{}.ffn.layer_norm.beta',
+        r'model/h(\d+)/ln_2/g' : '_layers.{}.ffn.layer_norm.gamma',
+        r'model/h(\d+)/mlp/c_fc/w' : '_layers.{}.ffn.ffn_1.weight',
+        r'model/h(\d+)/mlp/c_fc/b' : '_layers.{}.ffn.ffn_1.bias',
+        r'model/h(\d+)/mlp/c_proj/w' : '_layers.{}.ffn.ffn_2.weight',
+        r'model/h(\d+)/mlp/c_proj/b' : '_layers.{}.ffn.ffn_2.bias',
+        r'model/h(\d+)/attn/c_attn/w' : '_layers.{}.atten.qkv.weight',
+        r'model/h(\d+)/attn/c_attn/b' : '_layers.{}.atten.qkv.bias',
+        r'model/h(\d+)/attn/c_proj/w' : '_layers.{}.atten.out_proj.weight',
+        r'model/h(\d+)/attn/c_proj/b' : '_layers.{}.atten.out_proj.bias',
+        'model/ln_f/b' : '_final_ln.beta',
+        'model/ln_f/g' : '_final_ln.gamma'
+ }
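+    # each TF name is matched against the patterns above in order and the first match wins;
+    # e.g. 'model/h3/attn/c_attn/w' maps to '_layers.3.atten.qkv.weight'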
+
+ params = gluon_backbone_model.collect_params()
+ loaded = {k: False for k in params}
+ for name, param_value in tf_params.items():
+ gluon_param_name = None
+ for lhs, rhs in TF_GLUON_NAME_MAP.items():
+ match = re.match(lhs, name)
+ if match is not None:
+                if len(match.groups()) > 0:
+                    gluon_param_name = rhs.format(match.groups()[0])
+                else:
+                    gluon_param_name = rhs
+                break
+ assert gluon_param_name is not None
+ print('{} --> {}'.format(name, gluon_param_name))
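+        # the TF GPT-2 checkpoint stores conv1d kernels with a leading singleton axis
+        # (1, in_units, out_units); squeeze it and transpose to Gluon's (out_units, in_units)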
+ if param_value.shape != params[gluon_param_name].shape:
+ params[gluon_param_name].set_data(param_value[0].T)
+ else:
+ params[gluon_param_name].set_data(param_value)
+ loaded[gluon_param_name] = True
+ for name in params:
+ if not loaded[name]:
+ print('{} is not loaded!'.format(name))
+
+
+def rename(save_dir):
+ """Rename converted files with hash"""
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ old_path = os.path.join(save_dir, old_name)
+ long_hash = sha1sum(old_path)
+        file_prefix, file_suffix = old_name.split('.')
+        new_name = '{file_prefix}-{short_hash}.{file_suffix}'.format(
+            file_prefix=file_prefix,
+            short_hash=long_hash[:8],
+            file_suffix=file_suffix)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{} {} {}'.format(new_path, long_hash, file_size))
+
+
+def test_model(tf_model_path, gluon_model):
+ # test data
+ ctx = mx.cpu()
+
+ seed = 123
+ batch_size = 3
+ seq_length = 32
+ vocab_size = gluon_model._backbone_model._vocab_size
+ np.random.seed(seed)
+ input_ids = np.random.randint(
+ 0,
+ vocab_size,
+ (batch_size, seq_length)
+ )
+
+ with open(os.path.join(tf_model_path, 'hparams.json'), 'r') as hf:
+ tf_cfg = json.load(hf)
+ hparams = HParams(
+ n_vocab=tf_cfg['n_vocab'],
+ n_ctx=tf_cfg['n_ctx'],
+ n_embd=tf_cfg['n_embd'],
+ n_head=tf_cfg['n_head'],
+ n_layer=tf_cfg['n_layer'],
+ )
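+    # the TF 'past' tensor starts with an empty time dimension, and init_states builds the
+    # corresponding initial states for the Gluon model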
+    tf_start_states = np.zeros((batch_size, hparams.n_layer, 2, hparams.n_head,
+                                0, hparams.n_embd // hparams.n_head))
+ gl_start_states = gluon_model.init_states(batch_size, ctx)
+
+ # gluon model
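+    # run two forward passes: the first starts from the empty states, the second reuses the
+    # cached states so the incremental decoding path is also compared against TF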
+ gl_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
+    gl_logits_1, gl_states = gluon_model(gl_input_ids, gl_start_states,
+                                         mx.np.array(0, dtype=np.int32, ctx=ctx))
+    gl_logits_2, _ = gluon_model(gl_input_ids, gl_states,
+                                 mx.np.array(seq_length, dtype=np.int32, ctx=ctx))
+
+ # tf model
+ with tf.Session(graph=tf.Graph()) as sess:
+ tf.set_random_seed(None)
+ tf_context = tf.placeholder(tf.int32, [batch_size, seq_length])
+ tf_past = tf.placeholder(tf.float32, [batch_size, hparams.n_layer, 2, hparams.n_head,
+ None, hparams.n_embd // hparams.n_head])
+ tf_lm_output = model.model(hparams=hparams, X=tf_context, past=tf_past, reuse=tf.AUTO_REUSE)
+
+ tf_saver = tf.train.Saver()
+ tf_ckpt = tf.train.latest_checkpoint(tf_model_path)
+ tf_saver.restore(sess, tf_ckpt)
+
+ tf_output_1 = sess.run(tf_lm_output, feed_dict={tf_context:input_ids, tf_past:tf_start_states})
+ tf_logits_1 = tf_output_1['logits']
+ tf_present = tf_output_1['present']
+
+ tf_output_2 = sess.run(tf_lm_output, feed_dict={tf_context:input_ids, tf_past:tf_present})
+ tf_logits_2 = tf_output_2['logits']
+
+ for j in range(batch_size):
+ assert_allclose(
+ gl_logits_1[j, :, :].asnumpy(),
+ tf_logits_1[j, :, :],
+ 1E-3,
+ 1E-3
+ )
+ for j in range(batch_size):
+ assert_allclose(
+ gl_logits_2[j, :, :].asnumpy(),
+ tf_logits_2[j, :, :],
+ 1E-3,
+ 1E-3
+ )
+
+def convert_gpt2(args):
+ if not os.path.exists(args.save_dir):
+ os.makedirs(args.save_dir)
+
+ tf_ckpt_path = os.path.join(args.tf_model_path, 'model.ckpt')
+ tf_params = read_tf_ckpt(tf_ckpt_path)
+ with open(os.path.join(args.tf_model_path, 'hparams.json'), 'r') as hf:
+ tf_cfg = json.load(hf)
+
+ vocab_size = convert_vocab(args)
+ gluon_backbone_cfg = convert_config(tf_cfg, vocab_size)
+ with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
+ of.write(gluon_backbone_cfg.dump())
+
+ gluon_gpt2forlm_model = GPT2ForLM(gluon_backbone_cfg)
+ gluon_gpt2forlm_model.initialize(ctx=mx.cpu())
+ gluon_gpt2forlm_model.hybridize()
+ gluon_backbone_model = gluon_gpt2forlm_model._backbone_model
+ convert_backbone_params(tf_params, gluon_backbone_model)
+
+ if args.test:
+ test_model(args.tf_model_path, gluon_gpt2forlm_model)
+
+    gluon_gpt2forlm_model.save_parameters(os.path.join(args.save_dir, 'model_lm.params'),
+                                          deduplicate=True)
+ logging.info('Convert the GPT2 LM model in {} to {}'.
+ format(os.path.join(args.tf_model_path, 'model.ckpt'),
+ os.path.join(args.save_dir, 'model_lm.params')))
+ gluon_backbone_model.save_parameters(os.path.join(args.save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the GPT2 backbone model in {} to {}'.
+ format(os.path.join(args.tf_model_path, 'model.ckpt'),
+ os.path.join(args.save_dir, 'model.params')))
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
+ old_names = os.listdir(args.save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(args.save_dir, old_name)
+ old_path = os.path.join(args.save_dir, old_name)
+ new_path = os.path.join(args.save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(args.save_dir, new_name, long_hash, file_size))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ convert_gpt2(args)
diff --git a/scripts/conversion_toolkits/convert_gpt2.sh b/scripts/conversion_toolkits/convert_gpt2.sh
new file mode 100644
index 0000000000..a551250c4b
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_gpt2.sh
@@ -0,0 +1,8 @@
+python3 -m pip install tensorflow==1.15 --upgrade --user
+git clone https://github.com/openai/gpt-2.git gpt_2
+for model in 124M 355M 774M
+do
+ python3 gpt_2/download_model.py ${model}
+ mkdir gpt2_${model}
+ CUDA_VISIBLE_DEVICES="" python3 convert_gpt2.py --tf_model_path models/${model} --save_dir gpt2_${model} --test
+done
diff --git a/scripts/conversion_toolkits/convert_mobilebert.py b/scripts/conversion_toolkits/convert_mobilebert.py
new file mode 100644
index 0000000000..756b86ca31
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_mobilebert.py
@@ -0,0 +1,343 @@
+import os
+import re
+import json
+import sys
+import shutil
+import logging
+import argparse
+
+import mxnet as mx
+import numpy as np
+from numpy.testing import assert_allclose
+
+from gluonnlp.utils.misc import sha1sum, naming_convention, logging_config
+from gluonnlp.data.tokenizers import HuggingFaceWordPieceTokenizer
+from gluonnlp.models.mobilebert import MobileBertModel, MobileBertForPretrain
+import tensorflow.compat.v1 as tf
+
+tf.disable_eager_execution()
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
+
+mx.npx.set_np()
+np.random.seed(1234)
+mx.npx.random.seed(1234)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Convert the TF MobileBERT model to Gluon')
+    parser.add_argument('--tf_model_path', type=str,
+                        help='Directory of the model downloaded from TF hub.')
+    parser.add_argument('--mobilebert_dir', type=str,
+                        help='Path to the github repository of mobilebert, you may clone it by '
+                             '`svn checkout https://github.com/google-research/'
+                             'google-research/trunk/mobilebert`.')
+    parser.add_argument('--save_dir', type=str, default=None,
+                        help='Directory path to save the converted MobileBERT model.')
+    parser.add_argument('--gpu', type=int, default=None,
+                        help='A single GPU to run MXNet, e.g. 0 or 1. The default device is CPU.')
+    parser.add_argument('--test', action='store_true',
+                        help='Whether to test the conversion.')
+ args = parser.parse_args()
+ return args
+
+
+def read_tf_checkpoint(path):
+ """read tensorflow checkpoint"""
+ from tensorflow.python import pywrap_tensorflow
+ tensors = {}
+ reader = pywrap_tensorflow.NewCheckpointReader(path)
+ var_to_shape_map = reader.get_variable_to_shape_map()
+ for key in sorted(var_to_shape_map):
+ tensor = reader.get_tensor(key)
+ tensors[key] = tensor
+ return tensors
+
+
+def convert_tf_config(config_dict_path, vocab_size):
+ """Convert the config file"""
+ with open(config_dict_path, encoding='utf-8') as f:
+ config_dict = json.load(f)
+ assert vocab_size == config_dict['vocab_size']
+ cfg = MobileBertModel.get_cfg().clone()
+ cfg.defrost()
+ cfg.MODEL.vocab_size = vocab_size
+ cfg.MODEL.units = config_dict['hidden_size']
+ cfg.MODEL.embed_size = config_dict['embedding_size']
+ cfg.MODEL.inner_size = config_dict['intra_bottleneck_size']
+ cfg.MODEL.hidden_size = config_dict['intermediate_size']
+ cfg.MODEL.max_length = config_dict['max_position_embeddings']
+ cfg.MODEL.num_heads = config_dict['num_attention_heads']
+ cfg.MODEL.num_layers = config_dict['num_hidden_layers']
+ cfg.MODEL.num_stacked_ffn = config_dict['num_feedforward_networks']
+ cfg.MODEL.pos_embed_type = 'learned'
+ cfg.MODEL.activation = config_dict['hidden_act']
+ cfg.MODEL.num_token_types = config_dict['type_vocab_size']
+ cfg.MODEL.hidden_dropout_prob = float(config_dict['hidden_dropout_prob'])
+ cfg.MODEL.attention_dropout_prob = float(config_dict['attention_probs_dropout_prob'])
+ cfg.MODEL.normalization = config_dict['normalization_type']
+ cfg.MODEL.dtype = 'float32'
+
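+    # the TF config only flags which bottleneck variant was used; map it onto the explicit
+    # bottleneck_strategy names understood by the Gluon MobileBertModel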
+ if 'use_bottleneck_attention' in config_dict.keys():
+ cfg.MODEL.bottleneck_strategy = 'from_bottleneck'
+ elif 'key_query_shared_bottleneck' in config_dict.keys():
+ cfg.MODEL.bottleneck_strategy = 'qk_sharing'
+ else:
+ cfg.MODEL.bottleneck_strategy = 'from_input'
+
+ cfg.INITIALIZER.weight = ['truncnorm', 0,
+ config_dict['initializer_range']] # TruncNorm(0, 0.02)
+ cfg.INITIALIZER.bias = ['zeros']
+ cfg.VERSION = 1
+ cfg.freeze()
+ return cfg
+
+
+def convert_tf_assets(tf_assets_dir):
+ """Convert the assets file including config, vocab and tokenizer model"""
+ file_names = os.listdir(tf_assets_dir)
+ vocab_path = None
+ json_cfg_path = None
+ for ele in file_names:
+ if ele.endswith('.txt'):
+ assert vocab_path is None
+ vocab_path = ele
+ elif ele.endswith('.json'):
+ assert json_cfg_path is None
+ json_cfg_path = ele
+ assert vocab_path is not None and json_cfg_path is not None
+
+ vocab_path = os.path.join(tf_assets_dir, vocab_path)
+    with open(vocab_path, 'r', encoding='utf-8') as f:
+        vocab_size = len(f.readlines())
+ json_cfg_path = os.path.join(tf_assets_dir, json_cfg_path)
+ cfg = convert_tf_config(json_cfg_path, vocab_size)
+ return cfg, json_cfg_path, vocab_path
+
+
+CONVERT_MAP = [
+ # mlm model
+ ('cls/', ''),
+ ('predictions/extra_output_weights', 'extra_table.weight'),
+ ('predictions/output_bias', 'embedding_table.bias'),
+ ('predictions/transform/LayerNorm', 'mlm_decoder.2'),
+ ('predictions/transform/dense', 'mlm_decoder.0'),
+ ('seq_relationship/output_bias', 'nsp_classifier.bias'),
+ ('seq_relationship/output_weights', 'nsp_classifier.weight'),
+ # backbone
+ ('bert/', 'backbone_model.'),
+ ('layer_', 'all_layers.'),
+ ('attention/output/FakeLayerNorm', 'layer_norm'),
+ ('attention/output/dense', 'attention_proj'),
+ # inner ffn layer denoted by xxx
+ ('ffn_layers_xxx/intermediate/dense', 'stacked_ffn.xxx.ffn_1'),
+ ('ffn_layers_xxx/output/FakeLayerNorm', 'stacked_ffn.xxx.layer_norm'),
+ ('ffn_layers_xxx/output/dense', 'stacked_ffn.xxx.ffn_2'),
+ # last ffn layer denoted by xxy
+ ('intermediate/dense', 'stacked_ffn.xxy.ffn_1'),
+ ('output/FakeLayerNorm', 'stacked_ffn.xxy.layer_norm'),
+ ('output/dense', 'stacked_ffn.xxy.ffn_2'),
+ # embeddings
+ ('embeddings/word_embeddings', 'word_embed.weight'),
+ ('embeddings/token_type_embeddings', 'token_type_embed.weight'),
+ ('embeddings/position_embeddings', 'token_pos_embed._embed.weight'),
+ ('embeddings/embedding_transformation', 'embed_factorized_proj'),
+ ('embeddings/FakeLayerNorm', 'embed_layer_norm'),
+ ('bottleneck/input/FakeLayerNorm', 'in_bottleneck_ln'),
+ ('bottleneck/input/dense', 'in_bottleneck_proj'),
+ ('bottleneck/attention/FakeLayerNorm', 'shared_qk_ln'),
+ ('bottleneck/attention/dense', 'shared_qk'),
+ ('output/bottleneck/FakeLayerNorm', 'out_bottleneck_ln'),
+ ('output/bottleneck/dense', 'out_bottleneck_proj'),
+ ('attention/self/key', 'attn_key'),
+ ('attention/self/query', 'attn_query'),
+ ('attention/self/value', 'attn_value'),
+ ('output/', ''),
+ ('kernel', 'weight'),
+ ('FakeLayerNorm', 'layer_norm'),
+ ('LayerNorm', 'layer_norm'),
+ ('/', '.'),
+]
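+# The replacements above are applied in order by get_name_map below, so the specific MLM,
+# bottleneck and FFN patterns must come before the generic 'kernel' -> 'weight' and
+# '/' -> '.' rewrites.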
+
+
+def get_name_map(tf_names, num_stacked_ffn):
+ """
+    Get the mapping from tensorflow parameter names to mxnet parameter names.
+    The mapping CONVERT_MAP above is tailored to MobileBERT; there is no guarantee
+    that it matches other tf models that use special variable_scopes (tensorflow)
+    or prefixes (mxnet).
+
+    Redefine the mapping when adapting this script to a customized model.
+
+    Parameters
+    ----------
+    tf_names
+        the parameter names of the tensorflow model
+    num_stacked_ffn
+        the number of stacked feedforward sub-layers in each MobileBERT layer
+
+ Returns
+ -------
+ A dictionary with the following format:
+ {tf_names : mx_names}
+ """
+ name_map = {}
+ for source_name in tf_names:
+ target_name = source_name
+ ffn_idx = re.findall(r'ffn_layer_\d+', target_name)
+ if ffn_idx:
+ target_name = target_name.replace(ffn_idx[0], 'ffn_layers_xxx')
+ for old, new in CONVERT_MAP:
+ target_name = target_name.replace(old, new)
+ if ffn_idx:
+ target_name = target_name.replace('stacked_ffn.xxx', 'stacked_ffn.' + ffn_idx[0][10:])
+ if 'stacked_ffn.xxy' in target_name:
+ target_name = target_name.replace(
+ 'stacked_ffn.xxy', 'stacked_ffn.' + str(num_stacked_ffn - 1))
+ name_map[source_name] = target_name
+
+ return name_map
+
+
+def convert_tf_model(model_dir, save_dir, test_conversion, gpu, mobilebert_dir):
+ ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir)
+
+ cfg, json_cfg_path, vocab_path = convert_tf_assets(model_dir)
+ with open(os.path.join(save_dir, 'model.yml'), 'w') as of:
+ of.write(cfg.dump())
+ new_vocab = HuggingFaceWordPieceTokenizer(
+ vocab_file=vocab_path,
+ unk_token='[UNK]',
+ pad_token='[PAD]',
+ cls_token='[CLS]',
+ sep_token='[SEP]',
+ mask_token='[MASK]',
+ lowercase=True).vocab
+ new_vocab.save(os.path.join(save_dir, 'vocab.json'))
+
+ # test input data
+ batch_size = 3
+ seq_length = 32
+ num_mask = 5
+ input_ids = np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length))
+ valid_length = np.random.randint(seq_length // 2, seq_length, (batch_size,))
+ input_mask = np.broadcast_to(np.arange(seq_length).reshape(1, -1), (batch_size, seq_length)) \
+ < np.expand_dims(valid_length, 1)
+ segment_ids = np.random.randint(0, 2, (batch_size, seq_length))
+ mlm_positions = np.random.randint(0, seq_length // 2, (batch_size, num_mask))
+
+ tf_input_ids = tf.constant(input_ids, dtype=np.int32)
+ tf_input_mask = tf.constant(input_mask, dtype=np.int32)
+ tf_segment_ids = tf.constant(segment_ids, dtype=np.int32)
+
+ init_checkpoint = os.path.join(model_dir, 'mobilebert_variables.ckpt')
+ tf_params = read_tf_checkpoint(init_checkpoint)
+ # get parameter names for tensorflow with unused parameters filtered out.
+ tf_names = sorted(tf_params.keys())
+ tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names)
+ tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names)
+ tf_names = filter(lambda name: name != 'global_step', tf_names)
+ tf_names = list(tf_names)
+
+ sys.path.append(mobilebert_dir)
+ from mobilebert import modeling
+
+ tf_bert_config = modeling.BertConfig.from_json_file(json_cfg_path)
+ bert_model = modeling.BertModel(
+ config=tf_bert_config,
+ is_training=False,
+ input_ids=tf_input_ids,
+ input_mask=tf_input_mask,
+ token_type_ids=tf_segment_ids,
+ use_one_hot_embeddings=False)
+ tvars = tf.trainable_variables()
+ assignment_map, _ = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+ tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+ with tf.Session() as sess:
+ sess.run(tf.global_variables_initializer())
+        # the parameter names end with ':0', like 'bert/embeddings/word_embeddings:0'
+ backbone_params = {v.name.split(":")[0]: v.read_value() for v in tvars}
+ backbone_params = sess.run(backbone_params)
+ tf_token_outputs_np = {
+ 'pooled_output': sess.run(bert_model.get_pooled_output()),
+ 'sequence_output': sess.run(bert_model.get_sequence_output()),
+ }
+
+ # The following part only ensure the parameters in backbone model are valid
+ for k in backbone_params:
+ assert_allclose(tf_params[k], backbone_params[k])
+
+ # Build gluon model and initialize
+ gluon_pretrain_model = MobileBertForPretrain(cfg)
+ gluon_pretrain_model.initialize(ctx=ctx)
+ gluon_pretrain_model.hybridize()
+
+    # prepare test data
+ mx_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
+ mx_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
+ mx_token_types = mx.np.array(segment_ids, dtype=np.int32, ctx=ctx)
+ mx_masked_positions = mx.np.array(mlm_positions, dtype=np.int32, ctx=ctx)
+
+ has_mlm = True
+ name_map = get_name_map(tf_names, cfg.MODEL.num_stacked_ffn)
+ # go through the gluon model to infer the shape of parameters
+ model = gluon_pretrain_model
+ contextual_embedding, pooled_output, nsp_score, mlm_scores = \
+ model(mx_input_ids, mx_token_types, mx_valid_length, mx_masked_positions)
+ # replace tensorflow parameter names with gluon parameter names
+ mx_params = model.collect_params()
+ all_keys = set(mx_params.keys())
+ for (src_name, dst_name) in name_map.items():
+ tf_param_val = tf_params[src_name]
+ if dst_name is None:
+ continue
+ all_keys.remove(dst_name)
+ if src_name.endswith('kernel'):
+ mx_params[dst_name].set_data(tf_param_val.T)
+ else:
+ mx_params[dst_name].set_data(tf_param_val)
+
+ if has_mlm:
+ # 'embedding_table.weight' is shared with word_embed.weight
+ all_keys.remove('embedding_table.weight')
+ assert len(all_keys) == 0, 'parameters missing from tensorflow checkpoint'
+
+ # test conversion results for backbone model
+ if test_conversion:
+ tf_contextual_embedding = tf_token_outputs_np['sequence_output']
+ tf_pooled_output = tf_token_outputs_np['pooled_output']
+ contextual_embedding, pooled_output = model.backbone_model(
+ mx_input_ids, mx_token_types, mx_valid_length)
+ assert_allclose(pooled_output.asnumpy(), tf_pooled_output, 1E-2, 1E-2)
+ for i in range(batch_size):
+ ele_valid_length = valid_length[i]
+ assert_allclose(contextual_embedding[i, :ele_valid_length, :].asnumpy(),
+ tf_contextual_embedding[i, :ele_valid_length, :], 1E-2, 1E-2)
+ model.backbone_model.save_parameters(os.path.join(save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the backbone model in {} to {}/{}'.format(model_dir, save_dir, 'model.params'))
+ model.save_parameters(os.path.join(save_dir, 'model_mlm.params'), deduplicate=True)
+ logging.info('Convert the MLM and NSP model in {} to {}/{}'.format(model_dir,
+ save_dir, 'model_mlm.params'))
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
+
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(save_dir, old_name)
+ old_path = os.path.join(save_dir, old_name)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(save_dir, new_name, long_hash, file_size))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ save_dir = args.save_dir if args.save_dir is not None else os.path.basename(
+ args.tf_model_path) + '_gluon'
+ mobilebert_dir = os.path.abspath(
+ os.path.join(
+ os.path.dirname(
+ args.mobilebert_dir),
+ os.path.pardir))
+ convert_tf_model(args.tf_model_path, save_dir, args.test, args.gpu, mobilebert_dir)
diff --git a/scripts/conversion_toolkits/convert_mobilebert.sh b/scripts/conversion_toolkits/convert_mobilebert.sh
new file mode 100644
index 0000000000..f550ce8f3b
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_mobilebert.sh
@@ -0,0 +1,9 @@
+python3 -m pip install tensorflow==1.15 --upgrade --user
+export TF_FORCE_GPU_ALLOW_GROWTH="true"
+svn checkout https://github.com/google-research/google-research/trunk/mobilebert
+
+mkdir mobilebert_model
+url='https://storage.googleapis.com/cloud-tpu-checkpoints/mobilebert/uncased_L-24_H-128_B-512_A-4_F-4_OPT.tar.gz'
+wget ${url} -O "mobilebert.tar.gz"
+tar -xvf mobilebert.tar.gz --directory mobilebert_model
+python3 convert_mobilebert.py --tf_model_path mobilebert_model/mobilebert --mobilebert_dir mobilebert --test
diff --git a/scripts/conversion_toolkits/convert_roberta.sh b/scripts/conversion_toolkits/convert_roberta.sh
new file mode 100644
index 0000000000..8bb08b0607
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_roberta.sh
@@ -0,0 +1,8 @@
+python3 -m pip install git+https://github.com/pytorch/fairseq.git@master --upgrade --user
+for model in base large
+do
+ mkdir roberta_${model}
+ wget "https://dl.fbaipublicfiles.com/fairseq/models/roberta.${model}.tar.gz"
+ tar zxf roberta.${model}.tar.gz --directory roberta_${model}
+ python3 convert_fairseq_roberta.py --fairseq_model_path roberta_${model}/roberta.${model} --test
+done
diff --git a/scripts/conversion_toolkits/convert_tf_hub_model.py b/scripts/conversion_toolkits/convert_tf_hub_model.py
new file mode 100644
index 0000000000..b54726e54b
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_tf_hub_model.py
@@ -0,0 +1,534 @@
+import tensorflow_hub as hub
+import tensorflow.compat.v1 as tf
+import os
+import re
+import json
+import shutil
+import logging
+import argparse
+
+import mxnet as mx
+import numpy as np
+from numpy.testing import assert_allclose
+
+from gluonnlp.data.vocab import Vocab
+from gluonnlp.utils.misc import naming_convention, logging_config
+from gluonnlp.models.bert import BertModel, BertForMLM
+from gluonnlp.models.albert import AlbertModel, AlbertForMLM
+from gluonnlp.data.tokenizers import SentencepieceTokenizer, HuggingFaceWordPieceTokenizer
+
+import tensorflow
+USE_TF_V1 = tensorflow.version.VERSION.split('.')[0] < '2'
+tf.disable_eager_execution()
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
+
+mx.npx.set_np()
+np.random.seed(1234)
+mx.npx.random.seed(1234)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Convert the TF pretrained model to Gluon')
+ parser.add_argument('--tf_hub_model_path', type=str,
+ help='Directory of the model downloaded from TF hub.')
+ parser.add_argument('--model_type', type=str, choices=['bert', 'albert'],
+ help='The name of the model to be converted. '
+ 'Only Bert and Albert are currently supported.')
+    parser.add_argument('--save_dir', type=str, default=None,
+                        help='Directory path to save the converted pretrained model.')
+    parser.add_argument('--gpu', type=int, default=None,
+                        help='A single GPU to run MXNet, e.g. 0 or 1. The default device is CPU.')
+    parser.add_argument('--test', action='store_true',
+                        help='Whether to test the conversion.')
+ args = parser.parse_args()
+ return args
+
+
+def read_tf_checkpoint(path):
+ """read tensorflow checkpoint"""
+ from tensorflow.python import pywrap_tensorflow
+ tensors = {}
+ reader = pywrap_tensorflow.NewCheckpointReader(path)
+ var_to_shape_map = reader.get_variable_to_shape_map()
+ for key in sorted(var_to_shape_map):
+ tensor = reader.get_tensor(key)
+ tensors[key] = tensor
+ return tensors
+
+
+def convert_tf_config(json_cfg_path, vocab_size, model_type):
+ """Convert the config file"""
+
+ with open(json_cfg_path, encoding='utf-8') as f:
+ json_cfg = json.load(f)
+ if model_type == 'bert':
+        # For the bert model, the config file is copied from a local configuration file,
+        # so the vocab_size cannot be checked here. It is instead verified during the
+        # conversion of the embedding weights.
+ cfg = BertModel.get_cfg().clone()
+ elif model_type == 'albert':
+ assert vocab_size == json_cfg['vocab_size']
+ cfg = AlbertModel.get_cfg().clone()
+ else:
+ raise NotImplementedError
+ cfg.defrost()
+ cfg.MODEL.vocab_size = vocab_size
+
+ cfg.MODEL.units = json_cfg['hidden_size']
+ cfg.MODEL.hidden_size = json_cfg['intermediate_size']
+ cfg.MODEL.max_length = json_cfg['max_position_embeddings']
+ cfg.MODEL.num_heads = json_cfg['num_attention_heads']
+ cfg.MODEL.num_layers = json_cfg['num_hidden_layers']
+ cfg.MODEL.pos_embed_type = 'learned'
+ if json_cfg['hidden_act'] == 'gelu':
+ cfg.MODEL.activation = 'gelu(tanh)'
+ else:
+ cfg.MODEL.activation = json_cfg['hidden_act']
+ cfg.MODEL.layer_norm_eps = 1E-12
+ cfg.MODEL.num_token_types = json_cfg['type_vocab_size']
+ cfg.MODEL.hidden_dropout_prob = float(json_cfg['hidden_dropout_prob'])
+ cfg.MODEL.attention_dropout_prob = float(json_cfg['attention_probs_dropout_prob'])
+ cfg.MODEL.dtype = 'float32'
+ cfg.INITIALIZER.weight = ['truncnorm', 0, json_cfg['initializer_range']] # TruncNorm(0, 0.02)
+ cfg.INITIALIZER.bias = ['zeros']
+ cfg.VERSION = 1
+ if model_type == 'albert':
+ # The below configurations are not supported in bert
+ cfg.MODEL.embed_size = json_cfg['embedding_size']
+ cfg.MODEL.num_groups = json_cfg['num_hidden_groups']
+ cfg.freeze()
+ return cfg
+
+
+def convert_tf_assets(tf_assets_dir, model_type):
+ """Convert the assets file including config, vocab and tokenizer model"""
+ file_names = os.listdir(tf_assets_dir)
+ json_cfg_path = None
+ spm_model_path = None
+ vocab_path = None
+ for ele in file_names:
+ if ele.endswith('.model'):
+ assert spm_model_path is None
+ spm_model_path = ele
+ elif ele.endswith('.json'):
+ assert json_cfg_path is None
+ json_cfg_path = ele
+ elif ele.endswith('.txt'):
+ assert vocab_path is None
+ vocab_path = ele
+    assert json_cfg_path is not None and \
+        (spm_model_path is not None or vocab_path is not None), \
+        "Both the json config file and at least one of the sentencepiece model " \
+        "or the vocabulary file must exist"
+
+ json_cfg_path = os.path.join(tf_assets_dir, json_cfg_path)
+ if spm_model_path:
+ spm_model_path = os.path.join(tf_assets_dir, spm_model_path)
+ tokenizer = SentencepieceTokenizer(spm_model_path)
+ vocab_size = len(tokenizer.vocab)
+ elif vocab_path:
+ vocab_path = os.path.join(tf_assets_dir, vocab_path)
+        with open(vocab_path, 'r', encoding='utf-8') as f:
+            vocab_size = len(f.readlines())
+ cfg = convert_tf_config(json_cfg_path, vocab_size, model_type)
+ return cfg, vocab_path, spm_model_path
+
+
+CONVERT_MAP_TF1 = [
+ ('bert/', 'backbone_model.'),
+ ('cls/', ''),
+ ('predictions/transform/dense', 'mlm_decoder.0'),
+ ('predictions/transform/LayerNorm', 'mlm_decoder.2'),
+ ('predictions/output_bias', 'mlm_decoder.3.bias'),
+ ('transformer/', ''),
+ ('transform/', ''),
+ ('embeddings/word_embeddings', 'word_embed.weight'),
+ ('embeddings/token_type_embeddings', 'token_type_embed.weight'),
+ ('embeddings/position_embeddings', 'token_pos_embed._embed.weight'),
+ ('encoder/embedding_hidden_mapping_in', 'embed_factorized_proj'),
+ ('group_0/inner_group_0/', 'all_encoder_groups.0.'), # albert
+ ('layer_', 'all_layers.'), # bert
+ ('embeddings/LayerNorm', 'embed_layer_norm'),
+ ('attention/output/LayerNorm', 'layer_norm'), # bert
+ ('output/LayerNorm', 'ffn.layer_norm'), # bert
+ ('LayerNorm_1', 'ffn.layer_norm'), # albert
+ ('LayerNorm', 'layer_norm'), # albert
+ ('attention_1', 'attention'), # albert
+ ('attention/output/dense', 'attention_proj'),
+ ('ffn_1/', ''), # bert & albert
+ ('intermediate/dense', 'ffn.ffn_1'), # albert
+ ('intermediate/output/dense', 'ffn.ffn_2'), # albert
+ ('output/dense', 'ffn.ffn_2'), # bert
+ ('output/', ''),
+ ('pooler/dense', 'pooler'),
+ ('kernel', 'weight'),
+ ('attention/', ''),
+ ('/', '.'),
+]
+
+CONVERT_MAP_TF2 = [
+ (':0', ''),
+ ('cls/', ''),
+ ('predictions/output_bias', 'mlm_decoder.3.bias'),
+ ('transformer/layer_', 'encoder.all_layers.'),
+ ('word_embeddings/embeddings', 'word_embed.weight'),
+ ('type_embeddings/embeddings', 'token_type_embed.weight'),
+ ('position_embedding/embeddings', 'token_pos_embed._embed.weight'),
+ ('embeddings/layer_norm', 'embed_layer_norm'),
+ ('embedding_projection', 'embed_factorized_proj'),
+ ('self_attention/attention_output', 'attention_proj'),
+ ('self_attention_layer_norm', 'layer_norm'),
+ ('intermediate', 'ffn.ffn_1'),
+ ('output_layer_norm', 'ffn.layer_norm'),
+ ('output', 'ffn.ffn_2'),
+ ("pooler_transform", "pooler"),
+ ('kernel', 'weight'),
+ ('/', '.'),
+]
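+# TF1 Hub Modules and TF2 SavedModels name their variables differently, hence the two maps above.
+# The query/key/value kernels are skipped in get_name_map and merged into the fused attn_qkv
+# parameters later by convert_qkv_weights.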
+
+
+def get_name_map(tf_names, is_TF1=True):
+ """
+    Get the mapping from TF parameter names to MXNet parameter names.
+    The mappings CONVERT_MAP_TF1 and CONVERT_MAP_TF2 above cover Bert and Albert,
+    but there is no guarantee that they match other tf models that use special
+    variable_scopes (tensorflow) or prefixes (mxnet).
+
+    Redefine the mapping when adapting this script to a customized model.
+
+    Parameters
+    ----------
+    tf_names
+        the parameter names of the tensorflow model
+    is_TF1
+        whether the model is loaded from TF1 Hub Modules
+ whether load from TF1 Hub Modules
+
+ Returns
+ -------
+ A dictionary with the following format:
+ {tf_names : mx_names}
+ """
+ convert_map = CONVERT_MAP_TF1 if is_TF1 else CONVERT_MAP_TF2
+ name_map = {}
+ for source_name in tf_names:
+ target_name = source_name
+ # skip the qkv weights
+ if 'self/' in source_name:
+ name_map[source_name] = None
+ continue
+ if re.match(r'^transformer\/layer_[\d]+\/self_attention\/(key|value|query)\/(kernel|bias)$',
+ source_name) is not None:
+ name_map[source_name] = None
+ continue
+ for old, new in convert_map:
+ target_name = target_name.replace(old, new)
+ name_map[source_name] = target_name
+ return name_map
+
+
+def convert_tf_model(hub_model_dir, save_dir, test_conversion, model_type, gpu):
+ ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
+ # set up the model type to be converted
+ if model_type == 'bert':
+ PretrainedModel, PretrainedMLMModel = BertModel, BertForMLM
+ elif model_type == 'albert':
+ PretrainedModel, PretrainedMLMModel = AlbertModel, AlbertForMLM
+ else:
+ raise NotImplementedError
+
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir)
+
+ cfg, vocab_path, spm_model_path = convert_tf_assets(os.path.join(hub_model_dir, 'assets'),
+ model_type)
+ with open(os.path.join(save_dir, 'model.yml'), 'w') as of:
+ of.write(cfg.dump())
+ if spm_model_path:
+        # SentencePiece tokenizer used by the albert model
+ tokenizer = SentencepieceTokenizer(spm_model_path)
+ new_vocab = Vocab(tokenizer.vocab.all_tokens,
+                          unk_token='<unk>',
+                          pad_token='<pad>',
+ cls_token='[CLS]',
+ sep_token='[SEP]',
+ mask_token='[MASK]')
+ shutil.copy(spm_model_path, os.path.join(save_dir, 'spm.model'))
+ elif vocab_path:
+        # WordPiece tokenizer used by the bert and electra models
+
+        # In this step the vocabulary is converted with the help of the tokenizer,
+        # so whether the tokenizer is case-dependent does not matter.
+ new_vocab = HuggingFaceWordPieceTokenizer(
+ vocab_file=vocab_path,
+ unk_token='[UNK]',
+ pad_token='[PAD]',
+ cls_token='[CLS]',
+ sep_token='[SEP]',
+ mask_token='[MASK]',
+ lowercase=True).vocab
+
+ new_vocab.save(os.path.join(save_dir, 'vocab.json'))
+
+ # test input data
+ batch_size = 2
+ seq_length = 16
+ num_mask = 5
+ input_ids = np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length))
+ valid_length = np.random.randint(seq_length // 2, seq_length, (batch_size,))
+ input_mask = np.broadcast_to(np.arange(seq_length).reshape(1, -1), (batch_size, seq_length)) \
+ < np.expand_dims(valid_length, 1)
+ segment_ids = np.random.randint(0, 2, (batch_size, seq_length))
+ mlm_positions = np.random.randint(0, seq_length // 2, (batch_size, num_mask))
+ TF1_Hub_Modules = True
+ try:
+ tf_model = hub.Module(hub_model_dir, trainable=True)
+ # see https://www.tensorflow.org/hub/tf1_hub_module for details
+ logging.info('The model is loaded as the TF1 Hub Model')
+ tf_input_ids = tf.constant(input_ids, dtype=np.int32)
+ tf_input_mask = tf.constant(input_mask, dtype=np.int32)
+ tf_segment_ids = tf.constant(segment_ids, dtype=np.int32)
+ tf_mlm_positions = tf.constant(mlm_positions, dtype=np.int32)
+ tf_mlm_outputs = tf_model(
+ dict(input_ids=tf_input_ids,
+ input_mask=tf_input_mask,
+ segment_ids=tf_segment_ids,
+ mlm_positions=tf_mlm_positions), signature="mlm", as_dict=True)
+ tf_token_outputs = tf_model(
+ dict(input_ids=tf_input_ids,
+ input_mask=tf_input_mask,
+ segment_ids=tf_segment_ids), signature="tokens", as_dict=True)
+ with tf.Session() as sess:
+ sess.run(tf.global_variables_initializer())
+ tf_params = sess.run(tf_model.variable_map)
+ tf_token_outputs_np = sess.run(tf_token_outputs)
+ tf_mlm_outputs_np = sess.run(tf_mlm_outputs)
+ except RuntimeError as _:
+ logging.warning('The provided model directory is not valid for TF1 Hub Modules. '
+ 'Now try to load as TF2 SavedModels')
+ bert_layer = hub.KerasLayer(hub_model_dir, trainable=True)
+ # see https://www.tensorflow.org/hub/tf2_saved_model for details
+ logging.info('The model is loaded as the TF2 SavedModel')
+ TF1_Hub_Modules = False
+ input_word_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32,
+ name="input_word_ids")
+ input_word_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32,
+ name="input_mask")
+ segment_type_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32,
+ name="segment_ids")
+ pooled_output, sequence_output = bert_layer([input_word_ids, input_word_mask,
+ segment_type_ids])
+ tf_model = tf.keras.Model(
+ inputs=[input_word_ids, input_word_mask, segment_type_ids],
+ outputs=[pooled_output, sequence_output]
+ )
+ tf_params = {}
+ with tf.Session() as sess:
+ sess.run(tf.global_variables_initializer())
+ pooled_output, sequence_output = tf_model.predict([input_ids, input_mask, segment_ids])
+ tf_token_outputs_np = {'pooled_output': pooled_output,
+ 'sequence_output': sequence_output}
+            # The parameter names in the TF2 SavedModel end with ':0',
+            # like 'bert_model/word_embeddings/embeddings_2:0'
+ tf_params = {v.name.split(":")[0]: v.read_value() for v in tf_model.variables}
+ tf_params = sess.run(tf_params)
+
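+    # sanity check: the variables exposed by the TF1 Hub module should match the raw
+    # checkpoint stored alongside it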
+ if USE_TF_V1 and TF1_Hub_Modules:
+ tf_params_by_read = read_tf_checkpoint(
+ os.path.join(hub_model_dir, 'variables', 'variables'))
+ for k in tf_params:
+ assert_allclose(tf_params[k], tf_params_by_read[k])
+
+ # Get parameter names for Tensorflow with unused parameters filtered out.
+ tf_names = sorted(tf_params.keys())
+ tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names)
+ tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names)
+ tf_names = filter(lambda name: name != 'Variable', tf_names)
+ tf_names = filter(lambda name: name != 'global_step', tf_names)
+ tf_names = list(tf_names)
+
+ # Build gluon model and initialize
+ gluon_model = PretrainedModel.from_cfg(cfg, use_pooler=True)
+ gluon_model.initialize(ctx=ctx)
+ gluon_model.hybridize()
+ gluon_mlm_model = PretrainedMLMModel(backbone_cfg=cfg)
+ gluon_mlm_model.initialize(ctx=ctx)
+ gluon_mlm_model.hybridize()
+
+    # Prepare test data
+ mx_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
+ mx_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
+ mx_token_types = mx.np.array(segment_ids, dtype=np.int32, ctx=ctx)
+ mx_masked_positions = mx.np.array(mlm_positions, dtype=np.int32, ctx=ctx)
+
+    # Start converting the 'backbone' and 'mlm' models.
+    # Some TF2 SavedModels (e.g. bert whole word masking large) do not ship MLM parameters.
+    if any(['cls' in name for name in tf_names]):
+        has_mlm = True
+    else:
+        has_mlm = False
+        logging.info('There are no masked language model parameters in this pretrained model')
+ name_map = get_name_map(tf_names, is_TF1=TF1_Hub_Modules)
+ # go through the gluon model to infer the shape of parameters
+ if has_mlm:
+ model = gluon_mlm_model
+ contextual_embedding, pooled_output, mlm_scores = \
+ model(mx_input_ids, mx_token_types, mx_valid_length, mx_masked_positions)
+ else:
+ model = gluon_model
+ contextual_embedding, pooled_output = model(mx_input_ids, mx_token_types,
+ mx_valid_length)
+
+ # replace tensorflow parameter names with gluon parameter names
+ mx_params = model.collect_params()
+ all_keys = set(mx_params.keys())
+ for (src_name, dst_name) in name_map.items():
+ tf_param_val = tf_params[src_name]
+ if dst_name is None:
+ continue
+ all_keys.remove(dst_name)
+ if 'self_attention/attention_output/kernel' in src_name:
+ mx_params[dst_name].set_data(tf_param_val.reshape((cfg.MODEL.units, -1)).T)
+ continue
+ if src_name.endswith('kernel'):
+ mx_params[dst_name].set_data(tf_param_val.T)
+ else:
+ mx_params[dst_name].set_data(tf_param_val)
+
+ # Merge query/kernel, key/kernel, value/kernel to encoder.all_encoder_groups.0.attn_qkv.weight
+ def convert_qkv_weights(tf_prefix, mx_prefix, is_mlm):
+ """
+        Convert and merge the query/key/value weights, whose prefixes differ across models.
+
+ In tensorflow framework, the prefix of query/key/value for the albert model is
+ 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/kernel',
+ and that for the bert model is 'bert/encoder/layer_{}/attention/self/key/bias'.
+ In gluonnlp framework, the prefix is slightly different as
+ 'encoder.all_encoder_groups.0.attn_qkv.weight' for albert model and
+ 'encoder.all_layers.{}.attn_qkv.weight' for bert model, as the
+ curly braces {} can be filled with the layer number.
+ """
+ query_weight = tf_params[
+ '{}/query/kernel'.format(tf_prefix)]
+ key_weight = tf_params[
+ '{}/key/kernel'.format(tf_prefix)]
+ value_weight = tf_params[
+ '{}/value/kernel'.format(tf_prefix)]
+ query_bias = tf_params[
+ '{}/query/bias'.format(tf_prefix)]
+ key_bias = tf_params[
+ '{}/key/bias'.format(tf_prefix)]
+ value_bias = tf_params[
+ '{}/value/bias'.format(tf_prefix)]
+ if 'self_attention' in tf_prefix:
+ query_weight = query_weight.reshape((cfg.MODEL.units, -1))
+ key_weight = key_weight.reshape((cfg.MODEL.units, -1))
+ value_weight = value_weight.reshape((cfg.MODEL.units, -1))
+ query_bias = query_bias.reshape((-1,))
+ key_bias = key_bias.reshape((-1,))
+ value_bias = value_bias.reshape((-1,))
+ # Merge query_weight, key_weight, value_weight to mx_params
+ mx_weight_name = 'encoder.{}.attn_qkv.weight'.format(mx_prefix)
+ mx_bias_name = 'encoder.{}.attn_qkv.bias'.format(mx_prefix)
+ if is_mlm:
+ mx_weight_name = 'backbone_model.' + mx_weight_name
+ mx_bias_name = 'backbone_model.' + mx_bias_name
+ mx_params[mx_weight_name].set_data(
+ np.concatenate([query_weight, key_weight, value_weight], axis=1).T)
+ # Merge query_bias, key_bias, value_bias to mx_params
+ mx_params[mx_bias_name].set_data(
+ np.concatenate([query_bias, key_bias, value_bias], axis=0))
+
+ tf_prefix = None
+ if has_mlm:
+ all_keys.remove('mlm_decoder.3.weight')
+ if model_type == 'bert':
+ assert all(
+ [
+ re.match(
+ r'^(backbone_model\.){0,1}encoder\.all_layers\.[\d]+\.attn_qkv\.(weight|bias)$',
+ key) is not None for key in all_keys])
+ for layer_id in range(cfg.MODEL.num_layers):
+ mx_prefix = 'all_layers.{}'.format(layer_id)
+ if TF1_Hub_Modules:
+ tf_prefix = 'bert/encoder/layer_{}/attention/self'.format(layer_id)
+ else:
+ tf_prefix = 'transformer/layer_{}/self_attention'.format(layer_id)
+ convert_qkv_weights(tf_prefix, mx_prefix, has_mlm)
+ elif model_type == 'albert':
+ assert all(
+ [
+ re.match(
+ r'^(backbone_model\.){0,1}encoder\.all_encoder_groups\.0\.attn_qkv\.(weight|bias)$',
+ key) is not None for key in all_keys])
+ mx_prefix = 'all_encoder_groups.0'
+        assert TF1_Hub_Modules, 'Please download the albert model from TF1 Hub'
+ tf_prefix = 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self'
+ convert_qkv_weights(tf_prefix, mx_prefix, has_mlm)
+ else:
+ raise NotImplementedError
+
+ tolerance = 1E-2 if cfg.MODEL.num_layers == 24 else 1E-3
+    # The pooled_output of albert large has about 0.5% mismatch under a tolerance of 1E-2,
+    # so we fall back to a looser tolerance to pass the difference check
+    tolerance = 0.2 if 'albert_large' in args.tf_hub_model_path else tolerance
+
+ def check_backbone(tested_model, tf_token_outputs_np):
+ # test conversion results for backbone model
+ tf_contextual_embedding = tf_token_outputs_np['sequence_output']
+ tf_pooled_output = tf_token_outputs_np['pooled_output']
+ contextual_embedding, pooled_output = \
+ tested_model(mx_input_ids, mx_token_types, mx_valid_length)
+ assert_allclose(pooled_output.asnumpy(), tf_pooled_output, tolerance, tolerance)
+ for i in range(batch_size):
+ ele_valid_length = valid_length[i]
+ assert_allclose(contextual_embedding[i, :ele_valid_length, :].asnumpy(),
+ tf_contextual_embedding[i, :ele_valid_length, :], tolerance, tolerance)
+
+ if not has_mlm:
+ if test_conversion:
+ check_backbone(model, tf_token_outputs_np)
+ model.save_parameters(os.path.join(save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the backbone model in {} to {}/{}'.format(hub_model_dir,
+ save_dir, 'model.params'))
+ else:
+ # test conversion results for mlm model
+ # TODO(zheyuye), figure out how to check the mlm model from TF2 SavedModel
+ if test_conversion:
+ check_backbone(model.backbone_model, tf_mlm_outputs_np)
+ if TF1_Hub_Modules:
+ tf_contextual_embedding = tf_mlm_outputs_np['sequence_output']
+ tf_pooled_output = tf_mlm_outputs_np['pooled_output']
+ tf_mlm_scores = tf_mlm_outputs_np['mlm_logits'].reshape((batch_size, num_mask, -1))
+ contextual_embedding, pooled_output, mlm_scores = \
+ model(mx_input_ids, mx_token_types, mx_valid_length, mx_masked_positions)
+ assert_allclose(pooled_output.asnumpy(), tf_pooled_output, tolerance, tolerance)
+ assert_allclose(mlm_scores.asnumpy(), tf_mlm_scores, tolerance, tolerance)
+ for i in range(batch_size):
+ ele_valid_length = valid_length[i]
+ assert_allclose(contextual_embedding[i, :ele_valid_length, :].asnumpy(),
+ tf_contextual_embedding[i, :ele_valid_length, :], tolerance, tolerance)
+ model.backbone_model.save_parameters(os.path.join(
+ save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the backbone model in {} to {}/{}'.format(hub_model_dir,
+ save_dir, 'model.params'))
+ model.save_parameters(os.path.join(save_dir, 'model_mlm.params'), deduplicate=True)
+ logging.info('Convert the MLM model in {} to {}/{}'.format(hub_model_dir,
+ save_dir, 'model_mlm.params'))
+
+ # TODO(zheyuye) the gradient checking could be explored in further development
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
+
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(save_dir, old_name)
+ old_path = os.path.join(save_dir, old_name)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(save_dir, new_name, long_hash, file_size))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ save_dir = args.save_dir \
+ if args.save_dir is not None else os.path.basename(args.tf_hub_model_path) + '_gluon'
+ convert_tf_model(args.tf_hub_model_path, save_dir, args.test, args.model_type, args.gpu)
diff --git a/scripts/conversion_toolkits/convert_xlmr.sh b/scripts/conversion_toolkits/convert_xlmr.sh
new file mode 100644
index 0000000000..20fefff7a6
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_xlmr.sh
@@ -0,0 +1,8 @@
+python3 -m pip install git+https://github.com/pytorch/fairseq.git@master --upgrade --user
+for model in base large
+do
+ mkdir xlmr_${model}
+ wget "https://dl.fbaipublicfiles.com/fairseq/models/xlmr.${model}.tar.gz"
+ tar zxf xlmr.${model}.tar.gz --directory xlmr_${model}
+ python3 convert_fairseq_xlmr.py --fairseq_model_path xlmr_${model}/xlmr.${model} --model_size ${model} --test
+done
diff --git a/scripts/conversion_tools/compare_gluon_ernie.py b/scripts/conversion_tools/compare_gluon_ernie.py
deleted file mode 100644
index 6c6bd63e33..0000000000
--- a/scripts/conversion_tools/compare_gluon_ernie.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import gluonnlp as nlp
-import argparse
-import os
-import mxnet as mx
-import json
-
-parser = argparse.ArgumentParser(description='inference compare script for ernie model in gluon',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--input_file', type=str, default='input_cn.txt',
- help='sample input file for testing')
-parser.add_argument('--cased', action='store_true',
- help='if not set, inputs are converted to lower case')
-parser.add_argument('--gluon_dataset', type=str, default='baidu_ernie_uncased',
- help='gluon dataset name')
-parser.add_argument('--gluon_model', type=str, default='ernie_12_768_12',
- help='gluon model name')
-parser.add_argument('--gluon_parameter_file', type=str, default=None,
- help='gluon parameter file name.')
-parser.add_argument('--gluon_vocab_file', type=str, default=None,
- help='gluon vocab file corresponding to --gluon_parameter_file.')
-
-args = parser.parse_args()
-
-input_file = os.path.expanduser(args.input_file)
-do_lower_case = not args.cased
-max_length = 11
-if not args.gluon_dataset:
- with open(args.gluon_vocab_file) as f:
- vocab_str = json.load(f)
- vocab = nlp.vocab.BERTVocab.from_json(json.dumps(vocab_str))
-else:
- vocab = None
-bert, vocabulary = nlp.model.get_model(args.gluon_model,
- dataset_name=args.gluon_dataset,
- vocab=vocab,
- pretrained=not args.gluon_parameter_file,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False)
-if args.gluon_parameter_file:
- try:
- bert.cast('float16')
- bert.load_parameters(args.gluon_parameter_file, ignore_extra=True)
- bert.cast('float32')
- except AssertionError:
- bert.cast('float32')
- bert.load_parameters(args.gluon_parameter_file, ignore_extra=True)
-
-print(bert)
-tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=do_lower_case)
-dataset = nlp.data.TSVDataset(input_file, field_separator=nlp.data.Splitter('|||'))
-
-trans = nlp.data.BERTSentenceTransform(tokenizer, max_length)
-dataset = dataset.transform(trans)
-
-bert_dataloader = mx.gluon.data.DataLoader(dataset, batch_size=1,
- shuffle=True, last_batch='rollover')
-
-# verify the output of the first sample
-for i, seq in enumerate(bert_dataloader):
- input_ids, valid_length, type_ids = seq
- out = bert(input_ids, type_ids,
- valid_length.astype('float32'))
- length = valid_length.asscalar()
- gluon_np = out.asnumpy().squeeze(0)
- print(out)
- import numpy as np
- paddle_np = np.load(os.path.expanduser(
- 'ernie_top_layer_emb.npy'))
- np.testing.assert_array_almost_equal(paddle_np, gluon_np, decimal=6)
- break
-print("verify success")
diff --git a/scripts/conversion_tools/compare_tf_gluon_model.py b/scripts/conversion_tools/compare_tf_gluon_model.py
deleted file mode 100644
index 8895194b28..0000000000
--- a/scripts/conversion_tools/compare_tf_gluon_model.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Script for model comparison between TF and Gluon."""
-
-# pylint: disable=wrong-import-position, wrong-import-order, wildcard-import
-
-import sys
-import os
-import argparse
-import numpy as np
-import mxnet as mx
-import gluonnlp as nlp
-
-sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
-
-parser = argparse.ArgumentParser(description='Comparison script for BERT model in Tensorflow '
- 'and that in Gluon. This script works with '
- 'google/bert@f39e881b',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--input_file', type=str, default='input.txt',
- help='sample input file for testing')
-parser.add_argument('--tf_bert_repo_dir', type=str,
- default='~/bert/',
- help='path to the original Tensorflow bert repository. '
- 'The repo should be at f39e881b.')
-parser.add_argument('--tf_model_dir', type=str,
- default='~/uncased_L-12_H-768_A-12/',
- help='path to the original Tensorflow bert checkpoint directory.')
-parser.add_argument('--tf_model_prefix', type=str,
- default='bert_model.ckpt',
- help='name of bert checkpoint file.')
-parser.add_argument('--tf_config_name', type=str,
- default='bert_config.json',
- help='Name of Bert config file')
-parser.add_argument('--cased', action='store_true',
- help='if not set, inputs are converted to lower case')
-parser.add_argument('--gluon_dataset', type=str, default='book_corpus_wiki_en_uncased',
- help='gluon dataset name')
-parser.add_argument('--gluon_model', type=str, default='bert_12_768_12',
- help='gluon model name')
-parser.add_argument('--gluon_parameter_file', type=str, default=None,
- help='gluon parameter file name.')
-parser.add_argument('--gluon_vocab_file', type=str, default=None,
- help='gluon vocab file corresponding to --gluon_parameter_file.')
-
-args = parser.parse_args()
-
-input_file = os.path.expanduser(args.input_file)
-tf_bert_repo_dir = os.path.expanduser(args.tf_bert_repo_dir)
-tf_model_dir = os.path.expanduser(args.tf_model_dir)
-vocab_file = os.path.join(tf_model_dir, 'vocab.txt')
-bert_config_file = os.path.join(tf_model_dir, args.tf_config_name)
-init_checkpoint = os.path.join(tf_model_dir, args.tf_model_prefix)
-do_lower_case = not args.cased
-max_length = 128
-
-###############################################################################
-# Tensorflow MODEL #
-###############################################################################
-# import tensorflow modules
-sys.path.insert(0, tf_bert_repo_dir)
-
-# tensorflow model inference
-import modeling
-import tokenization
-from extract_features import *
-
-# data
-num_layers = int(args.gluon_model.split('_')[1])
-layer_indexes = list(range(num_layers))
-bert_config = modeling.BertConfig.from_json_file(bert_config_file)
-tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
-examples = read_examples(input_file)
-
-features = convert_examples_to_features(
- examples=examples, seq_length=max_length, tokenizer=tokenizer)
-
-is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
-run_config = tf.contrib.tpu.RunConfig(
- master=None,
- tpu_config=tf.contrib.tpu.TPUConfig(
- num_shards=1,
- per_host_input_for_training=is_per_host))
-# model
-model_fn = model_fn_builder(
- bert_config=bert_config,
- init_checkpoint=init_checkpoint,
- layer_indexes=layer_indexes,
- use_tpu=False,
- use_one_hot_embeddings=False)
-
-estimator = tf.contrib.tpu.TPUEstimator(
- use_tpu=False,
- model_fn=model_fn,
- config=run_config,
- predict_batch_size=1)
-
-input_fn = input_fn_builder(
- features=features, seq_length=max_length)
-
-tensorflow_all_out = []
-for result in estimator.predict(input_fn, yield_single_examples=True):
- output_json = collections.OrderedDict()
- tensorflow_all_out_features = []
- all_layers = []
- for (j, layer_index) in enumerate(layer_indexes):
- layer_output = result['layer_output_%d' % j]
- layers = collections.OrderedDict()
- layers['index'] = layer_index
- layers['values'] = layer_output
- all_layers.append(layers)
- tensorflow_out_features = collections.OrderedDict()
- tensorflow_out_features['layers'] = all_layers
- tensorflow_all_out_features.append(tensorflow_out_features)
-
- output_json['features'] = tensorflow_all_out_features
- tensorflow_all_out.append(output_json)
-
-tf_outputs = [tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes]
-
-###############################################################################
-# Gluon MODEL #
-###############################################################################
-
-if args.gluon_parameter_file:
- assert args.gluon_vocab_file, \
- 'Must specify --gluon_vocab_file when specifying --gluon_parameter_file'
- with open(args.gluon_vocab_file, 'r') as f:
- vocabulary = nlp.Vocab.from_json(f.read())
- bert, vocabulary = nlp.model.get_model(args.gluon_model,
- dataset_name=None,
- vocab=vocabulary,
- pretrained=not args.gluon_parameter_file,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False)
- try:
- bert.cast('float16')
- bert.load_parameters(args.gluon_parameter_file, ignore_extra=True)
- bert.cast('float32')
- except AssertionError:
- bert.cast('float32')
- bert.load_parameters(args.gluon_parameter_file, ignore_extra=True)
-else:
- assert not args.gluon_vocab_file, \
- 'Cannot specify --gluon_vocab_file without specifying --gluon_parameter_file'
- bert, vocabulary = nlp.model.get_model(args.gluon_model,
- dataset_name=args.gluon_dataset,
- pretrained=not args.gluon_parameter_file,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False)
-
-print(bert)
-tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=do_lower_case)
-dataset = nlp.data.TSVDataset(input_file, field_separator=nlp.data.Splitter(' ||| '))
-
-trans = nlp.data.BERTSentenceTransform(tokenizer, max_length)
-dataset = dataset.transform(trans)
-
-bert_dataloader = mx.gluon.data.DataLoader(dataset, batch_size=1,
- shuffle=True, last_batch='rollover')
-
-# verify the output of the first sample
-for i, seq in enumerate(bert_dataloader):
- input_ids, valid_length, type_ids = seq
- out = bert(input_ids, type_ids,
- valid_length.astype('float32'))
- length = valid_length.asscalar()
- a = tf_outputs[-1][:length]
- b = out[0][:length].asnumpy()
-
- print('stdev = %s' % (np.std(a - b)))
- mx.test_utils.assert_almost_equal(a, b, atol=5e-6, rtol=5e-6)
- break
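For reference, a typical invocation of the comparison script above would look roughly like the following sketch; the paths are placeholders and the flags mirror the script's own argument parser.

```bash
# Sketch only: paths are placeholders; flags and defaults come from the argparse block above.
python3 compare_tf_gluon_model.py \
    --input_file input.txt \
    --tf_bert_repo_dir ~/bert/ \
    --tf_model_dir ~/uncased_L-12_H-768_A-12/ \
    --gluon_model bert_12_768_12 \
    --gluon_dataset book_corpus_wiki_en_uncased
```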
diff --git a/scripts/conversion_tools/convert_fairseq_model.py b/scripts/conversion_tools/convert_fairseq_model.py
deleted file mode 100644
index 2dc97fcfa2..0000000000
--- a/scripts/conversion_tools/convert_fairseq_model.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-""" Script for converting Fairseq Roberta Model to Gluon. """
-import argparse
-import logging
-import os
-import sys
-import io
-import numpy as np
-
-import torch
-from fairseq.models.roberta import RobertaModel
-
-import mxnet as mx
-import gluonnlp as nlp
-from gluonnlp.model import BERTEncoder, BERTModel
-from gluonnlp.model.bert import bert_hparams
-from gluonnlp.data.utils import _load_pretrained_vocab
-
-from utils import get_hash, load_text_vocab, tf_vocab_to_gluon_vocab
-
-parser = argparse.ArgumentParser(description='Conversion script for Fairseq RoBERTa model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--ckpt_dir', type=str, help='Full path to the roberta folder',
- default='/home/ubuntu/roberta/roberta.base')
-parser.add_argument('--model', type=str, help='Model type. ',
- choices=['roberta_12_768_12', 'roberta_24_1024_16'],
- default='roberta_12_768_12')
-parser.add_argument('--verbose', action='store_true', help='Verbose logging')
-
-args = parser.parse_args()
-
-ckpt_dir = os.path.expanduser(args.ckpt_dir)
-
-ckpt = torch.load(os.path.join(ckpt_dir, 'model.pt'))
-pytorch_params = ckpt['model']
-
-if args.verbose:
- print(ckpt['args'])
- for k, v in pytorch_params.items():
- print(k, v.shape)
-
-# Load the model in fairseq
-roberta = RobertaModel.from_pretrained(ckpt_dir)
-roberta.eval()
-
-def fairseq_vocab_to_gluon_vocab(torch_vocab):
- index_to_words = [None] * len(torch_vocab)
-
- bos_idx = torch_vocab.bos()
- pad_idx = torch_vocab.pad()
- eos_idx = torch_vocab.eos()
- unk_idx = torch_vocab.unk()
-
- index_to_words[bos_idx] = torch_vocab.symbols[bos_idx]
- index_to_words[pad_idx] = torch_vocab.symbols[pad_idx]
- index_to_words[eos_idx] = torch_vocab.symbols[eos_idx]
- index_to_words[unk_idx] = torch_vocab.symbols[unk_idx]
-
- specials = [bos_idx, pad_idx, eos_idx, unk_idx]
-
- openai_to_roberta = {}
- openai_vocab = _load_pretrained_vocab('openai_webtext', '.')
-
- with io.open(os.path.join(ckpt_dir, 'dict.txt'), encoding='utf-8') as f:
- for i, line in enumerate(f):
- token, count = line.split(' ')
- try:
- fake_token = int(token)
- openai_to_roberta[token] = i + len(specials)
- except ValueError:
- index_to_words[i + len(specials)] = token
-
- for idx, token in enumerate(openai_vocab.idx_to_token):
- if str(idx) in openai_to_roberta:
- index_to_words[openai_to_roberta[str(idx)]] = token
- else:
- assert token == u'', token
-
-    mask_idx = torch_vocab.index(u'<mask>')
- index_to_words[mask_idx] = torch_vocab.string([mask_idx])
- assert None not in index_to_words
- word2idx = {}
- for idx, token in enumerate(index_to_words):
- word2idx[token] = idx
-
- vocab = nlp.vocab.Vocab(word2idx, token_to_idx=word2idx,
- unknown_token=index_to_words[unk_idx],
- padding_token=index_to_words[pad_idx],
- bos_token=index_to_words[bos_idx],
- eos_token=index_to_words[eos_idx],
-                            mask_token=u'<mask>')
- return vocab
-
-vocab = fairseq_vocab_to_gluon_vocab(roberta.task.dictionary)
-
-predefined_args = bert_hparams[args.model]
-
-# BERT encoder
-encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
- num_layers=predefined_args['num_layers'], units=predefined_args['units'],
- hidden_size=predefined_args['hidden_size'],
- max_length=predefined_args['max_length'],
- num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'],
- dropout=predefined_args['dropout'],
- use_residual=predefined_args['use_residual'],
- layer_norm_eps=predefined_args['layer_norm_eps'])
-
-# BERT model
-bert = BERTModel(encoder, len(vocab),
- units=predefined_args['units'], embed_size=predefined_args['embed_size'],
- word_embed=predefined_args['word_embed'], use_pooler=False,
- use_token_type_embed=False, use_classifier=False)
-
-bert.initialize(init=mx.init.Normal(0.02))
-
-ones = mx.nd.ones((2, 8))
-out = bert(ones, None, mx.nd.array([5, 6]), mx.nd.array([[1], [2]]))
-params = bert._collect_params_with_prefix()
-
-
-
-mapping = {
- 'decoder.2' : 'decoder.lm_head.layer_norm',
- 'decoder.0' : 'decoder.lm_head.dense',
- 'decoder.3' : 'decoder.lm_head',
- 'encoder.layer_norm' : 'decoder.sentence_encoder.emb_layer_norm',
- 'encoder.position_weight' : 'decoder.sentence_encoder.embed_positions.weight',
- 'encoder.transformer_cells': 'decoder.sentence_encoder.layers',
- 'attention_cell.proj_key.' : 'self_attn.in_proj_',
- 'attention_cell.proj_value.' : 'self_attn.in_proj_',
- 'attention_cell.proj_query.' : 'self_attn.in_proj_',
- 'ffn.ffn_1' : 'fc1',
- 'ffn.ffn_2' : 'fc2',
- 'layer_norm.gamma' : 'layer_norm.weight',
- 'layer_norm.beta' : 'layer_norm.bias',
- 'ffn.layer_norm' : 'final_layer_norm',
- 'word_embed.0.weight' : 'decoder.sentence_encoder.embed_tokens.weight',
-}
-
-for i in range(24):
- mapping['{}.layer_norm'.format(i)] = '{}.self_attn_layer_norm'.format(i)
- mapping['{}.proj'.format(i)] = '{}.self_attn.out_proj'.format(i)
-
-# set parameter data
-loaded_params = {}
-visited_pytorch_params = {}
-for name in params:
- pytorch_name = name
- for source, dest in mapping.items():
- pytorch_name = pytorch_name.replace(source, dest)
-
- assert pytorch_name in pytorch_params.keys(), 'Key ' + pytorch_name + ' for ' + name + ' not found.'
- torch_arr = pytorch_params[pytorch_name].cpu()
- # fairseq positional embedding starts with index 2
- if pytorch_name == 'decoder.sentence_encoder.embed_positions.weight':
- torch_arr = torch_arr[2:]
-
- arr = mx.nd.array(torch_arr)
- if 'attention_cell.proj' in name:
- unfused = ['query', 'key', 'value']
- arrs = arr.split(num_outputs=3, axis=0)
- for i, p in enumerate(unfused):
- if p in name:
- arr = arrs[i]
- else:
- assert arr.shape == params[name].shape, (arr.shape, params[name].shape, name, pytorch_name)
- params[name].set_data(arr)
- loaded_params[name] = True
- visited_pytorch_params[pytorch_name] = True
-
-assert len(params) == len(loaded_params)
-assert len(visited_pytorch_params) == len(pytorch_params), "Gluon model does not match PyTorch model. " \
-    "Please fix the BERTModel hyperparameters\n" + str(len(visited_pytorch_params)) + ' vs. ' + str(len(pytorch_params))
-
-
-texts = 'Hello world. abc, def and 中文!'
-torch_tokens = roberta.encode(texts)
-
-torch_features = roberta.extract_features(torch_tokens)
-pytorch_out = torch_features.detach().numpy()
-
-mx_tokenizer = nlp.data.GPT2BPETokenizer()
-mx_tokens = [vocab.bos_token] + mx_tokenizer(texts) + [vocab.eos_token]
-mx_data = vocab[mx_tokens]
-print(mx_tokens)
-print(vocab[mx_tokens])
-print(torch_tokens)
-assert mx_data == torch_tokens.tolist()
-
-mx_out = bert(mx.nd.array([mx_data]))
-print('stdev = ', np.std(mx_out.asnumpy() - pytorch_out))
-mx.test_utils.assert_almost_equal(mx_out.asnumpy(), pytorch_out, atol=1e-3, rtol=1e-3)
-mx.test_utils.assert_almost_equal(mx_out.asnumpy(), pytorch_out, atol=5e-6, rtol=5e-6)
-
-bert.save_parameters(os.path.join(ckpt_dir, args.model + '.params'))
-with io.open(os.path.join(ckpt_dir, args.model + '.vocab'), 'w', encoding='utf-8') as f:
- f.write(vocab.to_json())
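The least obvious step in the converter above is the self-attention mapping: fairseq stores the query/key/value projections as a single fused `in_proj_` matrix, while the Gluon model keeps separate `proj_query`/`proj_key`/`proj_value` parameters, so the converter slices one third of the fused array for each. A minimal standalone sketch of that split, assuming a hidden size of 768 (`roberta_12_768_12`) and using a random array purely for illustration:

```python
import numpy as np

units = 768  # hidden size of roberta_12_768_12 (illustrative)
# Fused projection as stored by fairseq: rows [0, units) -> query,
# [units, 2*units) -> key, [2*units, 3*units) -> value.
in_proj_weight = np.random.randn(3 * units, units).astype('float32')
q_w, k_w, v_w = np.split(in_proj_weight, 3, axis=0)
assert q_w.shape == k_w.shape == v_w.shape == (units, units)
```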
diff --git a/scripts/conversion_tools/convert_paddle_to_gluon.py b/scripts/conversion_tools/convert_paddle_to_gluon.py
deleted file mode 100644
index b5f71c2be9..0000000000
--- a/scripts/conversion_tools/convert_paddle_to_gluon.py
+++ /dev/null
@@ -1,254 +0,0 @@
-#!/usr/bin/env python
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-import collections
-import os
-import sys
-import numpy as np
-import argparse
-import logging
-import json
-import mxnet as mx
-import gluonnlp as nlp
-import paddle.fluid as fluid
-
-from gluonnlp.model import BERTEncoder, BERTModel
-from gluonnlp.model.bert import bert_hparams
-from utils import get_hash, tf_vocab_to_gluon_vocab, load_text_vocab
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--gluon_bert_model_base", default='ernie_12_768_12', type=str, help=".")
-parser.add_argument("--init_pretraining_params", default='./ERNIE_stable-1.0.1/params',
- type=str, help=".")
-parser.add_argument("--ernie_config_path", default='./ERNIE_stable-1.0.1/ernie_config.json',
- type=str, help=".")
-parser.add_argument("--ernie_vocab_path", default='./ERNIE_stable-1.0.1/vocab.txt',
- type=str, help=".")
-parser.add_argument("--out_dir", default='./ernie_gluon_model2', type=str, help=".")
-parser.add_argument("--baidu_lark_repo_dir", default='../../../../LARK', type=str,
- help='path to the original baidu lark repository. '
- 'The repo should be at f97e3c8581e36dc1979560d62f75df862acd9585.'
- '(https://github.com/PaddlePaddle/LARK.git)')
-args = parser.parse_args()
-
-sys.path = [os.path.join(args.baidu_lark_repo_dir, 'ERNIE')] + sys.path
-try:
-    from model.ernie import ErnieConfig
-    from finetune.classifier import create_model
-except ImportError:
-    raise ImportError('Please clone the ERNIE (LARK) repository first, see --baidu_lark_repo_dir')
-
-def if_exist(var):
- return os.path.exists(os.path.join(args.init_pretraining_params, var.name))
-
-
-def build_weight_map():
- weight_map = collections.OrderedDict({
- 'word_embedding': 'word_embed.0.weight',
- 'pos_embedding': 'encoder.position_weight',
- 'sent_embedding': 'token_type_embed.0.weight',
- 'pre_encoder_layer_norm_scale': 'encoder.layer_norm.gamma',
- 'pre_encoder_layer_norm_bias': 'encoder.layer_norm.beta',
- })
-
- def add_w_and_b(ernie_pre, gluon_pre):
- weight_map[ernie_pre + ".w_0"] = gluon_pre + ".weight"
- weight_map[ernie_pre + ".b_0"] = gluon_pre + ".bias"
-
- def add_one_encoder_layer(layer_number):
- # attention
- add_w_and_b("encoder_layer_{}_multi_head_att_query_fc".format(layer_number),
- "encoder.transformer_cells.{}.attention_cell.proj_query".format(layer_number))
- add_w_and_b("encoder_layer_{}_multi_head_att_key_fc".format(layer_number),
- "encoder.transformer_cells.{}.attention_cell.proj_key".format(layer_number))
- add_w_and_b("encoder_layer_{}_multi_head_att_value_fc".format(layer_number),
- "encoder.transformer_cells.{}.attention_cell.proj_value".format(layer_number))
- add_w_and_b("encoder_layer_{}_multi_head_att_output_fc".format(layer_number),
- "encoder.transformer_cells.{}.proj".format(layer_number))
- weight_map["encoder_layer_{}_post_att_layer_norm_bias".format(layer_number)] = \
- "encoder.transformer_cells.{}.layer_norm.beta".format(layer_number)
- weight_map["encoder_layer_{}_post_att_layer_norm_scale".format(layer_number)] = \
- "encoder.transformer_cells.{}.layer_norm.gamma".format(layer_number)
- # intermediate
- add_w_and_b("encoder_layer_{}_ffn_fc_0".format(layer_number),
- "encoder.transformer_cells.{}.ffn.ffn_1".format(layer_number))
- # output
- add_w_and_b("encoder_layer_{}_ffn_fc_1".format(layer_number),
- "encoder.transformer_cells.{}.ffn.ffn_2".format(layer_number))
- weight_map["encoder_layer_{}_post_ffn_layer_norm_bias".format(layer_number)] = \
- "encoder.transformer_cells.{}.ffn.layer_norm.beta".format(layer_number)
- weight_map["encoder_layer_{}_post_ffn_layer_norm_scale".format(layer_number)] = \
- "encoder.transformer_cells.{}.ffn.layer_norm.gamma".format(layer_number)
-
- for i in range(12):
- add_one_encoder_layer(i)
- add_w_and_b('pooled_fc', 'pooler')
- return weight_map
-
-
-def extract_weights(args):
- # add ERNIE to environment
- print('extract weights start'.center(60, '='))
- startup_prog = fluid.Program()
- test_prog = fluid.Program()
- place = fluid.CPUPlace()
- exe = fluid.Executor(place)
- exe.run(startup_prog)
- args.max_seq_len = 512
- args.use_fp16 = False
- args.num_labels = 2
- args.loss_scaling = 1.0
- print('model config:')
- ernie_config = ErnieConfig(args.ernie_config_path)
- ernie_config.print_config()
- with fluid.program_guard(test_prog, startup_prog):
- with fluid.unique_name.guard():
- _, _ = create_model(
- args,
- pyreader_name='train',
- ernie_config=ernie_config)
- fluid.io.load_vars(exe, args.init_pretraining_params, main_program=test_prog, predicate=if_exist)
- state_dict = collections.OrderedDict()
- weight_map = build_weight_map()
- for ernie_name, gluon_name in weight_map.items():
- fluid_tensor = fluid.global_scope().find_var(ernie_name).get_tensor()
- fluid_array = np.array(fluid_tensor, dtype=np.float32)
- if 'w_0' in ernie_name:
- fluid_array = fluid_array.transpose()
- state_dict[gluon_name] = fluid_array
- print('{} -> {} {}'.format(ernie_name, gluon_name, fluid_array.shape))
- print('extract weights done!'.center(60, '='))
- return state_dict
-
-
-def save_model(new_gluon_parameters, output_dir):
- print('save model start'.center(60, '='))
- if not os.path.exists(output_dir):
- os.makedirs(output_dir)
- # save model
- # load vocab
- vocab_f = open(os.path.join(output_dir, "vocab.txt"), "wt", encoding='utf-8')
- with open(args.ernie_vocab_path, "rt", encoding='utf-8') as f:
- for line in f:
- data = line.strip().split("\t")
- vocab_f.writelines(data[0] + "\n")
- vocab_f.close()
- vocab = tf_vocab_to_gluon_vocab(load_text_vocab(os.path.join(output_dir, "vocab.txt")))
- # vocab serialization
- tmp_file_path = os.path.expanduser(os.path.join(output_dir, 'tmp'))
- if not os.path.exists(os.path.join(args.out_dir)):
- os.makedirs(os.path.join(args.out_dir))
- with open(tmp_file_path, 'w') as f:
- f.write(vocab.to_json())
- hash_full, hash_short = get_hash(tmp_file_path)
- gluon_vocab_path = os.path.expanduser(os.path.join(output_dir, hash_short + '.vocab'))
- with open(gluon_vocab_path, 'w') as f:
- f.write(vocab.to_json())
- logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full)
-
- # BERT config
- tf_config_names_to_gluon_config_names = {
- 'attention_probs_dropout_prob': 'dropout',
- 'hidden_act': None,
- 'hidden_dropout_prob': 'dropout',
- 'hidden_size': 'units',
- 'initializer_range': None,
- # 'intermediate_size': 'hidden_size',
- 'max_position_embeddings': 'max_length',
- 'num_attention_heads': 'num_heads',
- 'num_hidden_layers': 'num_layers',
- 'type_vocab_size': 'token_type_vocab_size',
- 'vocab_size': None
- }
- predefined_args = bert_hparams[args.gluon_bert_model_base]
- with open(args.ernie_config_path, 'r') as f:
- tf_config = json.load(f)
- if 'layer_norm_eps' in tf_config: # ignore layer_norm_eps
- del tf_config['layer_norm_eps']
- assert len(tf_config) == len(tf_config_names_to_gluon_config_names)
- for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items():
- if tf_name is None or gluon_name is None:
- continue
- if gluon_name != 'max_length':
- assert tf_config[tf_name] == predefined_args[gluon_name]
-
- encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
- num_layers=predefined_args['num_layers'], units=predefined_args['units'],
- hidden_size=predefined_args['hidden_size'],
- max_length=predefined_args['max_length'],
- num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'],
- dropout=predefined_args['dropout'],
- use_residual=predefined_args['use_residual'],
- activation='relu')
-
- bert = BERTModel(encoder, len(vocab),
- token_type_vocab_size=predefined_args['token_type_vocab_size'],
- units=predefined_args['units'], embed_size=predefined_args['embed_size'],
- word_embed=predefined_args['word_embed'], use_pooler=True,
- use_decoder=False, use_classifier=False)
-
- bert.initialize(init=mx.init.Normal(0.02))
-
- ones = mx.nd.ones((2, 8))
- out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]]))
- params = bert._collect_params_with_prefix()
- assert len(params) == len(new_gluon_parameters), "Gluon model does not match paddle model. " \
- "Please fix the BERTModel hyperparameters"
-
- # post processings for parameters:
- # - handle tied decoder weight
- new_gluon_parameters['decoder.3.weight'] = new_gluon_parameters['word_embed.0.weight']
- # set parameter data
- loaded_params = {}
- for name in params:
- if name == 'word_embed.0.weight':
- arr = mx.nd.array(new_gluon_parameters[name][:params[name].shape[0]])
- else:
- arr = mx.nd.array(new_gluon_parameters[name])
-        assert arr.shape == params[name].shape, \
-            'Shape mismatch for {}: {} vs {}'.format(name, arr.shape, params[name].shape)
- params[name].set_data(arr)
- loaded_params[name] = True
-
- # post processings for parameters:
- # - handle tied decoder weight
- # - update word embedding for reserved tokens
-
- if len(params) != len(loaded_params):
- raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, '
- 'but {} have been extracted from the paddle model. '.format(
- len(params), len(loaded_params)))
-
- # param serialization
- bert.save_parameters(tmp_file_path)
- hash_full, hash_short = get_hash(tmp_file_path)
- gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params'))
- logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full)
- bert.save_parameters(gluon_param_path)
- mx.nd.waitall()
-    print('finished saving vocab and parameters')
- print('save model done!'.center(60, '='))
-
-
-if __name__ == "__main__":
- state_dict = extract_weights(args)
- save_model(state_dict, args.out_dir)
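As a rough usage sketch for the ERNIE converter above: the paths mirror the script's own defaults and are placeholders for your local layout, and the LARK repository is the one referenced in the `--baidu_lark_repo_dir` help.

```bash
# Sketch only: values mirror the script's argparse defaults; adjust paths to your checkout.
git clone https://github.com/PaddlePaddle/LARK.git ../../../../LARK
python convert_paddle_to_gluon.py \
    --init_pretraining_params ./ERNIE_stable-1.0.1/params \
    --ernie_config_path ./ERNIE_stable-1.0.1/ernie_config.json \
    --ernie_vocab_path ./ERNIE_stable-1.0.1/vocab.txt \
    --out_dir ./ernie_gluon_model2 \
    --baidu_lark_repo_dir ../../../../LARK
```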
diff --git a/scripts/conversion_tools/convert_pytorch_model.py b/scripts/conversion_tools/convert_pytorch_model.py
deleted file mode 100644
index 26f0f4a06f..0000000000
--- a/scripts/conversion_tools/convert_pytorch_model.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-""" Script for converting PyTorch Model to Gluon. """
-
-import argparse
-import json
-import logging
-import os
-import sys
-
-import mxnet as mx
-import gluonnlp as nlp
-import torch
-from gluonnlp.model import BERTEncoder, BERTModel
-from gluonnlp.model.bert import bert_hparams
-
-sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
-from utils import get_hash, load_text_vocab, tf_vocab_to_gluon_vocab
-
-parser = argparse.ArgumentParser(description='Conversion script for PyTorch BERT model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--model', type=str, default='bert_12_768_12',
- choices=['bert_12_768_12', 'bert_24_1024_16'], help='BERT model name')
-parser.add_argument('--pytorch_checkpoint_dir', type=str,
-                    help='Path to the PyTorch checkpoint folder.')
-parser.add_argument('--vocab_file', type=str, help='Full path to the vocab.txt')
-parser.add_argument('--gluon_pytorch_name_mapping', type=str,
- default='gluon_to_pytorch_naming.json',
- help='Output of infer_pytorch_gluon_parameter_name_mapping.py')
-parser.add_argument('--out_dir', type=str, default=os.path.join('~', 'output'),
- help='Path to output folder. The folder must exist.')
-parser.add_argument('--debug', action='store_true', help='debugging mode')
-args = parser.parse_args()
-logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
-logging.info(args)
-
-# convert vocabulary
-vocab = tf_vocab_to_gluon_vocab(load_text_vocab(args.vocab_file))
-
-# vocab serialization
-tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp'))
-with open(tmp_file_path, 'w') as f:
- f.write(vocab.to_json())
-hash_full, hash_short = get_hash(tmp_file_path)
-gluon_vocab_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.vocab'))
-with open(gluon_vocab_path, 'w') as f:
- f.write(vocab.to_json())
- logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full)
-
-# Load PyTorch Model
-pytorch_parameters = torch.load(os.path.join(args.pytorch_checkpoint_dir, 'pytorch_model.bin'),
- map_location=lambda storage, loc: storage)
-pytorch_parameters = {k: v.numpy() for k, v in pytorch_parameters.items()}
-
-# Make sure vocab fits to model
-assert pytorch_parameters['bert.embeddings.word_embeddings.weight'].shape[0] == len(
- vocab.idx_to_token)
-
-# Load Mapping
-with open(args.gluon_pytorch_name_mapping, 'r') as f:
- mapping = json.load(f)
-
-# BERT config
-tf_config_names_to_gluon_config_names = {
- 'attention_probs_dropout_prob': 'dropout',
- 'hidden_act': None,
- 'hidden_dropout_prob': 'dropout',
- 'hidden_size': 'units',
- 'initializer_range': None,
- 'intermediate_size': 'hidden_size',
- 'max_position_embeddings': 'max_length',
- 'num_attention_heads': 'num_heads',
- 'num_hidden_layers': 'num_layers',
- 'type_vocab_size': 'token_type_vocab_size',
- 'vocab_size': None
-}
-predefined_args = bert_hparams[args.model]
-with open(os.path.join(args.pytorch_checkpoint_dir, 'bert_config.json'), 'r') as f:
- tf_config = json.load(f)
- assert len(tf_config) == len(tf_config_names_to_gluon_config_names)
- for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items():
- if tf_name is None or gluon_name is None:
- continue
- assert tf_config[tf_name] == predefined_args[gluon_name]
-
-# BERT encoder
-encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
- num_layers=predefined_args['num_layers'], units=predefined_args['units'],
- hidden_size=predefined_args['hidden_size'],
- max_length=predefined_args['max_length'],
- num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'],
- dropout=predefined_args['dropout'],
- use_residual=predefined_args['use_residual'])
-
-# Infer enabled BERTModel components
-use_pooler = any('pooler' in n for n in pytorch_parameters)
-use_decoder = any('cls.predictions.transform.dense.weight' in n for n in pytorch_parameters)
-use_classifier = any('cls.seq_relationship.weight' in n for n in pytorch_parameters)
-
-if not use_classifier and 'classifier.weight' in pytorch_parameters and \
- pytorch_parameters['classifier.weight'].shape[0] == 2:
- logging.info('Assuming classifier weights in provided Pytorch model are '
- 'from next sentence prediction task.')
- use_classifier = True
-
-logging.info('Inferred that the pytorch model provides the following parameters:')
-logging.info('- use_pooler = {}'.format(use_pooler))
-logging.info('- use_decoder = {}'.format(use_decoder))
-logging.info('- use_classifier = {}'.format(use_classifier))
-
-# BERT model
-bert = BERTModel(encoder, len(vocab),
- token_type_vocab_size=predefined_args['token_type_vocab_size'],
- units=predefined_args['units'], embed_size=predefined_args['embed_size'],
- word_embed=predefined_args['word_embed'], use_pooler=use_pooler,
- use_decoder=use_decoder, use_classifier=use_classifier)
-
-bert.initialize(init=mx.init.Normal(0.02))
-
-ones = mx.nd.ones((2, 8))
-out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]]))
-params = bert._collect_params_with_prefix()
-assert len(params) == len(pytorch_parameters), "Gluon model does not match PyTorch model. " \
- "Please fix the BERTModel hyperparameters"
-
-# set parameter data
-loaded_params = {}
-for name in params:
- if name not in mapping:
- raise RuntimeError('Invalid json mapping file. '
- 'The parameter {} is not described in the mapping file.'.format(name))
- pytorch_name = mapping[name]
- if pytorch_name not in pytorch_parameters.keys():
- # Handle inconsistent naming in PyTorch
-        # The expected names here are based on the PyTorch version of SciBERT;
-        # the inconsistencies were found in ClinicalBERT.
- if 'LayerNorm' in pytorch_name:
- pytorch_name = pytorch_name.replace('weight', 'gamma')
- pytorch_name = pytorch_name.replace('bias', 'beta')
- assert pytorch_name in pytorch_parameters.keys()
-
- if 'cls.seq_relationship' in pytorch_name:
- pytorch_name = pytorch_name.replace('cls.seq_relationship', 'classifier')
-
- arr = mx.nd.array(pytorch_parameters[pytorch_name])
-
- assert arr.shape == params[name].shape
- params[name].set_data(arr)
- loaded_params[name] = True
-
-if len(params) != len(loaded_params):
- raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, '
- 'but {} have been extracted from the pytorch model. '.format(
- len(params), len(loaded_params)))
-
-# param serialization
-bert.save_parameters(tmp_file_path)
-hash_full, hash_short = get_hash(tmp_file_path)
-gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params'))
-logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full)
-bert.save_parameters(gluon_param_path)
-mx.nd.waitall()
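The converter above depends entirely on the JSON name mapping produced by infer_pytorch_gluon_parameter_name_mapping.py (further below). Once loaded, the mapping is a flat dict from Gluon parameter names to PyTorch ones; the two entries shown here are illustrative examples of the naming conventions on each side, not an exhaustive or authoritative list.

```python
# Illustrative excerpt of the kind of entries gluon_to_pytorch_naming.json contains.
mapping = {
    'word_embed.0.weight': 'bert.embeddings.word_embeddings.weight',
    'encoder.transformer_cells.0.ffn.ffn_1.weight': 'bert.encoder.layer.0.intermediate.dense.weight',
}
```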
diff --git a/scripts/conversion_tools/convert_pytorch_transformers.py b/scripts/conversion_tools/convert_pytorch_transformers.py
deleted file mode 100644
index 7dad51244c..0000000000
--- a/scripts/conversion_tools/convert_pytorch_transformers.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# coding: utf-8
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-""" Script for converting the distilbert model from pytorch-transformer to Gluon.
-
-Usage:
-
-pip3 install pytorch-transformers
-
-python3 convert_pytorch_transformers.py
-
-If you are not converting the distilbert model, please change the code section noted
-by "TODO".
-
- """
-
-import argparse
-import pytorch_transformers
-import torch
-import mxnet as mx
-import gluonnlp as nlp
-import os, logging, json
-from utils import get_hash, load_text_vocab, tf_vocab_to_gluon_vocab
-
-parser = argparse.ArgumentParser(description='Conversion script for pytorch-transformers '
-                                             'DistilBERT model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--out_dir', type=str, help='Full path to the output folder',
- default='./converted-model')
-
-args = parser.parse_args()
-
-
-####################################################################
-# LOAD A BERT MODEL FROM PYTORCH #
-####################################################################
-# TODO: change this to your bert model and tokenizer used in pytorch-transformer
-tokenizer = pytorch_transformers.tokenization_distilbert.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = pytorch_transformers.DistilBertModel.from_pretrained('distilbert-base-uncased')
-
-dir_name = './temp'
-gluon_dir_name = args.out_dir
-nlp.utils.mkdir(dir_name)
-nlp.utils.mkdir(gluon_dir_name)
-model_name = 'bert_12_768_12'
-model.save_pretrained(dir_name)
-tokenizer.save_pretrained(dir_name)
-
-####################################################################
-# SHOW PYTORCH PARAMETER LIST #
-####################################################################
-pytorch_parameters = torch.load(os.path.join(dir_name, 'pytorch_model.bin'))
-print('parameters in pytorch')
-print(sorted(list(pytorch_parameters)))
-
-####################################################################
-# CONVERT VOCAB #
-####################################################################
-# convert vocabulary
-vocab = tf_vocab_to_gluon_vocab(load_text_vocab(os.path.join(dir_name, 'vocab.txt')))
-# vocab serialization
-tmp_file_path = os.path.expanduser(os.path.join(gluon_dir_name, 'temp'))
-with open(tmp_file_path, 'w') as f:
- f.write(vocab.to_json())
-
-hash_full, hash_short = get_hash(tmp_file_path)
-gluon_vocab_path = os.path.expanduser(os.path.join(gluon_dir_name, hash_short + '.vocab'))
-with open(gluon_vocab_path, 'w') as f:
- f.write(vocab.to_json())
- print('vocab file saved to {}. hash = {}'.format(gluon_vocab_path, hash_full))
-
-####################################################################
-# CONVERT PARAMS OPTIONS #
-####################################################################
-torch_to_gluon_config_names = {
- "attention_dropout": 'dropout',
- "dim": 'embed_size',
- "dropout": 'dropout',
- "hidden_dim": 'hidden_size',
- "max_position_embeddings": 'max_length',
- "n_heads": 'num_heads',
- "n_layers": 'num_layers',
- "output_attentions": 'output_attention',
- "output_hidden_states": 'output_all_encodings',
- "vocab_size": 'vocab_size',
-}
-
-predefined_args = nlp.model.bert.bert_hparams[model_name]
-
-with open(os.path.join(dir_name, 'config.json'), 'r') as f:
- torch_config = json.load(f)
- for name, value in torch_config.items():
- if name in torch_to_gluon_config_names:
- predefined_args[torch_to_gluon_config_names[name]] = value
-
-# BERT encoder
-encoder = nlp.model.BERTEncoder(attention_cell=predefined_args['attention_cell'],
- num_layers=predefined_args['num_layers'], units=predefined_args['units'],
- hidden_size=predefined_args['hidden_size'],
- max_length=predefined_args['max_length'],
- num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'],
- dropout=predefined_args['dropout'],
- use_residual=predefined_args['use_residual'])
-
-# BERT model
-bert = nlp.model.BERTModel(encoder, len(vocab),
- units=predefined_args['units'], embed_size=predefined_args['embed_size'],
- embed_dropout=predefined_args['embed_dropout'],
- word_embed=predefined_args['word_embed'], use_pooler=False,
- # TODO: for some models, we may need to change the value for use_token_type_embed,
- # use_classifier, and use_decoder
- use_token_type_embed=False,
- token_type_vocab_size=predefined_args['token_type_vocab_size'],
- use_classifier=False, use_decoder=False)
-
-bert.initialize(init=mx.init.Normal(0.02))
-
-ones = mx.nd.ones((2, 8))
-out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]]))
-params = bert._collect_params_with_prefix()
-print('parameters in gluon')
-print(sorted(list(params.keys())))
-assert len(params) == len(pytorch_parameters), ("Gluon model does not match PyTorch model. " \
- "Please fix the BERTModel hyperparameters", len(params), len(pytorch_parameters))
-
-####################################################################
-# CONVERT PARAMS VALUES #
-####################################################################
-mapping = {
-'encoder.layer_norm.beta': 'embeddings.LayerNorm.bias',
-'encoder.layer_norm.gamma': 'embeddings.LayerNorm.weight',
-'encoder.position_weight': 'embeddings.position_embeddings.weight',
-'word_embed.0.weight': 'embeddings.word_embeddings.weight',
-'encoder.transformer_cells': 'transformer.layer',
-'attention_cell': 'attention',
-'.proj.': '.attention.out_lin.',
-'proj_key':'k_lin',
-'proj_query':'q_lin',
-'proj_value':'v_lin',
-'ffn_1':'lin1',
-'ffn_2':'lin2',
-'ffn.layer_norm.beta':'output_layer_norm.bias',
-'ffn.layer_norm.gamma':'output_layer_norm.weight',
-}
-secondary_map = {'layer_norm.beta':'sa_layer_norm.bias',
- 'layer_norm.gamma':'sa_layer_norm.weight'
-}
-
-# set parameter data
-loaded_params = {}
-for name in params:
- pytorch_name = name
- for k, v in mapping.items():
- pytorch_name = pytorch_name.replace(k, v)
- for k, v in secondary_map.items():
- pytorch_name = pytorch_name.replace(k, v)
- arr = mx.nd.array(pytorch_parameters[pytorch_name])
- assert arr.shape == params[name].shape
- params[name].set_data(arr)
- loaded_params[name] = True
-
-if len(params) != len(loaded_params):
- raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, '
- 'but {} have been extracted from the pytorch model. '.format(
- len(params), len(loaded_params)))
-
-####################################################################
-# SAVE CONVERTED PARAMS #
-####################################################################
-# param serialization
-param_path = os.path.join(gluon_dir_name, 'net.params')
-bert.save_parameters(param_path)
-hash_full, hash_short = get_hash(param_path)
-print('param saved to {}. hash = {}'.format(param_path, hash_full))
-
-
-####################################################################
-# COMPARE OUTPUTS #
-####################################################################
-text = 'Hello, my dog is cute'
-# TODO: use nlp.data.GPT2Tokenizer if the GPT2 tokenizer in pytorch is used
-gluon_tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)
-transform = nlp.data.BERTSentenceTransform(gluon_tokenizer, max_seq_length=512, pair=False, pad=False)
-sample = transform([text])
-words, valid_len, _ = mx.nd.array([sample[0]]), mx.nd.array([sample[1]]), mx.nd.array([sample[2]])
-# TODO: for some tokenizers, no need to truncate words
-words = words[:, 1:-1]
-seq_encoding = bert(words, None)
-print('\nconverted vocab:')
-print(vocab)
-
-print('\ntesting sample:')
-print(sample)
-print('\ngluon output: ', seq_encoding)
-
-input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
-outputs = model(input_ids)
-last_hidden_states = outputs[0]
-print('\npytorch output: ')
-print(last_hidden_states)
-
-mx.nd.waitall()
-mx.test_utils.assert_almost_equal(seq_encoding.asnumpy(), last_hidden_states.detach().numpy(), atol=1e-3, rtol=1e-3)
-mx.test_utils.assert_almost_equal(seq_encoding.asnumpy(), last_hidden_states.detach().numpy(), atol=1e-5, rtol=1e-5)
-print('\nCongrats! The result is the same. Assertion passed.')
diff --git a/scripts/conversion_tools/convert_tf_model.py b/scripts/conversion_tools/convert_tf_model.py
deleted file mode 100644
index 09599dc85e..0000000000
--- a/scripts/conversion_tools/convert_tf_model.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-""" Script for converting TF Model to Gluon. """
-
-import argparse
-import json
-import logging
-import os
-import sys
-
-import mxnet as mx
-import gluonnlp as nlp
-from gluonnlp.model import BERTEncoder, BERTModel
-from gluonnlp.model.bert import bert_hparams
-
-sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
-
-from utils import (get_hash, load_text_vocab, read_tf_checkpoint,
- tf_vocab_to_gluon_vocab)
-
-
-parser = argparse.ArgumentParser(
- description='Conversion script for Tensorflow BERT model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--model',
- type=str,
- default='bert_12_768_12',
- choices=['bert_12_768_12', 'bert_24_1024_16'],
- help='BERT model name')
-parser.add_argument('--tf_checkpoint_dir',
- type=str,
- help='Path to Tensorflow checkpoint folder.')
-parser.add_argument('--tf_model_prefix', type=str,
- default='bert_model.ckpt',
- help='name of bert checkpoint file.')
-parser.add_argument('--tf_config_name', type=str,
- default='bert_config.json',
- help='Name of Bert config file')
-parser.add_argument('--out_dir',
- type=str,
- default=os.path.join('~', 'output'),
- help='Path to output folder.')
-parser.add_argument('--debug', action='store_true', help='debugging mode')
-args = parser.parse_args()
-logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
-logging.info(args)
-
-# convert vocabulary
-vocab_path = os.path.join(args.tf_checkpoint_dir, 'vocab.txt')
-vocab = tf_vocab_to_gluon_vocab(load_text_vocab(vocab_path))
-
-# vocab serialization
-out_dir = os.path.expanduser(args.out_dir)
-nlp.utils.mkdir(out_dir)
-tmp_file_path = os.path.join(out_dir, 'tmp')
-with open(tmp_file_path, 'w') as f:
- f.write(vocab.to_json())
-hash_full, hash_short = get_hash(tmp_file_path)
-gluon_vocab_path = os.path.join(out_dir, hash_short + '.vocab')
-with open(gluon_vocab_path, 'w') as f:
- f.write(vocab.to_json())
- logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full)
-
-# load tf model
-tf_checkpoint_file = os.path.expanduser(
- os.path.join(args.tf_checkpoint_dir, args.tf_model_prefix))
-logging.info('loading Tensorflow checkpoint %s ...', tf_checkpoint_file)
-tf_tensors = read_tf_checkpoint(tf_checkpoint_file)
-tf_names = sorted(tf_tensors.keys())
-
-tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names)
-tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names)
-tf_names = filter(lambda name: name != 'global_step', tf_names)
-tf_names = list(tf_names)
-if len(tf_tensors) != len(tf_names):
- logging.info('Tensorflow model was saved with Optimizer parameters. '
- 'Ignoring them.')
-
-for name in tf_names:
- logging.debug('%s: %s', name, tf_tensors[name].shape)
-
-# replace tensorflow parameter names with gluon parameter names
-NAME_MAP = [
- ('bert/encoder/layer_', 'encoder.transformer_cells.'),
- ('/attention/self/', '.attention_cell.'),
- ('key', 'proj_key'),
- ('query', 'proj_query'),
- ('value', 'proj_value'),
- ('/attention/output/LayerNorm/', '.layer_norm.'),
- ('/attention/output/dense/', '.proj.'),
- ('cls/seq_relationship/output_weights', 'classifier.weight'),
- ('cls/seq_relationship/output_bias', 'classifier.bias'),
- ('cls/predictions/output_bias', 'decoder.3.bias'),
- ('cls/predictions/transform/dense/', 'decoder.0.'),
- ('cls/predictions/transform/LayerNorm/', 'decoder.2.'),
- ('kernel', 'weight'),
- ('/intermediate/dense/', '.ffn.ffn_1.'),
- ('/output/dense/', '.ffn.ffn_2.'),
- ('/output/LayerNorm/', '.ffn.layer_norm.'),
- ('bert/embeddings/LayerNorm/', 'encoder.layer_norm.'),
- ('bert/embeddings/position_embeddings', 'encoder.position_weight'),
- ('bert/embeddings/token_type_embeddings', 'token_type_embed.0.weight'),
- ('bert/embeddings/word_embeddings', 'word_embed.0.weight'),
- ('bert/pooler/dense/', 'pooler.'),
- ('/', '.'),
-]
-
-# convert to gluon parameters
-mx_tensors = {}
-logging.info('converting to Gluon checkpoint ... ')
-for source_name in tf_names:
- # get the source tensor and its transpose
- source, source_t = tf_tensors[source_name], tf_tensors[source_name].T
- target, target_name = source, source_name
- for old, new in NAME_MAP:
- target_name = target_name.replace(old, new)
- # transpose kernel layer parameters
- if 'kernel' in source_name:
- target = source_t
- mx_tensors[target_name] = target
- if source_t.shape == source.shape and len(source.shape) > 1 and target is not source_t:
- logging.info('warning: %s has symmetric shape %s', target_name, target.shape)
- logging.debug('%s: %s', target_name, target.shape)
-
-# BERT config
-tf_config_names_to_gluon_config_names = {
- 'attention_probs_dropout_prob': 'dropout',
- 'hidden_act': None,
- 'hidden_dropout_prob': 'dropout',
- 'hidden_size': 'units',
- 'initializer_range': None,
- 'intermediate_size': 'hidden_size',
- 'max_position_embeddings': 'max_length',
- 'num_attention_heads': 'num_heads',
- 'num_hidden_layers': 'num_layers',
- 'type_vocab_size': 'token_type_vocab_size',
- 'vocab_size': None
-}
-predefined_args = bert_hparams[args.model]
-with open(os.path.join(args.tf_checkpoint_dir, args.tf_config_name), 'r') as f:
- tf_config = json.load(f)
- assert len(tf_config) == len(tf_config_names_to_gluon_config_names)
- for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items():
- if tf_name is None or gluon_name is None:
- continue
- assert tf_config[tf_name] == predefined_args[gluon_name]
-
-# BERT encoder
-encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
- num_layers=predefined_args['num_layers'],
- units=predefined_args['units'],
- hidden_size=predefined_args['hidden_size'],
- max_length=predefined_args['max_length'],
- num_heads=predefined_args['num_heads'],
- scaled=predefined_args['scaled'],
- dropout=predefined_args['dropout'],
- use_residual=predefined_args['use_residual'])
-
-# Infer enabled BERTModel components
-use_pooler = any('pooler' in n for n in mx_tensors)
-use_decoder = any('decoder.0' in n for n in mx_tensors)
-use_classifier = any('classifier.weight' in n for n in mx_tensors)
-
-logging.info('Inferred that the tensorflow model provides the following parameters:')
-logging.info('- use_pooler = {}'.format(use_pooler))
-logging.info('- use_decoder = {}'.format(use_decoder))
-logging.info('- use_classifier = {}'.format(use_classifier))
-
-# post processings for parameters:
-# - handle tied decoder weight
-logging.info('total number of tf parameters = %d', len(tf_names))
-if use_decoder:
- mx_tensors['decoder.3.weight'] = mx_tensors['word_embed.0.weight']
- logging.info('total number of mx parameters = %d'
- '(including decoder param for weight tying)', len(mx_tensors))
-else:
- logging.info('total number of mx parameters = %d', len(mx_tensors))
-
-# BERT model
-bert = BERTModel(encoder, len(vocab),
- token_type_vocab_size=predefined_args['token_type_vocab_size'],
- units=predefined_args['units'],
- embed_size=predefined_args['embed_size'],
- word_embed=predefined_args['word_embed'],
- use_pooler=use_pooler, use_decoder=use_decoder,
- use_classifier=use_classifier)
-
-bert.initialize(init=mx.init.Normal(0.02))
-
-ones = mx.nd.ones((2, 8))
-out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]]))
-params = bert._collect_params_with_prefix()
-if len(params) != len(mx_tensors):
- raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, '
- 'but {} have been extracted from the tf model. '
- 'Most likely the BERTModel hyperparameters do not match '
- 'the hyperparameters of the tf model.'.format(len(params), len(mx_tensors)))
-
-# set parameter data
-loaded_params = {}
-for name in params:
- try:
- arr = mx.nd.array(mx_tensors[name])
- params[name].set_data(arr)
- loaded_params[name] = True
- # pylint: disable=broad-except
- except Exception:
- if name not in mx_tensors:
- raise RuntimeError('cannot initialize %s from tf checkpoint' % name)
- else:
-            raise RuntimeError('cannot initialize %s. Expect shape = %s, but found %s' %
-                               (name, params[name].shape, arr.shape))
-
-logging.info('num loaded params = %d, total num params = %d',
- len(loaded_params), len(mx_tensors))
-for name in mx_tensors:
- if name not in loaded_params:
- logging.info('%s is not loaded', name)
-
-# param serialization
-bert.save_parameters(tmp_file_path)
-hash_full, hash_short = get_hash(tmp_file_path)
-gluon_param_path = os.path.join(out_dir, hash_short + '.params')
-logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full)
-bert.save_parameters(gluon_param_path)
-mx.nd.waitall()
diff --git a/scripts/conversion_tools/index.rst b/scripts/conversion_tools/index.rst
deleted file mode 100644
index aac3a4cf6c..0000000000
--- a/scripts/conversion_tools/index.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Model Conversion Tools
-----------------------
-
-:download:`Download scripts `
-
-Converting DistilBERT from PyTorch Transformer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following command downloads the DistilBERT model from pytorch-transformers
-and converts it to Gluon.
-
-.. code-block:: bash
-
- pip3 install pytorch-transformers
- python3 convert_pytorch_transformers.py --out_dir converted-model
-
-Converting RoBERTa from Fairseq
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following command converts the `roberta checkpoint ` from fairseq to Gluon.
-The converted Gluon model is saved in the same folder as the checkpoint.
-
-.. code-block:: bash
-
- pip3 install fairseq
- # download the roberta checkpoint from the website, then do:
- python3 convert_fairseq_model.py --ckpt_dir ./roberta/roberta.base --model roberta_12_768_12
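The deleted index.rst only documented the DistilBERT and RoBERTa conversions; the TensorFlow BERT converter removed above was driven the same way. A rough invocation based on convert_tf_model.py's argument parser, with placeholder paths:

```bash
# Sketch only: the checkpoint directory is expected to contain vocab.txt,
# bert_config.json and bert_model.ckpt (see the script's defaults above).
python3 convert_tf_model.py \
    --tf_checkpoint_dir ~/uncased_L-12_H-768_A-12/ \
    --model bert_12_768_12 \
    --out_dir ~/output
```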
diff --git a/scripts/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py b/scripts/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py
deleted file mode 100644
index ea1bedd33d..0000000000
--- a/scripts/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-"""PyTorch BERT parameter naming to Gluon BERT parameter naming.
-
-Given a Gluon BERT model (eg. obtained with the convert_tf_gluon.py script) and
-a pytorch_model.bin containing the same parameters, this script infers the
-naming convention of PyTorch.
-
-"""
-
-import argparse
-import json
-import logging
-import os
-import sys
-
-import gluonnlp as nlp
-import torch
-
-sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
-from utils import load_text_vocab, tf_vocab_to_gluon_vocab
-
-parser = argparse.ArgumentParser(description='Pytorch BERT Naming Convention',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--model', type=str, default='bert_12_768_12',
- choices=['bert_12_768_12', 'bert_24_1024_16'], help='BERT model name')
-parser.add_argument('--dataset_name', type=str, default='scibert_scivocab_uncased',
- help='Dataset name')
-parser.add_argument('--pytorch_checkpoint_dir', type=str,
-                    help='Path to the PyTorch checkpoint folder.')
-parser.add_argument('--debug', action='store_true', help='debugging mode')
-parser.add_argument('--out', default='gluon_to_pytorch_naming.json',
- help='Output file to store gluon to pytorch name mapping.')
-args = parser.parse_args()
-logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
-logging.info(args)
-
-# Load Gluon Model
-bert, vocab = nlp.model.get_model(args.model, dataset_name=args.dataset_name, pretrained=True)
-parameters = bert._collect_params_with_prefix()
-parameters = {k: v.data().asnumpy() for k, v in parameters.items()}
-
-# Load PyTorch Model
-pytorch_parameters = torch.load(os.path.join(args.pytorch_checkpoint_dir, 'pytorch_model.bin'),
- map_location=lambda storage, loc: storage)
-pytorch_vocab = tf_vocab_to_gluon_vocab(
- load_text_vocab(os.path.join(args.pytorch_checkpoint_dir, 'vocab.txt')))
-pytorch_parameters = {k: v.numpy() for k, v in pytorch_parameters.items()}
-
-# Assert that vocabularies are equal
-assert pytorch_vocab.idx_to_token == vocab.idx_to_token
-
-mapping = dict()
-
-for name, param in parameters.items():
- found_match = False
- for pytorch_name, pytorch_param in pytorch_parameters.items():
- if param.shape == pytorch_param.shape:
- if (param == pytorch_param).all():
- if found_match:
- print('Found multiple matches for {}. '
- 'Ignoring new match {}'.format(name, pytorch_name))
- else:
- found_match = True
- mapping.update({name: pytorch_name})
-
-    # We don't break here, in case there are multiple matches
-
- if not found_match:
- raise RuntimeError('Pytorch and Gluon model do not match. '
- 'Cannot infer mapping of names.')
-
-assert len(mapping) == len(parameters)
-
-with open(args.out, 'w') as f:
- json.dump(mapping, f, indent=" ")
- print('Wrote mapping to {}'.format(args.out))
diff --git a/scripts/conversion_tools/input.txt b/scripts/conversion_tools/input.txt
deleted file mode 100644
index d1e3f410d0..0000000000
--- a/scripts/conversion_tools/input.txt
+++ /dev/null
@@ -1 +0,0 @@
-Who was Jim Henson ? ||| Jim Henson was a puppeteer
diff --git a/scripts/conversion_tools/input_cn.txt b/scripts/conversion_tools/input_cn.txt
deleted file mode 100644
index d1f598b9c0..0000000000
--- a/scripts/conversion_tools/input_cn.txt
+++ /dev/null
@@ -1 +0,0 @@
-这是百度的ERNIE模型 |||
diff --git a/scripts/conversion_tools/utils.py b/scripts/conversion_tools/utils.py
deleted file mode 100644
index a056ceb834..0000000000
--- a/scripts/conversion_tools/utils.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility functions for BERT."""
-
-import logging
-import collections
-import hashlib
-import io
-
-import mxnet as mx
-import gluonnlp as nlp
-
-__all__ = ['tf_vocab_to_gluon_vocab', 'load_text_vocab']
-
-
-def tf_vocab_to_gluon_vocab(tf_vocab):
- special_tokens = ['[UNK]', '[PAD]', '[SEP]', '[MASK]', '[CLS]']
- assert all(t in tf_vocab for t in special_tokens)
- counter = nlp.data.count_tokens(tf_vocab.keys())
- vocab = nlp.vocab.BERTVocab(counter, token_to_idx=tf_vocab)
- return vocab
-
-
-def get_hash(filename):
- sha1 = hashlib.sha1()
- with open(filename, 'rb') as f:
- while True:
- data = f.read(1048576)
- if not data:
- break
- sha1.update(data)
- return sha1.hexdigest(), str(sha1.hexdigest())[:8]
-
-
-def read_tf_checkpoint(path):
- """read tensorflow checkpoint"""
- from tensorflow.python import pywrap_tensorflow
- tensors = {}
- reader = pywrap_tensorflow.NewCheckpointReader(path)
- var_to_shape_map = reader.get_variable_to_shape_map()
- for key in sorted(var_to_shape_map):
- tensor = reader.get_tensor(key)
- tensors[key] = tensor
- return tensors
-
-def load_text_vocab(vocab_file):
- """Loads a vocabulary file into a dictionary."""
- vocab = collections.OrderedDict()
- index = 0
- with io.open(vocab_file, 'r') as reader:
- while True:
- token = reader.readline()
- if not token:
- break
- token = token.strip()
- vocab[token] = index
- index += 1
- return vocab
diff --git a/scripts/datasets/README.md b/scripts/datasets/README.md
new file mode 100644
index 0000000000..50cd555495
--- /dev/null
+++ b/scripts/datasets/README.md
@@ -0,0 +1,57 @@
+# Datasets
+
+This page describes how to download and prepare the datasets used in GluonNLP.
+
+Essentially, we provide scripts for downloading and preparing the datasets.
+The directory structure and the format of the processed datasets are well documented so that you are able to
+reuse the scripts with your own data (as long as the structure/format matches).
+
+Thus, the typical workflow for running experiments is:
+
+- Download and prepare data with the scripts in [datasets](.).
+In case you need to further preprocess the dataset, there are toolkits in [preprocess](../preprocess).
+- Run the experiments in [scripts](..).
+
+
+## Available Datasets
+- [Machine Translation](./machine_translation)
+ - [WMT](./machine_translation/README.md#wmt)
+- [Question Answering](./question_answering)
+ - [SQuAD](./question_answering/README.md#squad)
+ - [SearchQA](./question_answering/README.md#searchqa)
+ - [TriviaQA](./question_answering/README.md#triviaqa)
+ - [HotpotQA](./question_answering/README.md#hotpotqa)
+- [Language Modeling](./language_modeling)
+ - [WikiText-2](./language_modeling)
+ - [WikiText-103](./language_modeling)
+ - [Text8](./language_modeling)
+ - [Enwiki8](./language_modeling)
+ - [Google Billion Words](./language_modeling)
+- [Music Generation](./music_generation)
+ - [LakhMIDI](./music_generation/README.md#lakh-midi)
+ - [MAESTRO](./music_generation/README.md#maestro)
+- [Pretraining Corpus](./pretrain_corpus)
+ - [Wikipedia](./pretrain_corpus/README.md#wikipedia)
+ - [BookCorpus](./pretrain_corpus/README.md#bookcorpus)
+ - [OpenWebText](./pretrain_corpus/README.md#openwebtext)
+- [General NLP Benchmarks](./general_nlp_benchmark)
+ - [GLUE](./general_nlp_benchmark/README.md#glue-benchmark)
+ - [SuperGLUE](./general_nlp_benchmark/README.md#superglue-benchmark)
+
+## Contribution Guide
+
+We are very happy to receive and merge your contributions of new datasets :smiley:.
+
+To add a new dataset, create a `prepare_{DATASET_NAME}.py` file in the appropriate folder.
+Also, remember to document in the `README.md` 1) the directory structure and 2) how to use the CLI tool for downloading + preprocessing.
+In addition, add citations in `prepare_{DATASET_NAME}.py` to credit the original authors.
+Refer to the existing scripts or ask questions on GitHub if you need help.
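+
+A new script should follow the registration pattern used by the existing `prepare_*.py` files so that it becomes available through the `nlp_data` CLI. Below is a minimal sketch of that pattern; `prepare_foo` and its flags are hypothetical placeholders, and the real download/checksum/processing logic goes inside `main`:
+
+```python
+# prepare_foo.py -- a sketch of the registration pattern used by the existing
+# prepare_*.py scripts. "prepare_foo" and its flags are placeholders.
+import argparse
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+
+
+@DATA_PARSER_REGISTRY.register('prepare_foo')
+def get_parser():
+    parser = argparse.ArgumentParser(description='Download and prepare the foo dataset.')
+    parser.add_argument('--save-dir', type=str, default='foo',
+                        help='Directory to store the prepared data.')
+    return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_foo')
+def main(args):
+    # Download the raw files, verify their SHA1 checksums, and write the
+    # processed output to args.save_dir.
+    pass
+
+
+def cli_main():
+    parser = get_parser()
+    args = parser.parse_args()
+    main(args)
+
+
+if __name__ == '__main__':
+    cli_main()
+```
+
+Note that the dispatcher in `__main__.py` only sees the scripts it imports, so the new module also needs to be imported there.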
+
+Every URL is paired with a SHA1 checksum so that corrupted downloads can be detected. See the files in [url_checksums](./url_checksums) for examples.
+
+To generate the hash values of the data files, revise [update_download_stats.py](update_download_stats.py)
+to include the new URLs and create the stats file that stores the checksums. Then run the following command to update them:
+
+```bash
+python3 update_download_stats.py
+```
diff --git a/scripts/datasets/__init__.py b/scripts/datasets/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/__main__.py b/scripts/datasets/__main__.py
new file mode 100644
index 0000000000..301c7036e9
--- /dev/null
+++ b/scripts/datasets/__main__.py
@@ -0,0 +1,41 @@
+import argparse
+from .machine_translation import prepare_wmt
+from .question_answering import prepare_squad, prepare_hotpotqa, prepare_searchqa, prepare_triviaqa
+from .language_modeling import prepare_lm
+from .music_generation import prepare_music_midi
+from .pretrain_corpus import prepare_bookcorpus, prepare_wikipedia, prepare_openwebtext
+from .general_nlp_benchmark import prepare_glue
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+
+# TODO(zheyuye): lazily import these data parser functions, data main functions,
+# and their dependencies via a dictionary mapping the dataset names to the functions.
+def list_all_subcommands():
+ out = []
+ for key in DATA_PARSER_REGISTRY.list_keys():
+ if key not in DATA_MAIN_REGISTRY._obj_map:
+ raise KeyError('The data cli "{}" is registered in parser but is missing'
+ ' in main'.format(key))
+ out.append(key)
+ return out
+
+
+def cli_main():
+ parser = argparse.ArgumentParser(
+        description='Built-in scripts for downloading and preparing the data in GluonNLP.',
+ prog='nlp_data', add_help=False)
+ parser.add_argument('command', type=str,
+ choices=list_all_subcommands() + ['help'],
+ metavar='[subcommand]',
+ help='The subcommand to use. '
+ 'Choices are {}.'.format(list_all_subcommands() + ['help']))
+ args, other_args = parser.parse_known_args()
+ if args.command == 'help':
+ parser.print_help()
+ else:
+ parser = DATA_PARSER_REGISTRY.create(args.command)
+ sub_args = parser.parse_args(other_args)
+ DATA_MAIN_REGISTRY.create(args.command, sub_args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/general_nlp_benchmark/README.md b/scripts/datasets/general_nlp_benchmark/README.md
new file mode 100644
index 0000000000..ff902750a1
--- /dev/null
+++ b/scripts/datasets/general_nlp_benchmark/README.md
@@ -0,0 +1,104 @@
+# Language Understanding Benchmarks
+
+This page documents how to download and prepare the
+[GLUE](https://gluebenchmark.com/) and [SuperGLUE](https://super.gluebenchmark.com/) benchmarks.
+
+These benchmarks share the common goal of providing a robust set of downstream tasks for evaluating
+the performance of NLP models.
+
+In essence, these NLP tasks share a similar structure. We are interested in the question:
+can we design a model that solves all of these tasks at once?
+[BERT](https://arxiv.org/pdf/1810.04805.pdf) did a good job of unifying the way we
+featurize the text data, extracting two types of embeddings: one for the
+whole sentence and one for each token in the sentence. Later,
+in [T5](https://arxiv.org/pdf/1910.10683.pdf), the authors proposed to convert every task
+into a text-to-text problem. However, it is difficult to convert tasks like sentence similarity
+or named-entity recognition into a text-to-text format, because they involve real values or text
+spans that are difficult to encode as raw text.
+
+In GluonNLP, we propose a unified way to tackle these NLP problems. We convert these datasets
+into tables. Each column in the table is either 1) raw text, 2) an entity or a list of entities associated with the
+raw text, or 3) a numerical value or a list of numerical values.
+In addition, we keep a metadata object that describes 1) the relationship among columns and
+2) certain properties of the columns.
+
+All tasks used in these general benchmarks are converted to this format.
+
+
+## GLUE Benchmark
+
+The details of the benchmark are described in [GLUE Paper](https://openreview.net/pdf?id=rJ4km2R5t7).
+
+To obtain the dataset, run:
+
+```
+nlp_data prepare_glue --benchmark glue
+```
+
+There will be one folder per task. All data are converted into pandas dataframes (stored as Parquet
+files) plus an additional `metadata.json` object where needed.
+
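+For instance, after the command above finishes, each split can be loaded with pandas. Below is a minimal
+sketch, assuming the default output directory `glue` and the STS-B task (stored under `glue/sts`):
+
+```python
+import json
+import os
+import pandas as pd
+
+# Paths assume `nlp_data prepare_glue --benchmark glue` with the default --data_dir.
+train_df = pd.read_parquet('glue/sts/train.parquet')
+print(train_df.columns)  # sentence1, sentence2, genre, score
+
+# metadata.json is only written for tasks whose columns reference spans in
+# other columns (e.g. WiC/WSC/ReCoRD in SuperGLUE); STS-B does not have one.
+meta_path = 'glue/sts/metadata.json'
+if os.path.exists(meta_path):
+    with open(meta_path) as f:
+        print(json.load(f))
+```
+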
+Here are the details of the datasets:
+
+| Dataset | #Train | #Dev | #Test | Columns | Task | Metrics | Domain |
+|---------|--------|------|--------|---------------------|------------------------------|------------------------------|---------------------|
+| CoLA | 8.5k | 1k | 1k | sentence, **label** | acceptability (0 / 1) | Matthews corr. | misc. |
+| SST-2 | 67k | 872 | 1.8k | sentence, **label** | sentiment | acc. | movie reviews |
+| MRPC | 3.7k | 408 | 1.7k | sentence1, sentence2, **label** | paraphrase | acc./F1 | news |
+| STS-B | 5.7k | 1.5k | 1.4k | sentence1, sentence2, **score** | sentence similarity | Pearson/Spearman corr. | misc. |
+| QQP | 364k | 40k | 391k | sentence1, sentence2, **label** | paraphrase | acc./F1 | social QA questions |
+| MNLI | 393k | 9.8k(m) / 9.8k(mm) | 9.8k(m) / 9.8k(mm) | sentence1, sentence2, genre, **label** | NLI | matched acc./mismatched acc. | misc |
+| QNLI | 105k | 5.4k | 5.4k | question, sentence, **label** | QA/NLI | acc. | Wikipedia |
+| RTE | 2.5k | 227 | 3k | sentence1, sentence2, **label** | NLI | acc. | news, Wikipedia |
+| WNLI | 634 | 71 | 146 | sentence1, sentence2, **label** | NLI | acc. | fiction books |
+
+In addition, GLUE has a diagnostic task that analyzes a system's performance on a broad range of linguistic phenomena.
+It is best described in [GLUE Diagnostic](https://gluebenchmark.com/diagnostics).
+The diagnostic dataset is based on Natural Language Inference (NLI), and you will need to evaluate the model trained on
+MNLI on this dataset.
+
+| Dataset | #Sample | Data Format | Metrics |
+|-------------|---------|-------------|-----------------|
+| Diagnostic | 1104 | semantics, predicate, logic, knowledge, domain, premise, hypothesis, label | Matthews corr. |
+
+In addition, we provide the SNLI dataset, which is recommended as an auxiliary data source when training MNLI.
+This is the approach recommended in [GLUE](https://openreview.net/pdf?id=rJ4km2R5t7).
+
+| Dataset | #Train | #Test | Data Format | Task | Metrics | Domain |
+|---------|---------|--------|-----------------------------|------|---------|--------|
+| SNLI | 549K | 20k | sentence1, sentence2, **label** | NLI | acc. | misc |
+
+
+## SuperGLUE Benchmark
+
+The details are described in [SuperGLUE Paper](https://arxiv.org/pdf/1905.00537.pdf).
+
+To obtain the benchmark, run:
+
+```
+nlp_data prepare_glue --benchmark superglue
+```
+
+
+| Dataset | #Train | #Dev | #Test | Columns | Task | Metrics | Domain |
+|----------|---------|------|---------|---------------------|--------------|------------------------------|---------------------------------|
+| BoolQ | 9.4k | 3.3k | 3.2k | passage, question, **label** | QA | acc. | Google queries, Wikipedia |
+| CB | 250 | 57 | 250 | premise, hypothesis, **label** | NLI | acc./F1 | various |
+| COPA | 400 | 100 | 500 | premise, choice1, choice2, question, **label** | QA | acc. | blogs, photography encyclopedia |
+| MultiRC* | 5.1k (27k) | 953 (4.8k) | 1.8k (9.7k) | passage, question, answer, **label** | QA | F1/EM | various |
+| ReCoRD | 101k | 10k | 10k | source, text, entities, query, **answers** | QA | F1/EM | news |
+| RTE | 2.5k | 278 | 3k | premise, hypothesis, **label** | NLI | acc. | news, Wikipedia |
+| WiC | 6k | 638 | 1.4k | sentence1, sentence2, entities1, entities2, **label** | WSD | acc. | WordNet, VerbNet, Wiktionary |
+| WSC | 554 | 104 | 146 | text, entities, **label** | coref. | acc. | fiction books |
+
+*Note that for MultiRC, we enumerate all (passage, question, answer) triplets in
+the dataset; the number of samples in the expanded format is given in parentheses.
+
+Similar to GLUE, SuperGLUE has two diagnostic tasks to analyze the system performance
+on a broad range of linguistic phenomena. For more details,
+see [SuperGLUE Diagnostic](https://super.gluebenchmark.com/diagnostics).
+
+| Dataset | #Samples | Columns | Metrics |
+|---------------|----------|----------------------|----------------|
+| Winogender | 356 |hypothesis, premise, label | Accuracy |
+| Broadcoverage | 1104 | label, sentence1, sentence2, logic | Matthews corr. |
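+
+For the tasks whose columns reference spans inside another column (e.g., WiC, WSC, ReCoRD), the
+relationship is recorded in the accompanying `metadata.json`. As a rough illustration, the WiC
+metadata written by the preparation script has the following shape (see `read_wic` in
+[prepare_glue.py](prepare_glue.py)):
+
+```python
+# Entity columns store {'start': ..., 'end': ...} span offsets into the
+# parent column named in 'attrs'.
+wic_metadata = {
+    'entities1': {'type': 'entity', 'attrs': {'parent': 'sentence1'}},
+    'entities2': {'type': 'entity', 'attrs': {'parent': 'sentence2'}},
+}
+```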
diff --git a/scripts/datasets/general_nlp_benchmark/__init__.py b/scripts/datasets/general_nlp_benchmark/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/general_nlp_benchmark/prepare_glue.py b/scripts/datasets/general_nlp_benchmark/prepare_glue.py
new file mode 100644
index 0000000000..bbaf01cf48
--- /dev/null
+++ b/scripts/datasets/general_nlp_benchmark/prepare_glue.py
@@ -0,0 +1,707 @@
+# Disclaimer! The script here is partially based on
+# https://github.com/nyu-mll/jiant/blob/master/scripts/download_glue_data.py
+# and
+# https://github.com/nyu-mll/jiant/blob/master/scripts/download_superglue_data.py
+import os
+import sys
+import shutil
+import tempfile
+import argparse
+import zipfile
+import json
+import pathlib
+import pandas as pd
+import pyarrow
+import pyarrow.json
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.base import get_data_home_dir
+from gluonnlp.registry import DATA_MAIN_REGISTRY, DATA_PARSER_REGISTRY
+from gluonnlp.data.tokenizers import WhitespaceTokenizer
+
+
+_CITATIONS = """
+@inproceedings{wang2019glue,
+ title={GLUE: A multi-task benchmark and analysis platform for natural language understanding},
+ author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},
+ booktitle={ICLR},
+ year={2019}
+}
+
+@inproceedings{wang2019superglue,
+ title={Superglue: A stickier benchmark for general-purpose language understanding systems},
+ author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and
+ Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
+ booktitle={Advances in Neural Information Processing Systems},
+ pages={3261--3275},
+ year={2019}
+}
+"""
+
+GLUE_TASKS = ["cola", "sst", "mrpc", "qqp", "sts", "mnli",
+ "snli", "qnli", "rte", "wnli", "diagnostic"]
+SUPERGLUE_TASKS = ["cb", "copa", "multirc", "rte", "wic", "wsc", "boolq", "record",
+ 'broadcoverage-diagnostic', 'winogender-diagnostic']
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_URL_FILE_STATS = load_checksum_stats(os.path.join(
+ _CURR_DIR, '..', 'url_checksums', 'glue.txt'))
+_URL_FILE_STATS.update(load_checksum_stats(os.path.join(
+ _CURR_DIR, '..', 'url_checksums', 'superglue.txt')))
+
+
+def read_tsv_glue(tsv_file, num_skip=1, keep_column_names=False):
+ out = []
+ nrows = None
+ if keep_column_names:
+ assert num_skip == 1
+ column_names = None
+ with open(tsv_file, 'r') as f:
+ for i, line in enumerate(f):
+ line = line.strip()
+ if i < num_skip:
+ if keep_column_names:
+ column_names = line.split()
+ continue
+ elements = line.split('\t')
+ out.append(elements)
+ if nrows is None:
+ nrows = len(elements)
+ else:
+ assert nrows == len(elements)
+ df = pd.DataFrame(out, columns=column_names)
+ series_l = []
+ for col_name in df.columns:
+ idx = df[col_name].first_valid_index()
+ val = df[col_name][idx]
+ if isinstance(val, str):
+ try:
+ dat = pd.to_numeric(df[col_name])
+ series_l.append(dat)
+ continue
+ except ValueError:
+ pass
+ series_l.append(df[col_name])
+ new_df = pd.DataFrame({name: series for name, series in zip(df.columns, series_l)})
+ return new_df
+
+
+def read_jsonl_superglue(jsonl_file):
+ columns = None
+ out = []
+ with open(jsonl_file, 'r') as f:
+ for i, line in enumerate(f):
+ line = line.strip()
+ sample = json.loads(line)
+ if columns is None:
+ columns = list(sample.keys())
+ else:
+ assert sorted(columns) == sorted(list(sample.keys())),\
+ 'Columns={}, sample.keys()={}'.format(columns, sample.keys())
+ out.append([sample[col] for col in columns])
+ df = pd.DataFrame(out, columns=columns)
+ return df
+
+
+# Classification will be stored as pandas dataframe
+def read_cola(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ if fold == 'test':
+ df = pd.read_csv(csv_file, '\t')
+ df = df[['sentence']]
+ df_dict[fold] = df
+ else:
+ df = pd.read_csv(csv_file, '\t', header=None)
+ df = df[[3, 1]]
+ df.columns = ['sentence', 'label']
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_sst(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = pd.read_csv(csv_file, '\t')
+ if fold == 'test':
+ df = df[['sentence']]
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_mrpc(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ tsv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = read_tsv_glue(tsv_file)
+ if fold == 'test':
+ df = df[[3, 4]]
+ df.columns = ['sentence1', 'sentence2']
+ else:
+ df = df[[3, 4, 0]]
+ df.columns = ['sentence1', 'sentence2', 'label']
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_qqp(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = pd.read_csv(csv_file, '\t')
+ if fold == 'test':
+ df = df[['question1', 'question2']]
+ df.columns = ['sentence1', 'sentence2']
+ else:
+ df = df[['question1', 'question2', 'is_duplicate']]
+ df.columns = ['sentence1', 'sentence2', 'label']
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_sts(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = read_tsv_glue(csv_file)
+ if fold == 'test':
+ df = df[[7, 8, 1]]
+ df.columns = ['sentence1', 'sentence2', 'genre']
+ else:
+ df = df[[7, 8, 1, 9]]
+ df.columns = ['sentence1', 'sentence2', 'genre', 'score']
+ genre_l = []
+ for ele in df['genre'].tolist():
+ if ele == 'main-forum':
+ genre_l.append('main-forums')
+ else:
+ genre_l.append(ele)
+ df['genre'] = pd.Series(genre_l)
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_mnli(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev_matched', 'dev_mismatched', 'test_matched', 'test_mismatched']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = read_tsv_glue(csv_file, 1, True)
+ if 'test' in fold:
+ df = df[['sentence1', 'sentence2', 'genre']]
+ else:
+ df = df[['sentence1', 'sentence2', 'genre', 'gold_label']]
+ df.columns = ['sentence1', 'sentence2', 'genre', 'label']
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_snli(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ column_names = None
+ out = []
+ with open(csv_file) as f:
+ for i, line in enumerate(f):
+ line = line.strip()
+ if i == 0:
+ column_names = line.split()
+ column_names = column_names[:10] + [column_names[-1]]
+ continue
+ elements = line.split('\t')
+ first_few_elements = elements[:10]
+ gold_label = elements[-1]
+ out.append(first_few_elements + [gold_label])
+ df = pd.DataFrame(out, columns=column_names)
+ df = df[['sentence1', 'sentence2', 'gold_label']]
+ df.columns = ['sentence1', 'sentence2', 'label']
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_qnli(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = read_tsv_glue(csv_file, 1, True)
+ if fold == 'test':
+ df_dict[fold] = df[['question', 'sentence']]
+ else:
+ df_dict[fold] = df[['question', 'sentence', 'label']]
+ return df_dict, None
+
+
+def read_rte(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = pd.read_csv(csv_file, '\t')
+ if fold == 'test':
+ df_dict[fold] = df[['sentence1', 'sentence2']]
+ else:
+ df_dict[fold] = df[['sentence1', 'sentence2', 'label']]
+ return df_dict, None
+
+
+def read_wnli(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = pd.read_csv(csv_file, '\t')
+ if fold == 'test':
+ df = df[['sentence1', 'sentence2']]
+ else:
+ df = df[['sentence1', 'sentence2', 'label']]
+ df_dict[fold] = df
+ return df_dict, None
+
+
+# The GLUE diagnostic set is evaluated with the model trained on MNLI
+def read_glue_diagnostic(dir_path):
+ csv_file = os.path.join(dir_path, 'diagnostic-full.tsv')
+ df = pd.read_csv(csv_file, '\t')
+ df.columns = ['semantics', 'predicate', 'logic', 'knowledge', 'domain', 'premise',
+ 'hypothesis', 'label']
+ return df
+
+
+def read_cb(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'val', 'test']:
+ columns = ['premise', 'hypothesis']
+ if fold != 'test':
+ columns.append('label')
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ df = df[columns]
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_copa(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'val', 'test']:
+ columns = ['premise', 'choice1', 'choice2', 'question']
+ if fold != 'test':
+ columns.append('label')
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ df = df[columns]
+ df_dict[fold] = df
+ return df_dict, None
+
+
+# passage, question, answer, passage_idx, question_idx, answer_idx
+def read_multirc(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'val', 'test']:
+ columns = ['passage', 'question', 'answer', 'psg_idx', 'qst_idx', 'ans_idx']
+ if fold != 'test':
+ columns.append('label')
+ out = []
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ with open(jsonl_path, 'r') as f:
+ for line in f:
+                sample = json.loads(line.strip())
+                psg_idx = sample['idx']
+ passage = sample['passage']['text']
+ for qa in sample['passage']['questions']:
+ qst_idx = qa['idx']
+ question = qa['question']
+ for ans in qa['answers']:
+ ans_idx = ans['idx']
+ answer = ans['text']
+ if fold == 'test':
+ out.append((passage, question, answer, psg_idx, qst_idx, ans_idx))
+ else:
+ label = ans['label']
+ out.append((passage, question, answer, psg_idx, qst_idx,
+ ans_idx, label))
+ df = pd.DataFrame(out, columns=columns)
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_rte_superglue(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'val', 'test']:
+ if fold == 'test':
+ columns = ['premise', 'hypothesis']
+ else:
+ columns = ['premise', 'hypothesis', 'label']
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ df = df[columns]
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_wic(dir_path):
+ df_dict = dict()
+ meta_data = dict()
+ meta_data['entities1'] = {'type': 'entity', 'attrs': {'parent': 'sentence1'}}
+ meta_data['entities2'] = {'type': 'entity', 'attrs': {'parent': 'sentence2'}}
+
+ for fold in ['train', 'val', 'test']:
+ if fold != 'test':
+ columns = ['sentence1', 'sentence2', 'entities1', 'entities2', 'label']
+ else:
+ columns = ['sentence1', 'sentence2', 'entities1', 'entities2']
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ out = []
+ for idx, row in df.iterrows():
+ sentence1 = row['sentence1']
+ sentence2 = row['sentence2']
+ start1 = row['start1']
+ end1 = row['end1']
+ start2 = row['start2']
+ end2 = row['end2']
+ if fold == 'test':
+ out.append([sentence1, sentence2,
+ {'start': start1, 'end': end1},
+ {'start': start2, 'end': end2}])
+ else:
+ label = row['label']
+ out.append([sentence1, sentence2,
+ {'start': start1, 'end': end1},
+ {'start': start2, 'end': end2},
+ label])
+ df = pd.DataFrame(out, columns=columns)
+ df_dict[fold] = df
+ return df_dict, meta_data
+
+
+def read_wsc(dir_path):
+ df_dict = dict()
+ tokenizer = WhitespaceTokenizer()
+ meta_data = dict()
+ meta_data['noun'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
+ meta_data['pronoun'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
+ for fold in ['train', 'val', 'test']:
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ samples = []
+ for i in range(len(df)):
+ text = df.loc[i, 'text']
+ if fold != 'test':
+ label = df.loc[i, 'label']
+ target = df.loc[i, 'target']
+ span1_index = target['span1_index']
+ span2_index = target['span2_index']
+ span1_text = target['span1_text']
+ span2_text = target['span2_text']
+            # Build the entity spans as {'start': ..., 'end': ...} character
+            # offsets into the text, e.g. {'start': 0, 'end': 100}
+ tokens, offsets = tokenizer.encode_with_offsets(text, str)
+ pos_start1 = offsets[span1_index][0]
+ pos_end1 = pos_start1 + len(span1_text)
+ pos_start2 = offsets[span2_index][0]
+ pos_end2 = pos_start2 + len(span2_text)
+ if fold == 'test':
+ samples.append({'text': text,
+ 'noun': {'start': pos_start1, 'end': pos_end1},
+ 'pronoun': {'start': pos_start2, 'end': pos_end2}})
+ else:
+ samples.append({'text': text,
+ 'noun': {'start': pos_start1, 'end': pos_end1},
+ 'pronoun': {'start': pos_start2, 'end': pos_end2},
+ 'label': label})
+ df = pd.DataFrame(samples)
+ df_dict[fold] = df
+ return df_dict, meta_data
+
+
+def read_boolq(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'val', 'test']:
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_record(dir_path):
+ df_dict = dict()
+ meta_data = dict()
+ meta_data['entities'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
+ meta_data['answers'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
+ for fold in ['train', 'val', 'test']:
+ if fold != 'test':
+ columns = ['source', 'text', 'entities', 'query', 'answers']
+ else:
+ columns = ['source', 'text', 'entities', 'query']
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ df_dict[fold] = df
+ out = []
+ for i, row in df.iterrows():
+ source = row['source']
+ passage = row['passage']
+ text = passage['text']
+ entities = passage['entities']
+ entities = [{'start': ele['start'], 'end': ele['end']} for ele in entities]
+ for qas in row['qas']:
+ query = qas['query']
+ if fold != 'test':
+ answer_entities = qas['answers']
+ out.append((source, text, entities, query, answer_entities))
+ else:
+ out.append((source, text, entities, query))
+ df = pd.DataFrame(out, columns=columns)
+ df_dict[fold] = df
+ return df_dict, meta_data
+
+
+def read_winogender_diagnostic(dir_path):
+ jsonl_path = os.path.join(dir_path, 'AX-g.jsonl')
+ df = read_jsonl_superglue(jsonl_path)
+ return df
+
+
+def read_broadcoverage_diagnostic(dir_path):
+ df = pyarrow.json.read_json(os.path.join(dir_path, 'AX-b.jsonl')).to_pandas()
+ return df
+
+
+GLUE_TASK2PATH = {
+ "cola": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4", # noqa
+ "sst": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8", # noqa
+ "mrpc": {
+ 'train': "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt",
+ 'dev': "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc",
+ 'test': "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt"
+ },
+ "qqp": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP-clean.zip?alt=media&token=11a647cb-ecd3-49c9-9d31-79f8ca8fe277", # noqa
+ "sts": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5", # noqa
+ "mnli": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce", # noqa
+ "snli": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df", # noqa
+ "qnli": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601", # noqa
+ "rte": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb", # noqa
+ "wnli": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf", # noqa
+ "diagnostic": [
+ "https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D", # noqa
+ "https://www.dropbox.com/s/ju7d95ifb072q9f/diagnostic-full.tsv?dl=1",
+ ],
+}
+
+GLUE_READERS = {
+ 'cola': read_cola,
+ 'sst': read_sst,
+ 'mrpc': read_mrpc,
+ 'qqp': read_qqp,
+ 'sts': read_sts,
+ 'mnli': read_mnli,
+ 'snli': read_snli,
+ 'qnli': read_qnli,
+ 'rte': read_rte,
+ 'wnli': read_wnli,
+ 'diagnostic': read_glue_diagnostic
+}
+
+
+SUPERGLUE_TASK2PATH = {
+ "cb": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/CB.zip",
+ "copa": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/COPA.zip",
+ "multirc": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/MultiRC.zip",
+ "rte": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/RTE.zip",
+ "wic": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WiC.zip",
+ "wsc": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip",
+ "broadcoverage-diagnostic": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-b.zip",
+ "winogender-diagnostic": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-g.zip",
+ "boolq": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/BoolQ.zip",
+ "record": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/ReCoRD.zip",
+}
+
+SUPERGLUE_READER = {
+ 'cb': read_cb,
+ 'copa': read_copa,
+ 'multirc': read_multirc,
+ 'rte': read_rte_superglue,
+ 'wic': read_wic,
+ 'wsc': read_wsc,
+ 'boolq': read_boolq,
+ 'record': read_record,
+ 'broadcoverage-diagnostic': read_broadcoverage_diagnostic,
+ 'winogender-diagnostic': read_winogender_diagnostic
+}
+
+
+def format_mrpc(data_dir):
+ mrpc_dir = os.path.join(data_dir, "mrpc")
+ os.makedirs(mrpc_dir, exist_ok=True)
+ mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
+ mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
+ download(GLUE_TASK2PATH["mrpc"]['train'], mrpc_train_file,
+ sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['train']])
+ download(GLUE_TASK2PATH["mrpc"]['test'], mrpc_test_file,
+ sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['test']])
+ assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
+ assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
+ download(GLUE_TASK2PATH["mrpc"]['dev'],
+ os.path.join(mrpc_dir, "dev_ids.tsv"),
+ sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['dev']])
+
+ dev_ids = []
+ with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh:
+ for row in ids_fh:
+ dev_ids.append(row.strip().split("\t"))
+
+ with open(mrpc_train_file, encoding="utf8") as data_fh, open(
+ os.path.join(mrpc_dir, "train.tsv"), "w", encoding="utf8"
+ ) as train_fh, open(os.path.join(mrpc_dir, "dev.tsv"), "w", encoding="utf8") as dev_fh:
+ header = data_fh.readline()
+ train_fh.write(header)
+ dev_fh.write(header)
+ for row in data_fh:
+ label, id1, id2, s1, s2 = row.strip().split("\t")
+ if [id1, id2] in dev_ids:
+ dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+ else:
+ train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+
+ with open(mrpc_test_file, encoding="utf8") as data_fh, open(
+ os.path.join(mrpc_dir, "test.tsv"), "w", encoding="utf8"
+ ) as test_fh:
+ header = data_fh.readline()
+ test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
+ for idx, row in enumerate(data_fh):
+ label, id1, id2, s1, s2 = row.strip().split("\t")
+ test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
+
+
+def get_tasks(benchmark, task_names):
+ task_names = task_names.split(",")
+ ALL_TASKS = GLUE_TASKS if benchmark == 'glue' else SUPERGLUE_TASKS
+ if "all" in task_names:
+ tasks = ALL_TASKS
+ else:
+ tasks = []
+ for task_name in task_names:
+ if task_name != 'diagnostic':
+ assert task_name in ALL_TASKS, "Task %s not found!" % task_name
+ tasks.append(task_name)
+ if "RTE" in tasks and "diagnostic" not in tasks:
+ tasks.append("diagnostic")
+ has_diagnostic = any(['diagnostic' in task for task in tasks])
+ if has_diagnostic:
+ tasks = [ele for ele in tasks if 'diagnostic' not in ele]
+ tasks.append('diagnostic')
+ return tasks
+
+
+@DATA_PARSER_REGISTRY.register('prepare_glue')
+def get_parser():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--benchmark", choices=['glue', 'superglue'],
+ default='glue', type=str)
+ parser.add_argument("-d", "--data_dir", help="directory to save data to", type=str,
+ default=None)
+ parser.add_argument(
+ "-t",
+ "--tasks",
+ help="tasks to download data for as a comma separated string",
+ type=str,
+ default="all"
+ )
+ parser.add_argument('--cache-path', type=str,
+ default=os.path.join(get_data_home_dir(), 'glue'),
+ help='The temporary path to download the dataset.')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_glue')
+def main(args):
+ if args.data_dir is None:
+ args.data_dir = args.benchmark
+ args.cache_path = os.path.join(args.cache_path, args.benchmark)
+ print('Downloading {} to {}. Selected tasks = {}'.format(args.benchmark,
+ args.data_dir, args.tasks))
+ os.makedirs(args.cache_path, exist_ok=True)
+ os.makedirs(args.data_dir, exist_ok=True)
+ tasks = get_tasks(args.benchmark, args.tasks)
+ if args.benchmark == 'glue':
+ TASK2PATH = GLUE_TASK2PATH
+ TASK2READER = GLUE_READERS
+ elif args.benchmark == 'superglue':
+ TASK2PATH = SUPERGLUE_TASK2PATH
+ TASK2READER = SUPERGLUE_READER
+ else:
+ raise NotImplementedError
+ for task in tasks:
+ print('Processing {}...'.format(task))
+        if 'diagnostic' in task:
+ if args.benchmark == 'glue':
+ reader = TASK2READER[task]
+ base_dir = os.path.join(args.data_dir, 'rte_diagnostic')
+ os.makedirs(base_dir, exist_ok=True)
+ download(TASK2PATH['diagnostic'][0],
+ path=os.path.join(base_dir, 'diagnostic.tsv'),
+ sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][0]])
+ download(TASK2PATH['diagnostic'][1],
+ path=os.path.join(base_dir, 'diagnostic-full.tsv'),
+ sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][1]])
+ df = reader(base_dir)
+ df.to_parquet(os.path.join(base_dir, 'diagnostic-full.parquet'))
+ else:
+ for key, name in [('broadcoverage-diagnostic', 'AX-b'),
+ ('winogender-diagnostic', 'AX-g')]:
+ data_file = os.path.join(args.cache_path, "{}.zip".format(key))
+ url = TASK2PATH[key]
+ reader = TASK2READER[key]
+ download(url, data_file, sha1_hash=_URL_FILE_STATS[url])
+ with zipfile.ZipFile(data_file) as zipdata:
+ zipdata.extractall(args.data_dir)
+ df = reader(os.path.join(args.data_dir, name))
+ df.to_parquet(os.path.join(args.data_dir, name, '{}.parquet'.format(name)))
+ elif task == 'mrpc':
+ reader = TASK2READER[task]
+ format_mrpc(args.data_dir)
+ df_dict, meta_data = reader(os.path.join(args.data_dir, 'mrpc'))
+ for key, df in df_dict.items():
+ if key == 'val':
+ key = 'dev'
+ df.to_parquet(os.path.join(args.data_dir, 'mrpc', '{}.parquet'.format(key)))
+ with open(os.path.join(args.data_dir, 'mrpc', 'metadata.json'), 'w') as f:
+ json.dump(meta_data, f)
+ else:
+ # Download data
+ data_file = os.path.join(args.cache_path, "{}.zip".format(task))
+ url = TASK2PATH[task]
+ reader = TASK2READER[task]
+ download(url, data_file, sha1_hash=_URL_FILE_STATS[url])
+ base_dir = os.path.join(args.data_dir, task)
+ if os.path.exists(base_dir):
+ print('Found!')
+ continue
+ zip_dir_name = None
+ with zipfile.ZipFile(data_file) as zipdata:
+ if zip_dir_name is None:
+ zip_dir_name = os.path.dirname(zipdata.infolist()[0].filename)
+ zipdata.extractall(args.data_dir)
+ shutil.move(os.path.join(args.data_dir, zip_dir_name),
+ base_dir)
+ df_dict, meta_data = reader(base_dir)
+ for key, df in df_dict.items():
+ if key == 'val':
+ key = 'dev'
+ df.to_parquet(os.path.join(base_dir, '{}.parquet'.format(key)))
+ if meta_data is not None:
+ with open(os.path.join(base_dir, 'metadata.json'), 'w') as f:
+ json.dump(meta_data, f)
+ print("\tCompleted!")
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == "__main__":
+ cli_main()
diff --git a/scripts/datasets/language_modeling/README.md b/scripts/datasets/language_modeling/README.md
new file mode 100644
index 0000000000..a75779ea42
--- /dev/null
+++ b/scripts/datasets/language_modeling/README.md
@@ -0,0 +1,24 @@
+# Language Modeling Benchmark
+
+Prepare the language modeling benchmark datasets.
+To help reproduce the results in the papers, we use
+the tokenized corpora as the training/validation/test sets.
+
+```bash
+# WikiText-2
+nlp_data prepare_lm --dataset wikitext2
+
+# WikiText-103
+nlp_data prepare_lm --dataset wikitext103
+
+# enwik8
+nlp_data prepare_lm --dataset enwik8
+
+# Text8
+nlp_data prepare_lm --dataset text8
+
+# Google One-Billion-Word
+nlp_data prepare_lm --dataset gbw
+```
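+
+After preparation, each folder contains `train.txt`, `valid.txt`, and `test.txt` together with a
+`vocab.json` saved by the script (the character-level corpora additionally keep `*.raw` files, and
+GBW is stored as sharded `train/` and `test/` folders). Below is a minimal sketch of inspecting the
+prepared WikiText-2 training split, assuming the default save directory `wikitext2`:
+
+```python
+from collections import Counter
+
+# Assumes `nlp_data prepare_lm --dataset wikitext2` was run with the default save dir.
+counter = Counter()
+with open('wikitext2/train.txt', encoding='utf-8') as f:
+    for line in f:
+        counter.update(line.split())
+print('tokens={}, unique tokens={}'.format(sum(counter.values()), len(counter)))
+```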
+
+Happy language modeling :)
diff --git a/scripts/datasets/language_modeling/__init__.py b/scripts/datasets/language_modeling/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/language_modeling/prepare_lm.py b/scripts/datasets/language_modeling/prepare_lm.py
new file mode 100644
index 0000000000..6f56ddb02d
--- /dev/null
+++ b/scripts/datasets/language_modeling/prepare_lm.py
@@ -0,0 +1,265 @@
+import argparse
+import os
+import zipfile
+import tarfile
+import shutil
+from typing import List, Optional
+from collections import Counter
+from gluonnlp.base import get_data_home_dir
+from gluonnlp.registry import DATA_MAIN_REGISTRY, DATA_PARSER_REGISTRY
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.data.vocab import Vocab
+
+
+_CITATIONS = """
+@ONLINE {mahoney2011large,
+ title={Large text compression benchmark},
+ author={Mahoney, Matt},
+ url={http://www.mattmahoney.net/dc/text.html},
+ year={2011}
+}
+
+@article{chelba2013one,
+ title={One billion word benchmark for measuring progress in statistical language modeling},
+ author={Chelba, Ciprian and Mikolov, Tomas and Schuster, Mike and Ge, Qi and Brants, Thorsten
+ and Koehn, Phillipp and Robinson, Tony},
+ journal={arXiv preprint arXiv:1312.3005},
+ year={2013}
+}
+
+
+@inproceedings{merity2016pointer,
+ title={Pointer sentinel mixture models},
+ author={Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard},
+ booktitle={ICLR},
+ year={2017}
+}
+"""
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums',
+ 'language_model.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+_URLS = {
+ 'wikitext2': 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip',
+ 'wikitext103': 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip',
+ 'enwik8': 'http://mattmahoney.net/dc/enwik8.zip',
+ 'text8': 'http://mattmahoney.net/dc/text8.zip',
+ # The original address of Google One Billion Word dataset is
+ # http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
+    # We uploaded the file to S3 to speed up the download.
+ 'gbw': 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz',
+ # The data is obtained from https://raw.githubusercontent.com/rafaljozefowicz/lm/master/1b_word_vocab.txt
+ 'gbw_vocab': 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt'
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_lm')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Downloading and Preprocessing'
+ ' Language Modeling Datasets.')
+ parser.add_argument('--dataset', type=str, required=True,
+ choices=['wikitext2', 'wikitext103', 'text8', 'enwik8', 'gbw'],
+ help='The dataset to use.')
+ parser.add_argument('--save-dir', type=str, default=None,
+ help='The directory to save the dataset.'
+ ' By default, it will save to a folder with the same name as the '
+ 'dataset')
+ parser.add_argument('--overwrite', action='store_true', help='Whether to overwrite the saved '
+ 'files.')
+ parser.add_argument('--cache-path', type=str,
+ default=os.path.join(get_data_home_dir(), 'lm_benchmark_data'),
+ help='The temporary path to download the dataset.')
+ return parser
+
+
+def path_exist_and_skip(path, overwrite):
+ if os.path.exists(path) and not overwrite:
+        print('Found {}. Skipping writing. Pass `--overwrite` to force updating the file.'
+              .format(path))
+ return True
+ return False
+
+
+def build_vocab(corpus_path_l: List, eos_token: Optional[str] = '<eos>') -> Vocab:
+ """Build the default vocabulary used in datasets like
+
+ - wikitext2
+ - wikitext103
+ - text8
+    - enwik8
+
+    The strategy is to split on whitespace and store all tokens that appear.
+    The tokens are sorted in descending order of frequency.
+
+ Parameters
+ ----------
+ corpus_path_l
+        The list of corpus paths.
+ eos_token
+ If it is not None, the eos_token will be added to the vocabulary.
+
+ Returns
+ -------
+ vocab
+ The vocabulary
+ """
+ counter = Counter()
+ ntokens = 0
+ print('Build the default vocabulary used in benchmarks:')
+ for corpus_path in corpus_path_l:
+ with open(corpus_path, 'r', encoding='utf-8') as f:
+ for idx, line in enumerate(f):
+ if idx > 0 and idx % 500000 == 0:
+ print(' line {}'.format(idx))
+ line = line.strip()
+ tokens = line.split()
+ counter.update(tokens)
+ ntokens += len(tokens)
+ if eos_token is not None and eos_token in counter:
+ raise ValueError('eos_token is set to be "{}", which appears in the text. '
+ 'Is it intended? You may choose another token as the eos_token.'
+ .format(eos_token))
+ vocab = Vocab(counter, unk_token=None, eos_token=eos_token)
+ print('Processed {} tokens, vocab={}'.format(ntokens, vocab))
+ return vocab
+
+
+@DATA_MAIN_REGISTRY.register('prepare_lm')
+def main(args):
+ # Download the data
+ url = _URLS[args.dataset]
+ file_hash = _URL_FILE_STATS[url]
+ target_download_location = os.path.join(args.cache_path,
+ os.path.basename(url))
+ download(url, target_download_location, sha1_hash=file_hash)
+ save_dir = args.dataset if args.save_dir is None else args.save_dir
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir, exist_ok=True)
+ # Extract and process the data
+ if args.dataset == 'wikitext2':
+ with zipfile.ZipFile(target_download_location) as zf:
+ train_data = zf.read('wikitext-2/wiki.train.tokens')
+ valid_data = zf.read('wikitext-2/wiki.valid.tokens')
+ test_data = zf.read('wikitext-2/wiki.test.tokens')
+ for filename, part in [('train.txt', train_data),
+ ('valid.txt', valid_data),
+ ('test.txt', test_data)]:
+ filename = os.path.join(save_dir, filename)
+ print('{} will have {} bytes'.format(filename, len(part)))
+ if not path_exist_and_skip(filename, args.overwrite):
+ with open(filename, 'wb') as of:
+ of.write(part)
+ vocab = build_vocab([os.path.join(save_dir, 'train.txt'),
+ os.path.join(save_dir, 'valid.txt'),
+ os.path.join(save_dir, 'test.txt')])
+ vocab.save(os.path.join(save_dir, 'vocab.json'))
+ elif args.dataset == 'wikitext103':
+ with zipfile.ZipFile(target_download_location) as zf:
+ train_data = zf.read('wikitext-103/wiki.train.tokens')
+ valid_data = zf.read('wikitext-103/wiki.valid.tokens')
+ test_data = zf.read('wikitext-103/wiki.test.tokens')
+ for filename, part in [('train.txt', train_data),
+ ('valid.txt', valid_data),
+ ('test.txt', test_data)]:
+ filename = os.path.join(save_dir, filename)
+ if not path_exist_and_skip(filename, args.overwrite):
+ print('{} will have {} bytes'.format(filename, len(part)))
+ with open(filename, 'wb') as of:
+ of.write(part)
+ vocab = build_vocab([os.path.join(save_dir, 'train.txt')])
+ vocab.save(os.path.join(save_dir, 'vocab.json'))
+ elif args.dataset == 'text8':
+ with zipfile.ZipFile(target_download_location) as zf:
+ with zf.open('text8', 'r') as f:
+ data = f.read().decode('utf-8')
+ num_test_chars = 5000000
+ train_data = data[: -2 * num_test_chars]
+ valid_data = data[-2 * num_test_chars: -num_test_chars]
+ test_data = data[-num_test_chars:]
+ for filename, part in [('train.txt', train_data),
+ ('valid.txt', valid_data),
+ ('test.txt', test_data)]:
+ filename = os.path.join(save_dir, filename)
+ print('{} will have {} bytes'.format(filename, len(part)))
+ print('- Tokenizing...')
+ # Change space ' ' to underscore '_'
+ part_str = ' '.join(['_' if c == ' ' else c for c in part.strip()])
+ print('- Writing...')
+ if not path_exist_and_skip(filename, args.overwrite):
+ with open(filename, 'w', encoding='utf-8') as of:
+ of.write(part_str)
+ if not path_exist_and_skip(filename + '.raw', args.overwrite):
+ with open(filename + '.raw', 'w', encoding='utf-8') as of:
+ of.write(part)
+ vocab = build_vocab([os.path.join(save_dir, 'train.txt')], eos_token=None)
+ vocab.save(os.path.join(save_dir, 'vocab.json'))
+ elif args.dataset == 'enwik8':
+ with zipfile.ZipFile(target_download_location) as zf:
+ data = zf.read('enwik8')
+ print('Length of enwik8: {}'.format(len(data)))
+ num_test_chars = 5000000
+ train_data = data[: -2 * num_test_chars]
+ valid_data = data[-2 * num_test_chars: -num_test_chars]
+ test_data = data[-num_test_chars:]
+
+ for filename, part in [('train.txt', train_data),
+ ('valid.txt', valid_data),
+ ('test.txt', test_data)]:
+ filename = os.path.join(save_dir, filename)
+ print('{} will have {} bytes'.format(filename, len(part)))
+ print('- Tokenizing...')
+ part_str = ' '.join([str(c) if c != ord('\n') else '\n' for c in part])
+ print('- Writing...')
+ if not path_exist_and_skip(filename, args.overwrite):
+ with open(filename, 'w') as of:
+ of.write(part_str)
+ if not path_exist_and_skip(filename + '.raw', args.overwrite):
+ with open(filename + '.raw', 'wb') as of:
+ of.write(part)
+ vocab = build_vocab([os.path.join(save_dir, 'train.txt')], eos_token=None)
+ vocab.save(os.path.join(save_dir, 'vocab.json'))
+
+ elif args.dataset == 'gbw':
+ vocab_path = download(_URLS['gbw_vocab'],
+ os.path.join(args.cache_path, '1b_word_vocab.txt'),
+ sha1_hash=_URL_FILE_STATS[_URLS['gbw_vocab']])
+ with tarfile.open(target_download_location) as f:
+ os.makedirs(os.path.join(save_dir, 'train'), exist_ok=True)
+ os.makedirs(os.path.join(save_dir, 'test'), exist_ok=True)
+ for member in f.getmembers():
+ if 'training-monolingual.tokenized.shuffled' in member.name \
+ and 'news.en' in member.name:
+ basename = os.path.basename(member.name)
+ with f.extractfile(member) as f_in:
+ with open(os.path.join(save_dir, 'train', basename), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ elif 'heldout-monolingual.tokenized.shuffled' in member.name and \
+ '.heldout-' in member.name:
+ basename = os.path.basename(member.name)
+ with f.extractfile(member) as f_in:
+ with open(os.path.join(save_dir, 'test', basename), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ all_tokens = []
+ with open(vocab_path, 'r') as f:
+ for token in f:
+ token = token.strip().split()[0]
+ all_tokens.append(token)
+        vocab = Vocab(all_tokens, bos_token='<S>', unk_token='<UNK>')
+ vocab.save(os.path.join(save_dir, 'vocab.json'))
+ print('Saved Google-One-Billion-Word in {}'.format(save_dir))
+ print('Vocab={}'.format(vocab))
+ else:
+ raise NotImplementedError
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/machine_translation/README.md b/scripts/datasets/machine_translation/README.md
new file mode 100644
index 0000000000..e9b2f7c194
--- /dev/null
+++ b/scripts/datasets/machine_translation/README.md
@@ -0,0 +1,89 @@
+# Machine Translation
+
+In machine translation, we train a model to map a sentence from the source language, e.g., English,
+to the target language, e.g., Chinese. Here, we provide scripts to download the common benchmark
+datasets for machine translation. The downloaded datasets are stored as a pair of corpus files,
+one for the source and the other for the target.
+
+## WMT
+You can use [prepare_wmt.py](prepare_wmt.py) to download and prepare the raw training corpus and
+then use [clean_parallel_corpus.py](../../preprocess/clean_parallel_corpus.py) to clean and
+filter the corpus.
+
+You may download the raw WMT2014 en-de dataset with:
+```bash
+nlp_data prepare_wmt \
+ --dataset wmt2014 \
+ --lang-pair en-de \
+ --save-path wmt2014_en_de
+```
+
+By combining `nlp_data` and `nlp_preprocess`, we provide an example of preparing the
+WMT2014 en-de training dataset: [wmt2014_ende.sh](wmt2014_ende.sh). This involves three steps:
+- Downloading the raw text data
+- Cleaning and tokenizing the data
+- Learning a subword model and applying it to the data
+
+```bash
+bash wmt2014_ende.sh yttm
+```
+
+We support the following subword learning algorithms:
+
+```bash
+# BPE from YouTokenToMe
+bash wmt2014_ende.sh yttm
+
+# BPE from Huggingface
+bash wmt2014_ende.sh hf_bpe
+
+# BPE from subword-nmt
+bash wmt2014_ende.sh subword_nmt
+
+# Byte-level BPE
+bash wmt2014_ende.sh hf_bytebpe
+
+# Sentencepiece
+bash wmt2014_ende.sh spm
+
+# WordPiece
+bash wmt2014_ende.sh hf_wordpiece
+```
+
+
+Apart from WMT2014 EN-DE, we also provide a script for preparing the training data for the
+WMT2017 ZH-EN task:
+[wmt2017_zhen.sh](wmt2017_zhen.sh).
+
+### Monolingual Corpus
+In the WMT competition, there are additional monolingual corpora that help you train NMT models.
+You may download a raw monolingual corpus by adding the `--mono` flag.
+
+One example is to download the newscrawl monolingual corpus in German:
+
+```bash
+nlp_data prepare_wmt \
+ --mono \
+ --mono_lang de \
+ --dataset newscrawl \
+ --save-path wmt2014_mono
+```
+
+
+### Directory Structure of Translation Dataset
+
+The basic structure of a translation dataset is as follows:
+```
+folder_name
+├── train.raw.{src}
+├── train.raw.{tgt}
+├── train.tok.{src}
+├── train.tok.{tgt}
+├── train.tok.{subword_model}.{src}
+├── train.tok.{subword_model}.{tgt}
+├── ...
+├── ... Repeat for valid and test
+├── ...
+├── {subword_model}.model
+├── {subword_model}.path
+```
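+
+Since the corpora are stored as line-aligned parallel files, a source sentence and its translation
+share the same line number. Below is a minimal sketch of iterating over a few tokenized pairs; the
+folder name and file suffixes are placeholders for whatever your run produced (e.g. `train.tok.en`
+and `train.tok.de`):
+
+```python
+import itertools
+
+src_path = 'wmt2014_en_de/train.tok.en'  # placeholder paths for illustration
+tgt_path = 'wmt2014_en_de/train.tok.de'
+with open(src_path, encoding='utf-8') as f_src, open(tgt_path, encoding='utf-8') as f_tgt:
+    for src_line, tgt_line in itertools.islice(zip(f_src, f_tgt), 3):
+        print(src_line.strip(), '|||', tgt_line.strip())
+```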
diff --git a/scripts/datasets/machine_translation/__init__.py b/scripts/datasets/machine_translation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/machine_translation/prepare_wmt.py b/scripts/datasets/machine_translation/prepare_wmt.py
new file mode 100644
index 0000000000..2ac5f77772
--- /dev/null
+++ b/scripts/datasets/machine_translation/prepare_wmt.py
@@ -0,0 +1,1071 @@
+from typing import List, Union, IO, AnyStr, Tuple, Optional
+import re
+import os
+import argparse
+import zipfile
+import shutil
+import functools
+import tarfile
+import gzip
+import json
+from xml.etree import ElementTree
+from gluonnlp.data.filtering import ProfanityFilter
+from gluonnlp.utils.misc import file_line_number, download, load_checksum_stats
+from gluonnlp.base import get_data_home_dir, get_repo_url
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+
+# The datasets are provided by WMT2014-WMT2019 and can be freely used for research purposes.
+# You will need to cite the WMT14-WMT19 shared task overview papers and follow any additional
+# citation requirements for the specific individual datasets; see
+# http://www.statmt.org/wmt14/translation-task.html to
+# http://www.statmt.org/wmt19/translation-task.html
+
+
+_CITATIONS = """
+@inproceedings{ziemski2016united,
+ title={The united nations parallel corpus v1. 0},
+ author={Ziemski, Micha{\l} and Junczys-Dowmunt, Marcin and Pouliquen, Bruno},
+ booktitle={Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC'16)},
+ pages={3530--3534},
+ year={2016}
+}
+
+@inproceedings{barrault2019findings,
+ title={Findings of the 2019 conference on machine translation (wmt19)},
+ author={Barrault, Lo{\"\i}c and Bojar, Ond{\v{r}}ej and Costa-juss{\`a}, Marta R and Federmann, Christian and Fishel, Mark and Graham, Yvette and Haddow, Barry and Huck, Matthias and Koehn, Philipp and Malmasi, Shervin and others},
+ booktitle={Proceedings of the Fourth Conference on Machine Translation (Volume 2: Shared Task Papers, Day 1)},
+ pages={1--61},
+ year={2019}
+}
+"""
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'wmt')
+_URL_FILE_STATS = load_checksum_stats(os.path.join(_CURR_DIR, '..', 'url_checksums', 'wmt.txt'))
+
+
+# Here, we make sure that the languages follow the standard ISO 639-1 language tags.
+# For more information about the language tags, you may refer to
+# https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+_PARA_URLS = {
+ 'europarl': {
+ 'v7': {
+ 'cs-en': {
+ 'url': 'http://www.statmt.org/europarl/v7/cs-en.tgz',
+ 'cs': 'europarl-v7.cs-en.cs',
+ 'en': 'europarl-v7.cs-en.en',
+ },
+ 'de-en': {
+ 'url': 'http://www.statmt.org/europarl/v7/de-en.tgz',
+ 'de': 'europarl-v7.de-en.de',
+ 'en': 'europarl-v7.de-en.en',
+ }
+ },
+ 'v8': {
+ 'url': 'http://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz',
+ 'fi-en': {
+ 'fi': 'training/europarl-v8.fi-en.fi',
+ 'en': 'training/europarl-v8.fi-en.en'
+ },
+ 'et-en': {
+ 'et': 'training/europarl-v8.et-en.et',
+ 'en': 'training/europarl-v8.et-en.en'
+ }
+ },
+ 'v9': {
+ 'cs-en': {
+ 'url': 'http://www.statmt.org/europarl/v9/training/europarl-v9.cs-en.tsv.gz',
+ 'all': 'europarl-v9.cs-en.tsv'
+ },
+ 'de-en': {
+ 'url': 'http://www.statmt.org/europarl/v9/training/europarl-v9.de-en.tsv.gz',
+ 'all': 'europarl-v9.de-en.tsv'
+ },
+ 'fi-en': {
+ 'url': 'http://www.statmt.org/europarl/v9/training/europarl-v9.fi-en.tsv.gz',
+ 'all': 'europarl-v9.fi-en.tsv'
+ },
+ 'lt-en': {
+ 'url': 'http://www.statmt.org/europarl/v9/training/europarl-v9.lt-en.tsv.gz',
+ 'all': 'europarl-v9.lt-en.tsv'
+ }
+ }
+ },
+ 'paracrawl': {
+ 'r3': {
+ 'en-cs': {
+ 'url': 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-cs.bicleaner07.tmx.gz',
+ 'all': 'en-cs.bicleaner07.tmx'
+ },
+ 'en-de': {
+ 'url': 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-de.bicleaner07.tmx.gz',
+ 'all': 'en-de.bicleaner07.tmx'
+ },
+ 'en-fi': {
+ 'url': 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-fi.bicleaner07.tmx.gz',
+ 'all': 'en-fi.bicleaner07.tmx'
+ },
+ 'en-lt': {
+ 'url': 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-lt.bicleaner07.tmx.gz',
+ 'all': 'en-lt.bicleaner07.tmx'
+ }
+ }
+ },
+ 'commoncrawl': {
+ 'wmt13': {
+ 'url': 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
+ 'de-en': {
+ 'de': 'commoncrawl.de-en.de',
+ 'en': 'commoncrawl.de-en.en',
+ }
+ }
+ },
+ 'newscommentary': {
+ 'v9': {
+ 'url': 'http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz',
+ 'de-en': {
+ 'de': 'training/news-commentary-v9.de-en.de',
+ 'en': 'training/news-commentary-v9.de-en.en'
+ }
+ },
+ 'v10': {
+ 'url': 'http://www.statmt.org/wmt15/training-parallel-nc-v10.tgz',
+ 'de-en': {
+ 'de': 'news-commentary-v10.de-en.de',
+                'en': 'news-commentary-v10.de-en.en'
+ }
+ },
+ 'v11': {
+ 'url': 'http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz',
+ 'de-en': {
+ 'de': 'training-parallel-nc-v11/news-commentary-v11.de-en.de',
+ 'en': 'training-parallel-nc-v11/news-commentary-v11.de-en.en'
+ }
+ },
+ 'v12': {
+ 'url': 'http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz',
+ 'de-en': {
+ 'de': 'training/news-commentary-v12.de-en.de',
+ 'en': 'training/news-commentary-v12.de-en.en',
+ },
+ 'zh-en': {
+ 'zh': 'training/news-commentary-v12.zh-en.zh',
+ 'en': 'training/news-commentary-v12.zh-en.en'
+ }
+ },
+ 'v13': {
+ 'url': 'http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz',
+ 'de-en': {
+ 'de': 'training-parallel-nc-v13/news-commentary-v13.de-en.de',
+ 'en': 'training-parallel-nc-v13/news-commentary-v13.de-en.en',
+ },
+ 'zh-en': {
+ 'zh': 'training-parallel-nc-v13/news-commentary-v13.zh-en.zh',
+ 'en': 'training-parallel-nc-v13/news-commentary-v13.zh-en.en'
+ }
+ },
+ 'v14': {
+ 'de-en': {
+ 'url': 'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.de-en.tsv.gz',
+ 'all': 'news-commentary-v14.de-en.tsv'
+ },
+ 'en-zh': {
+ 'url': 'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-zh.tsv.gz',
+ 'all': 'news-commentary-v14.en-zh.tsv'
+ }
+ }
+ },
+ 'wikititles': {
+ 'v1': {
+ 'cs-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.cs-en.tsv.gz',
+ 'all': 'wikititles-v1.cs-en.tsv'
+ },
+ 'cs-pl': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.cs-pl.tsv.gz',
+ 'all': 'wikititles-v1.cs-pl.tsv'
+ },
+ 'de-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.de-en.tsv.gz',
+ 'all': 'wikititles-v1.de-en.tsv'
+ },
+ 'es-pt': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.es-pt.tsv.gz',
+ 'all': 'wikititles-v1.es-pt.tsv'
+ },
+ 'fi-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.fi-en.tsv.gz',
+ 'all': 'wikititles-v1.fi-en.tsv'
+ },
+ 'gu-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.gu-en.tsv.gz',
+ 'all': 'wikititles-v1.gu-en.tsv'
+ },
+ 'hi-ne': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.hi-ne.tsv.gz',
+ 'all': 'wikititles-v1.hi-ne.tsv'
+ },
+ 'kk-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz',
+ 'all': 'wikititles-v1.kk-en.tsv'
+ },
+ 'lt-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.lt-en.tsv.gz',
+ 'all': 'wikititles-v1.lt-en.tsv'
+ },
+ 'ru-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz',
+ 'all': 'wikititles-v1.ru-en.tsv'
+ },
+ 'zh-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.zh-en.tsv.gz',
+ 'all': 'wikititles-v1.zh-en.tsv'
+ }
+ }
+ },
+ 'uncorpus': {
+ 'v1': {
+ 'en-zh': {
+ 'url': ['https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00',
+ 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01'],
+ 'en': 'en-zh/UNv1.0.en-zh.en',
+ 'zh': 'en-zh/UNv1.0.en-zh.zh'
+ }
+ }
+ },
+ # For the CWMT dataset, you can also download them from the official location: http://nlp.nju.edu.cn/cwmt-wmt/
+ # Currently, this version is processed via https://gist.github.com/sxjscience/54bedd68ce3fb69b3b1b264377efb5a5
+ 'cwmt': {
+ 'url': 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt.tar.gz',
+ 'zh-en': {
+ 'en': 'cwmt/cwmt-zh-en.en',
+ 'zh': 'cwmt/cwmt-zh-en.zh'
+ }
+ },
+ 'rapid': {
+ '2016': {
+ 'url': 'http://data.statmt.org/wmt17/translation-task/rapid2016.tgz',
+ 'de-en': {
+ 'de': 'rapid2016.de-en.de',
+ 'en': 'rapid2016.de-en.en'
+ }
+ },
+ '2019': {
+ 'de-en': {
+ 'url': 'https://s3-eu-west-1.amazonaws.com/tilde-model/rapid2019.de-en.zip',
+ 'de': 'rapid2019.de-en.de',
+ 'en': 'rapid2019.de-en.en'
+ }
+ }
+ },
+}
+
+_MONOLINGUAL_URLS = {
+ 'newscrawl': {
+ '2007': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2007.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2007.de',
+ }
+ },
+ '2008': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2008.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2008.de',
+ }
+ },
+ '2009': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2009.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2009.de',
+ }
+ },
+        '2010': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2010.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2010.de',
+ }
+ },
+ '2011': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2011.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2011.de',
+ }
+ },
+ '2012': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2012.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2012.de',
+ }
+ },
+ '2013': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2013.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2013.de',
+ }
+ },
+ '2014': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2014.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2014.de',
+ }
+ },
+ '2015': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2015.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2015.de',
+ }
+ },
+ '2016': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2016.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2016.de',
+ }
+ },
+ '2017': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2017.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2017.de',
+ }
+ },
+ '2018': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2018.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2018.de',
+ }
+ },
+ }
+}
+
+with open(os.path.join(_CURR_DIR, '..', 'url_checksums', 'mirror', 'wmt.json')) as wmt_mirror_map_f:
+ _WMT_MIRROR_URL_MAP = json.load(wmt_mirror_map_f)
+
+def _download_with_mirror(url, path, sha1_hash):
+ return download(
+ get_repo_url() + _WMT_MIRROR_URL_MAP[url] if url in _WMT_MIRROR_URL_MAP else url,
+ path=path,
+ sha1_hash=sha1_hash
+ )
+
+def _clean_space(s: str):
+ """Removes trailing and leading spaces and collapses multiple consecutive internal spaces to a single one.
+ This is borrowed from sacrebleu: https://github.com/mjpost/sacreBLEU/blob/069b0c88fceb29f3e24c3c19ba25342a3e7f96cb/sacrebleu.py#L1077
+
+ Parameters
+ ----------
+ s
+ The input string
+
+ Returns
+ -------
+ ret
+ The cleaned string
+ """
+ return re.sub(r'\s+', ' ', s.strip())
+
+
+def _get_buffer(path_or_buffer: Union[str, IO[AnyStr]], mode='r'):
+ if isinstance(path_or_buffer, str):
+ buf = open(path_or_buffer, mode)
+ else:
+ buf = path_or_buffer
+ return buf
+
+
+def parse_sgm(path_or_buffer: Union[str, IO[AnyStr]],
+ out_path_or_buffer: Optional[Union[str, IO[AnyStr]]] = None,
+ return_sentences=False,
+ clean_space=True) -> Optional[List[str]]:
+ """Returns sentences from a single SGML file. This is compatible to the behavior of
+ `input-from-sgm.perl` in
+ https://github.com/moses-smt/mosesdecoder/blob/a89691fee395bb7eb6dfd51e368825f0578f437d/scripts/ems/support/input-from-sgm.perl
+
+ Parameters
+ ----------
+ path_or_buffer
+ The source path to parse the file
+ out_path_or_buffer
+ The output path
+ return_sentences
+ Whether to return the parsed sentences
+ clean_space
+ Whether to clean the spaces in the sentence with the similar strategy in
+ input-from-sgm.perl.
+
+ Returns
+ -------
+ sentences
+        The list of parsed sentences in the input file.
+        If return_sentences is False, None is returned.
+ """
+ if out_path_or_buffer is None:
+ assert return_sentences, 'Must return sentences if the output path is not specified!'
+ if return_sentences:
+ sentences = []
+ else:
+ sentences = None
+ f_buffer = _get_buffer(path_or_buffer, 'r')
+ of_buffer = _get_buffer(out_path_or_buffer, 'w')
+    seg_re = re.compile(r'<seg id="\d+">(.*)</seg>.*?')
+ for line in f_buffer:
+ if isinstance(line, bytes):
+ line = line.decode('utf-8')
+ seg_match = re.match(seg_re, line)
+ if seg_match:
+ assert len(seg_match.groups()) == 1,\
+ 'File content is not supported, unmatched line: {}'.format(line)
+ line = seg_match.groups()[0]
+ if clean_space:
+ line = _clean_space(line)
+ if of_buffer is not None:
+ of_buffer.write(line + '\n')
+ if sentences is not None:
+ sentences.append(line)
+ if of_buffer is not None:
+ of_buffer.close()
+ return sentences
+
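+# A hypothetical usage sketch of parse_sgm (the file names below are illustrative only):
+#   parse_sgm('newstest2014-deen-src.de.sgm', 'test.raw.de')
+# would write one cleaned sentence per line to 'test.raw.de'.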
+
+def parse_paracrawl_tmx(path_or_buffer, src_lang, tgt_lang, out_src_path, out_tgt_path,
+ clean_space=True, filter_profanity=False):
+ candidate_lang = {src_lang, tgt_lang}
+ sent_num = 0
+ if filter_profanity:
+ src_profanity_filter = ProfanityFilter(langs=[src_lang])
+ tgt_profanity_filter = ProfanityFilter(langs=[tgt_lang])
+ has_src = False
+ has_tgt = False
+ src_sentence = None
+ tgt_sentence = None
+ f = _get_buffer(path_or_buffer)
+ src_out_f = open(out_src_path, 'w', encoding='utf-8')
+ tgt_out_f = open(out_tgt_path, 'w', encoding='utf-8')
+ for i, (_, elem) in enumerate(ElementTree.iterparse(f)):
+ if elem.tag == "tu":
+ for tuv in elem.iterfind("tuv"):
+ lang = None
+ for k, v in tuv.items():
+ if k.endswith('}lang'):
+ assert v in candidate_lang,\
+ 'Find language={} in data, which is not the same as either' \
+ ' the source/target languages={}/{}'.format(v, src_lang, tgt_lang)
+ lang = v
+ break
+ if lang is not None:
+ segs = tuv.findall("seg")
+ assert len(segs) == 1, "Invalid number of segments: {}".format(len(segs))
+ if lang == src_lang:
+ assert not has_src
+ has_src = True
+ src_sentence = segs[0].text
+ else:
+ assert not has_tgt
+ has_tgt = True
+ tgt_sentence = segs[0].text
+ if has_src and has_tgt:
+ has_src, has_tgt = False, False
+ if clean_space:
+ # Merge the spaces
+ src_sentence = _clean_space(src_sentence)
+ tgt_sentence = _clean_space(tgt_sentence)
+ if filter_profanity:
+ if src_profanity_filter.match(src_sentence)\
+ or tgt_profanity_filter.match(tgt_sentence):
+ continue
+ sent_num += 1
+ if sent_num % 500000 == 0:
+ print('Processed {} sentences'.format(sent_num))
+ src_out_f.write(src_sentence + '\n')
+ tgt_out_f.write(tgt_sentence + '\n')
+ elem.clear()
+ src_out_f.close()
+ tgt_out_f.close()
+    assert not has_src and not has_tgt,\
+        'The number of source and target sentences are not the same.'
+
+
+def parse_tsv(path_or_buffer, src_out_path, tgt_out_path):
+ in_f = _get_buffer(path_or_buffer, 'r')
+ src_out_f = _get_buffer(src_out_path, 'w')
+ tgt_out_f = _get_buffer(tgt_out_path, 'w')
+ for line in in_f:
+ line = line.strip()
+ split_data = line.split('\t')
+ if len(split_data) == 2:
+ # Here, some lines may be corrupted and may not have a target translation
+ src_sentence, tgt_sentence = split_data
+ src_out_f.write(src_sentence + '\n')
+ tgt_out_f.write(tgt_sentence + '\n')
+
+
+def split_lang_pair(pair: str = 'de-en') -> Tuple[str, str]:
+ try:
+ src_lang, tgt_lang = pair.split('-')
+ except ValueError:
+        raise ValueError('pair must be in a format like "en-de", "zh-en". Received {}'
+ .format(pair))
+ return src_lang, tgt_lang
+
+
+def concatenate_files(fname_l: List[str],
+ out_fname: Optional[str] = None,
+ chunk_size: int = 128 * 1024) -> str:
+ """Concatenate multiple files into a single file. This is used to recover a large file that has
+ been split into multiple parts. E.g.,
+
+ UNv1.0.en-zh.tar.gz.00, UNv1.0.en-zh.tar.gz.01 --> UNv1.0.en-zh.tar.gz
+
+ Parameters
+ ----------
+ fname_l
+ out_fname
+ chunk_size
+
+ Returns
+ -------
+ ret
+ """
+ assert len(fname_l) > 1
+ ext_l = []
+ base_prefix, ext = os.path.splitext(fname_l[0])
+ ext_l.append(ext)
+ for i in range(1, len(fname_l)):
+ prefix, ext = os.path.splitext(fname_l[i])
+ ext_l.append(ext)
+ if prefix != base_prefix:
+ raise ValueError('Cannot concatenate the input files! The prefix does not match! '
+                         'Found prefix={}, expected prefix={}'.format(prefix, base_prefix))
+ fname_ext_l = sorted(zip(fname_l, ext_l), key=lambda ele: ele[1])
+ if out_fname is None:
+ out_fname = base_prefix
+ with open(out_fname, 'wb') as of:
+ for fname, _ in fname_ext_l:
+ with open(fname, 'rb') as infile:
+ for block in iter(functools.partial(infile.read, chunk_size), b''):
+ of.write(block)
+ return out_fname
+
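+# For example, the two-part UN corpus archive listed in _PARA_URLS can be recovered with a call
+# like the following (a usage sketch; the part files must already exist locally):
+#   concatenate_files(['UNv1.0.en-zh.tar.gz.00', 'UNv1.0.en-zh.tar.gz.01'])
+# which writes and returns 'UNv1.0.en-zh.tar.gz'.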
+
+def extract_mono_corpus(compressed_data_path, lang, name, out_src_path):
+ tmp_dir = os.path.join(os.path.dirname(compressed_data_path), 'raw_data')
+ if not os.path.exists(tmp_dir):
+ os.makedirs(tmp_dir)
+ # Uncompress data
+ if compressed_data_path.endswith('.gz'):
+ with gzip.open(compressed_data_path) as f_in:
+ with open(os.path.join(tmp_dir, name), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ else:
+ raise NotImplementedError('Cannot process {}'.format(compressed_data_path))
+ # Parse data and move to the required src paths
+
+ shutil.copyfile(os.path.join(tmp_dir, name), out_src_path)
+
+ # Clean-up
+ shutil.rmtree(tmp_dir)
+
+
+def fetch_mono_dataset(selection: Union[str, List[str], List[List[str]]],
+ lang: str = 'de',
+ path: Optional[str] = _BASE_DATASET_PATH,
+ overwrite: bool = False) -> List[str]:
+ """Fetch the monolingual dataset provided by WMT
+
+ Parameters
+ ----------
+ selection
+ The selected datasets
+ lang
+ Language of the monolingual corpus
+    path
+        The path used to cache the downloaded dataset
+    overwrite
+ Whether to overwrite the downloaded dataset
+
+ Returns
+ -------
+ src_corpus_paths
+ """
+ base_url_info = _MONOLINGUAL_URLS
+ if isinstance(selection, str):
+ selection = [selection]
+ elif isinstance(selection, list):
+ if isinstance(selection[0], list):
+ corpus_paths = []
+ for ele in selection:
+ ele_corpus_paths =\
+ fetch_mono_dataset(ele, lang, path, overwrite)
+ corpus_paths.extend(ele_corpus_paths)
+ return corpus_paths
+ else:
+ raise NotImplementedError
+ for sel in selection:
+ base_url_info = base_url_info[sel]
+
+ # Check the pair is valid
+ available_lang = set(base_url_info.keys())
+ if 'url' in available_lang:
+ available_lang.remove('url')
+ if lang in available_lang:
+ matched_lang = '{}'.format(lang)
+ else:
+ raise ValueError('Unsupported lang, lang={}. All supported: {}'
+ .format(lang, available_lang))
+ save_dir_path = os.path.join(path, *(selection + [matched_lang]))
+ if not os.path.exists(save_dir_path):
+ os.makedirs(save_dir_path)
+ out_path = os.path.join(save_dir_path, lang + '.txt')
+ # Check for whether we can load the cached version
+ if os.path.exists(out_path) and not overwrite:
+ print('Found data in {}, skip:\n'
+ '\tSource: {}\n'.format(selection + [lang], out_path))
+ return [out_path]
+ lang_data_info = base_url_info[matched_lang]
+ if 'url' in lang_data_info:
+ url_l = lang_data_info['url']
+ else:
+ url_l = base_url_info['url']
+ # Download the data + Concatenate the file-parts (if necessary)
+ download_fname_l = []
+ if isinstance(url_l, str):
+ url_l = [url_l]
+ for url in url_l:
+ original_filename = url[url.rfind("/") + 1:]
+ sha1_hash = _URL_FILE_STATS[url]
+ if 'url' in lang_data_info:
+ save_path_l = [path] + selection + [matched_lang, original_filename]
+ else:
+ save_path_l = [path] + selection + [original_filename]
+ download_fname = _download_with_mirror(
+ url,
+ path=os.path.join(*save_path_l),
+ sha1_hash=sha1_hash
+ )
+ download_fname_l.append(download_fname)
+ if len(download_fname_l) > 1:
+ data_path = concatenate_files(download_fname_l)
+ else:
+ data_path = download_fname_l[0]
+
+ src_name = lang_data_info[lang]
+ print('Prepare data for {}\n'
+ '\tCompressed File: {}\n'
+ '\t{}: {}\n'.format(selection + [lang],
+ data_path,
+ lang, out_path))
+ extract_mono_corpus(data_path,
+ lang=lang,
+ name=src_name,
+ out_src_path=out_path)
+ return [out_path]
+
+
+def extract_src_tgt_corpus(compressed_data_path,
+ data_lang_pair, src_lang, tgt_lang,
+ src_name, tgt_name, src_tgt_name,
+ out_src_path, out_tgt_path):
+ data_src_lang, data_tgt_lang = split_lang_pair(data_lang_pair)
+ if not ((src_lang == data_src_lang and tgt_lang == data_tgt_lang) or
+ (src_lang == data_tgt_lang and tgt_lang == data_src_lang)):
+ raise ValueError('Mismatch src/tgt language. Required pair={}, Given src={}, tgt={}'
+ .format(data_lang_pair, src_lang, tgt_lang))
+ reverse_pair = (src_lang == data_tgt_lang) and (tgt_lang == data_src_lang)
+ if src_tgt_name is not None:
+ assert src_name is None and tgt_name is None
+ tmp_dir = os.path.join(os.path.dirname(compressed_data_path), 'raw_data')
+ if not os.path.exists(tmp_dir):
+ os.makedirs(tmp_dir)
+ # Uncompress data
+ if compressed_data_path.endswith('.tar.gz') or compressed_data_path.endswith('.tgz'):
+ with tarfile.open(compressed_data_path) as f:
+ if src_tgt_name is None:
+ f.extract(src_name, tmp_dir)
+ f.extract(tgt_name, tmp_dir)
+ else:
+                f.extract(src_tgt_name, tmp_dir)
+ elif compressed_data_path.endswith('.gz'):
+ assert src_tgt_name is not None
+ with gzip.open(compressed_data_path) as f_in:
+ with open(os.path.join(tmp_dir, src_tgt_name), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ elif compressed_data_path.endswith('.zip'):
+ with zipfile.ZipFile(compressed_data_path) as zip_handler:
+ if src_tgt_name is None:
+ with zip_handler.open(src_name) as f_in:
+ with open(os.path.join(tmp_dir, src_name), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ with zip_handler.open(tgt_name) as f_in:
+ with open(os.path.join(tmp_dir, tgt_name), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ else:
+ with zip_handler.open(src_tgt_name) as f_in:
+ with open(os.path.join(tmp_dir, src_tgt_name), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ else:
+ raise NotImplementedError('Cannot process {}'.format(compressed_data_path))
+ # Parse data and move to the required src/tgt path
+ if src_tgt_name is None:
+ if src_name.endswith('.sgm'):
+ parse_sgm(os.path.join(tmp_dir, src_name), out_src_path)
+ parse_sgm(os.path.join(tmp_dir, tgt_name), out_tgt_path)
+ else:
+ shutil.copyfile(os.path.join(tmp_dir, src_name), out_src_path)
+ shutil.copyfile(os.path.join(tmp_dir, tgt_name), out_tgt_path)
+ else:
+ if src_tgt_name.endswith('.tmx'):
+ parse_paracrawl_tmx(os.path.join(tmp_dir, src_tgt_name),
+ src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ out_src_path=out_src_path,
+ out_tgt_path=out_tgt_path,
+ clean_space=True,
+ filter_profanity=False)
+ elif src_tgt_name.endswith('.tsv'):
+ if reverse_pair:
+ parse_tsv(os.path.join(tmp_dir, src_tgt_name), out_tgt_path, out_src_path)
+ else:
+ parse_tsv(os.path.join(tmp_dir, src_tgt_name), out_src_path, out_tgt_path)
+ else:
+ raise NotImplementedError
+ # Clean-up
+ shutil.rmtree(tmp_dir)
+
+
+def fetch_wmt_parallel_dataset(selection: Union[str, List[str], List[List[str]]],
+ lang_pair: str = 'de-en',
+ path: Optional[str] = _BASE_DATASET_PATH,
+ overwrite: bool = False) -> Tuple[List[str], List[str]]:
+ """
+
+ Parameters
+ ----------
+ selection
+ lang_pair
+ path
+ overwrite
+
+ Returns
+ -------
+ src_corpus_paths
+ target_corpus_paths
+ """
+ src_lang, tgt_lang = split_lang_pair(lang_pair)
+ base_url_info = _PARA_URLS
+ if isinstance(selection, str):
+ selection = [selection]
+ elif isinstance(selection, list):
+ if isinstance(selection[0], list):
+ src_corpus_paths = []
+ tgt_corpus_paths = []
+ for ele in selection:
+ ele_src_corpus_paths, ele_tgt_corpus_paths =\
+ fetch_wmt_parallel_dataset(ele, lang_pair, path, overwrite)
+ src_corpus_paths.extend(ele_src_corpus_paths)
+ tgt_corpus_paths.extend(ele_tgt_corpus_paths)
+ return src_corpus_paths, tgt_corpus_paths
+ else:
+ raise NotImplementedError
+ for sel in selection:
+ base_url_info = base_url_info[sel]
+ # Check the pair is valid
+ available_pairs = set(base_url_info.keys())
+ if 'url' in available_pairs:
+ available_pairs.remove('url')
+ if str(src_lang) + '-' + str(tgt_lang) in available_pairs:
+ matched_pair = '{}-{}'.format(src_lang, tgt_lang)
+ elif str(tgt_lang) + '-' + str(src_lang) in available_pairs:
+ matched_pair = '{}-{}'.format(tgt_lang, src_lang)
+ else:
+ raise ValueError('Unsupported pairs, src_lang={}, tgt_lang={}. All supported: {}'
+ .format(src_lang, tgt_lang, available_pairs))
+ save_dir_path = os.path.join(path, *(selection + [matched_pair]))
+ if not os.path.exists(save_dir_path):
+ os.makedirs(save_dir_path)
+ out_src_path = os.path.join(save_dir_path, src_lang + '.txt')
+ out_tgt_path = os.path.join(save_dir_path, tgt_lang + '.txt')
+ # Check for whether we can load the cached version
+ # TODO we can do something smarter here
+ if os.path.exists(out_src_path) and os.path.exists(out_tgt_path) and not overwrite:
+ print('Found data in {}, skip:\n'
+ '\tSource: {}\n'
+ '\tTarget: {}\n'.format(selection + [lang_pair], out_src_path, out_tgt_path))
+ return [out_src_path], [out_tgt_path]
+ pair_data_info = base_url_info[matched_pair]
+ if 'url' in pair_data_info:
+ url_l = pair_data_info['url']
+ else:
+ url_l = base_url_info['url']
+ # Download the data + Concatenate the file-parts (if necessary)
+ download_fname_l = []
+ if isinstance(url_l, str):
+ url_l = [url_l]
+ for url in url_l:
+ original_filename = url[url.rfind("/") + 1:]
+ sha1_hash = _URL_FILE_STATS[url]
+ if 'url' in pair_data_info:
+ save_path_l = [path] + selection + [matched_pair, original_filename]
+ else:
+ save_path_l = [path] + selection + [original_filename]
+ download_fname = _download_with_mirror(
+ url,
+ path=os.path.join(*save_path_l),
+ sha1_hash=sha1_hash
+ )
+ download_fname_l.append(download_fname)
+ if len(download_fname_l) > 1:
+ data_path = concatenate_files(download_fname_l)
+ else:
+ data_path = download_fname_l[0]
+ if 'all' in pair_data_info:
+ src_name, tgt_name, src_tgt_name = None, None, pair_data_info['all']
+ else:
+ src_name, tgt_name, src_tgt_name = pair_data_info[src_lang], pair_data_info[tgt_lang], None
+ print('Prepare data for {}\n'
+ '\tCompressed File: {}\n'
+ '\t{}: {}\n'
+ '\t{}: {}\n'.format(selection + [lang_pair],
+ data_path,
+ src_lang, out_src_path,
+ tgt_lang, out_tgt_path))
+ extract_src_tgt_corpus(data_path,
+ data_lang_pair=matched_pair,
+ src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ src_name=src_name,
+ tgt_name=tgt_name,
+ src_tgt_name=src_tgt_name,
+ out_src_path=out_src_path,
+ out_tgt_path=out_tgt_path)
+ assert file_line_number(out_src_path) == file_line_number(out_tgt_path)
+ return [out_src_path], [out_tgt_path]
+
+
+def download_mono_newscrawl(lang: str = 'de', path: str = _BASE_DATASET_PATH)\
+ -> List[str]:
+ """Download the train dataset used for WMT2014
+
+ Parameters
+ ----------
+ lang
+ path
+
+ Returns
+ -------
+ train_src_paths
+ """
+ if lang == 'de':
+ train_src_paths =\
+ fetch_mono_dataset([['newscrawl', '2017'],
+ ['newscrawl', '2018']],
+ lang=lang,
+ path=path)
+ else:
+ raise NotImplementedError
+ return train_src_paths
+
+
+def download_wmt14_train(lang_pair: str = 'en-de', path: str = _BASE_DATASET_PATH)\
+ -> Tuple[List[str], List[str]]:
+ """Download the train dataset used for WMT2014
+
+ Parameters
+ ----------
+ lang_pair
+ path
+
+ Returns
+ -------
+ train_src_paths
+ train_tgt_paths
+ """
+ if lang_pair == 'en-de' or lang_pair == 'de-en':
+ train_src_paths, train_tgt_paths =\
+ fetch_wmt_parallel_dataset([['europarl', 'v7'],
+ ['commoncrawl', 'wmt13'],
+ ['newscommentary', 'v9']], lang_pair, path=path)
+ else:
+ raise NotImplementedError
+ return train_src_paths, train_tgt_paths
+
+
+def download_wmt16_train(lang_pair: str = 'en-de', path: str = _BASE_DATASET_PATH)\
+ -> Tuple[List[str], List[str]]:
+ """Download the train dataset used for WMT2016
+
+ Parameters
+ ----------
+ lang_pair
+ path
+
+ Returns
+ -------
+ train_src_paths
+ train_tgt_paths
+
+ """
+ if lang_pair == 'en-de' or lang_pair == 'de-en':
+ train_src_paths, train_tgt_paths = \
+ fetch_wmt_parallel_dataset([['europarl', 'v7'],
+ ['commoncrawl', 'wmt13'],
+ ['newscommentary', 'v11']], lang_pair, path=path)
+ else:
+ raise NotImplementedError
+ return train_src_paths, train_tgt_paths
+
+
+def download_wmt17_train(lang_pair: str = 'en-de', path: str = _BASE_DATASET_PATH)\
+ -> Tuple[List[str], List[str]]:
+ """Download the train dataset used for WMT2017
+
+ Parameters
+ ----------
+ lang_pair
+ path
+
+ Returns
+ -------
+ train_src_paths
+ train_tgt_paths
+
+ """
+ if lang_pair == 'en-de' or lang_pair == 'de-en':
+ train_src_paths, train_tgt_paths = \
+ fetch_wmt_parallel_dataset([['europarl', 'v7'],
+ ['commoncrawl', 'wmt13'],
+ ['newscommentary', 'v12'],
+ ['rapid', '2016']], lang_pair, path=path)
+ elif lang_pair == 'zh-en' or lang_pair == 'en-zh':
+ train_src_paths, train_tgt_paths = \
+ fetch_wmt_parallel_dataset([['newscommentary', 'v13'],
+ ['uncorpus', 'v1'],
+ ['cwmt']], lang_pair, path=path)
+ else:
+ raise NotImplementedError
+ return train_src_paths, train_tgt_paths
+
+
+@DATA_PARSER_REGISTRY.register('prepare_wmt')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Downloading and Preprocessing WMT Datasets.')
+ parser.add_argument('--dataset', type=str, required=True,
+                        choices=['wmt2014', 'wmt2016', 'wmt2017', 'newscrawl'],
+ help='The dataset to use.')
+ parser.add_argument('--mono', action='store_true',
+ help='Download monolingual dataset.')
+ parser.add_argument('--mono_lang', type=str, default='de',
+ help='The monolingual language.')
+ parser.add_argument('--lang-pair', type=str, default='en-de',
+ help='The pair of source language and target language separated by "-", '
+ 'e.g. "en-de", "en-zh".')
+ parser.add_argument('--mode', choices=['path_only',
+ 'raw'],
+ default='raw',
+                        help='If the mode is "path_only", the script will only output the'
+                             ' paths of the raw corpora. If the mode is "raw", the script will'
+                             ' concatenate all the related corpora and save them to the folder.')
+ parser.add_argument('--save-path', type=str, default='wmt_data',
+ help='The path to save the dataset.')
+ parser.add_argument('--prefix', type=str, default='train.raw',
+ help='The prefix of the saved raw files.')
+    parser.add_argument('--overwrite', action='store_true',
+                        help='Whether to overwrite the saved raw files.')
+ parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
+ help='The path to cache the downloaded files.')
+ return parser
+
+
+def mono_main(args):
+ lang = args.mono_lang
+ if args.dataset.lower() == 'newscrawl':
+ if lang == 'de':
+ train_src_paths =\
+ download_mono_newscrawl('de', args.cache_path)
+ else:
+ raise NotImplementedError
+ else:
+ raise NotImplementedError
+ if args.mode == 'path_only':
+ print('Dataset: {}/{}'.format(args.dataset, args.mono_lang))
+ print('Train Source:')
+ for path in train_src_paths:
+ print('\t{}'.format(path))
+ elif args.mode == 'raw':
+ assert args.save_path is not None
+ if not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
+ print('Save to {}'.format(args.save_path))
+ raw_src_path = os.path.join(args.save_path, '{}.{}'.format(args.prefix, lang))
+ if not os.path.exists(raw_src_path) or args.overwrite:
+ with open(raw_src_path, 'wb') as out_f:
+ for ele_path in train_src_paths:
+ with open(ele_path, 'rb') as in_f:
+ shutil.copyfileobj(in_f, out_f)
+ else:
+ raise NotImplementedError
+
+
+@DATA_MAIN_REGISTRY.register('prepare_wmt')
+def main(args):
+ if args.mono:
+ mono_main(args)
+ else:
+ src_lang, tgt_lang = split_lang_pair(args.lang_pair)
+ if args.dataset.lower() == 'wmt2014':
+ if (src_lang, tgt_lang) in [('en', 'de'), ('de', 'en')]:
+ train_src_paths, train_tgt_paths =\
+ download_wmt14_train(args.lang_pair, args.cache_path)
+ else:
+ raise NotImplementedError
+ elif args.dataset.lower() == 'wmt2016':
+ if (src_lang, tgt_lang) in [('en', 'de'), ('de', 'en')]:
+ train_src_paths, train_tgt_paths =\
+ download_wmt16_train(args.lang_pair, args.cache_path)
+ else:
+ raise NotImplementedError
+ elif args.dataset.lower() == 'wmt2017':
+ if (src_lang, tgt_lang) in [('en', 'de'), ('de', 'en'),
+ ('zh', 'en'), ('en', 'zh')]:
+ train_src_paths, train_tgt_paths =\
+ download_wmt17_train(args.lang_pair, args.cache_path)
+ else:
+ raise NotImplementedError
+ else:
+ raise NotImplementedError
+ if args.mode == 'path_only':
+ print('Dataset: {}/{}'.format(args.dataset, args.lang_pair))
+ print('Train Source:')
+ for path in train_src_paths:
+ print('\t{}'.format(path))
+ print('Train Target:')
+ for path in train_tgt_paths:
+ print('\t{}'.format(path))
+ elif args.mode == 'raw':
+ if not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
+ print('Save to {}'.format(args.save_path))
+ raw_src_path = os.path.join(args.save_path, '{}.{}'.format(args.prefix, src_lang))
+ raw_tgt_path = os.path.join(args.save_path, '{}.{}'.format(args.prefix, tgt_lang))
+ if not os.path.exists(raw_src_path) or args.overwrite:
+ with open(raw_src_path, 'wb') as out_f:
+ for ele_path in train_src_paths:
+ with open(ele_path, 'rb') as in_f:
+ shutil.copyfileobj(in_f, out_f)
+ if not os.path.exists(raw_tgt_path) or args.overwrite:
+ with open(raw_tgt_path, 'wb') as out_f:
+ for ele_path in train_tgt_paths:
+ with open(ele_path, 'rb') as in_f:
+ shutil.copyfileobj(in_f, out_f)
+ assert file_line_number(raw_src_path) == file_line_number(raw_tgt_path)
+ else:
+ raise NotImplementedError
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
+
diff --git a/scripts/datasets/machine_translation/wmt2014_ende.sh b/scripts/datasets/machine_translation/wmt2014_ende.sh
new file mode 100644
index 0000000000..6557715365
--- /dev/null
+++ b/scripts/datasets/machine_translation/wmt2014_ende.sh
@@ -0,0 +1,78 @@
+SUBWORD_ALGO=$1
+SRC=en
+TGT=de
+SAVE_PATH=wmt2014_ende
+
+# Fetch the raw text
+nlp_data prepare_wmt \
+ --dataset wmt2014 \
+ --lang-pair ${SRC}-${TGT} \
+ --save-path ${SAVE_PATH}
+
+# We use sacrebleu to fetch the dev set (newstest2013) and test set (newstest2014)
+sacrebleu -t wmt13 -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/dev.raw.${SRC}
+sacrebleu -t wmt13 -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/dev.raw.${TGT}
+sacrebleu -t wmt14/full -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/test.raw.${SRC}
+sacrebleu -t wmt14/full -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/test.raw.${TGT}
+
+
+# Clean and tokenize the training + dev corpus
+cd ${SAVE_PATH}
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus train.raw.${SRC} \
+ --tgt-corpus train.raw.${TGT} \
+ --min-num-words 1 \
+ --max-num-words 100 \
+ --max-ratio 1.5 \
+ --src-save-path train.tok.${SRC} \
+ --tgt-save-path train.tok.${TGT}
+
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus dev.raw.${SRC} \
+ --tgt-corpus dev.raw.${TGT} \
+ --min-num-words 1 \
+ --max-num-words 100 \
+ --max-ratio 1.5 \
+ --src-save-path dev.tok.${SRC} \
+ --tgt-save-path dev.tok.${TGT}
+
+# For test corpus, we will just tokenize the data
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus test.raw.${SRC} \
+ --tgt-corpus test.raw.${TGT} \
+ --src-save-path test.tok.${SRC} \
+ --tgt-save-path test.tok.${TGT}
+
+# Learn BPE with the training data
+nlp_preprocess learn_subword --corpus train.tok.${SRC} train.tok.${TGT} \
+ --model ${SUBWORD_ALGO} \
+ --save-dir . \
+ --vocab-size 32768
+
+# Apply the learned codes to the training set
+for LANG in ${SRC} ${TGT}
+do
+nlp_preprocess apply_subword --model ${SUBWORD_ALGO}\
+ --output-type subword \
+ --model-path ${SUBWORD_ALGO}.model \
+ --vocab-path ${SUBWORD_ALGO}.vocab \
+ --corpus train.tok.${LANG} \
+ --save-path train.tok.${SUBWORD_ALGO}.${LANG}
+done
+
+# Apply the learned codes to the dev/test set
+for LANG in ${SRC} ${TGT}
+do
+ for SPLIT in dev test
+ do
+ nlp_preprocess apply_subword --model ${SUBWORD_ALGO} \
+ --output-type subword \
+ --model-path ${SUBWORD_ALGO}.model \
+ --vocab-path ${SUBWORD_ALGO}.vocab \
+ --corpus ${SPLIT}.tok.${LANG} \
+ --save-path ${SPLIT}.tok.${SUBWORD_ALGO}.${LANG}
+ done
+done
diff --git a/scripts/datasets/machine_translation/wmt2017_zhen.sh b/scripts/datasets/machine_translation/wmt2017_zhen.sh
new file mode 100644
index 0000000000..95e1b6492d
--- /dev/null
+++ b/scripts/datasets/machine_translation/wmt2017_zhen.sh
@@ -0,0 +1,89 @@
+SUBWORD_ALGO=$1
+SRC=zh
+TGT=en
+SAVE_PATH=wmt2017_zhen
+
+# Fetch the raw text
+nlp_data prepare_wmt \
+ --dataset wmt2017 \
+ --lang-pair ${SRC}-${TGT} \
+ --save-path ${SAVE_PATH}
+
+# We use sacrebleu to fetch the dev set and test set of wmt17
+sacrebleu -t wmt17/dev -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/dev.raw.${SRC}
+sacrebleu -t wmt17/dev -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/dev.raw.${TGT}
+sacrebleu -t wmt17 -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/test.raw.${SRC}
+sacrebleu -t wmt17 -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/test.raw.${TGT}
+
+
+# Clean and tokenize the training + dev corpus
+cd ${SAVE_PATH}
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus train.raw.${SRC} \
+ --tgt-corpus train.raw.${TGT} \
+ --src-tokenizer jieba \
+ --tgt-tokenizer moses \
+ --max-ratio 1.3 \
+ --min-num-words 3 \
+ --max-num-words 70 \
+ --src-save-path train.tok.${SRC} \
+ --tgt-save-path train.tok.${TGT}
+
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus dev.raw.${SRC} \
+ --tgt-corpus dev.raw.${TGT} \
+ --src-tokenizer jieba \
+ --tgt-tokenizer moses \
+ --max-ratio 1.3 \
+ --min-num-words 3 \
+ --max-num-words 70 \
+ --src-save-path dev.tok.${SRC} \
+ --tgt-save-path dev.tok.${TGT}
+
+# For test corpus, we will just tokenize the data
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus test.raw.${SRC} \
+ --tgt-corpus test.raw.${TGT} \
+ --src-tokenizer jieba \
+ --tgt-tokenizer moses \
+ --src-save-path test.tok.${SRC} \
+ --tgt-save-path test.tok.${TGT}
+
+# Learn BPE with the training data. We learn independent source/target vocabularies
+
+nlp_preprocess learn_subword --corpus train.tok.${SRC} \
+ --model ${SUBWORD_ALGO} \
+ --save-dir ./${SRC}_model \
+ --vocab-size 44000
+nlp_preprocess learn_subword --corpus train.tok.${TGT} \
+ --model ${SUBWORD_ALGO} \
+ --save-dir ./${TGT}_model \
+ --vocab-size 33000
+
+# Apply the learned codes to the training set
+for LANG in ${SRC} ${TGT}
+do
+nlp_preprocess apply_subword --model ${SUBWORD_ALGO}\
+ --output-type subword \
+ --model-path ${LANG}_model/${SUBWORD_ALGO}.model \
+ --vocab-path ${LANG}_model/${SUBWORD_ALGO}.vocab \
+ --corpus train.tok.${LANG} \
+ --save-path train.tok.${SUBWORD_ALGO}.${LANG}
+done
+
+# Apply the learned codes to the dev/test set
+for LANG in ${SRC} ${TGT}
+do
+ for SPLIT in dev test
+ do
+ nlp_preprocess apply_subword --model ${SUBWORD_ALGO} \
+ --output-type subword \
+ --model-path ${LANG}_model/${SUBWORD_ALGO}.model \
+ --vocab-path ${LANG}_model/${SUBWORD_ALGO}.vocab \
+ --corpus ${SPLIT}.tok.${LANG} \
+ --save-path ${SPLIT}.tok.${SUBWORD_ALGO}.${LANG}
+ done
+done
diff --git a/scripts/datasets/music_generation/README.md b/scripts/datasets/music_generation/README.md
new file mode 100644
index 0000000000..983c271de5
--- /dev/null
+++ b/scripts/datasets/music_generation/README.md
@@ -0,0 +1,42 @@
+# Music Generation
+
+We provide datasets for training a music generation model.
+
+## Maestro
+
+See https://magenta.tensorflow.org/datasets/maestro for a detailed introduction.
+
+```
+# Get V1 Dataset
+nlp_data prepare_music_midi --dataset maestro_v1
+
+# Get V2 Dataset
+nlp_data prepare_music_midi --dataset maestro_v2
+```
+
+## LakhMIDI
+
+See https://colinraffel.com/projects/lmd/ for more details
+
+```
+# Get Lakh MIDI Full Dataset
+nlp_data prepare_music_midi --dataset lmd_full
+
+# Get the subset of 45,129 files from LMD-full
+# which have been matched to entries in the Million Song Dataset
+nlp_data prepare_music_midi --dataset lmd_matched
+
+# Get the aligned version of lmd_matched
+nlp_data prepare_music_midi --dataset lmd_aligned
+
+# Get the clean midi data
+nlp_data prepare_music_midi --dataset clean_midi
+```
+
+## Geocities
+
+The Geocities collection of MIDI files.
+See https://archive.org/details/archiveteam-geocities-midi-collection-2009 for more details.
+```
+nlp_data prepare_music_midi --dataset geocities
+```
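+
+As a quick sanity check after downloading, you can enumerate the extracted MIDI files with a
+short Python sketch like the one below. The folder name and the `.mid`/`.midi` extensions are
+assumptions that depend on which dataset you fetched (by default the data is extracted into a
+folder named after the dataset).
+
+```python
+import glob
+import os
+
+dataset_dir = 'maestro_v2'  # default output folder for `--dataset maestro_v2` (assumption)
+midi_files = [p for ext in ('*.mid', '*.midi')
+              for p in glob.glob(os.path.join(dataset_dir, '**', ext), recursive=True)]
+print('Found {} MIDI files under {}'.format(len(midi_files), dataset_dir))
+```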
diff --git a/scripts/datasets/music_generation/__init__.py b/scripts/datasets/music_generation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/music_generation/prepare_music_midi.py b/scripts/datasets/music_generation/prepare_music_midi.py
new file mode 100644
index 0000000000..cb07cb5687
--- /dev/null
+++ b/scripts/datasets/music_generation/prepare_music_midi.py
@@ -0,0 +1,110 @@
+import argparse
+import os
+import tarfile
+from gluonnlp.base import get_data_home_dir
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+import zipfile
+
+_CITATIONS = """
+@phdthesis{raffel2016learning,
+ title={Learning-based methods for comparing sequences, with applications to audio-to-midi alignment and matching},
+ author={Raffel, Colin},
+ year={2016},
+ school={Columbia University}
+}
+
+@inproceedings{hawthorne2018enabling,
+ title={Enabling Factorized Piano Music Modeling and Generation with the {MAESTRO} Dataset},
+ author={Curtis Hawthorne and Andriy Stasyuk and Adam Roberts and Ian Simon and Cheng-Zhi Anna Huang and Sander Dieleman and Erich Elsen and Jesse Engel and Douglas Eck},
+ booktitle={International Conference on Learning Representations},
+ year={2019},
+ url={https://openreview.net/forum?id=r1lYRjC9F7},
+}
+"""
+
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'music_midi_data')
+
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'music_midi.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+_URLS = {
+ 'lmd_full': 'http://hog.ee.columbia.edu/craffel/lmd/lmd_full.tar.gz',
+ 'lmd_matched': 'http://hog.ee.columbia.edu/craffel/lmd/lmd_matched.tar.gz',
+ 'lmd_aligned': 'http://hog.ee.columbia.edu/craffel/lmd/lmd_aligned.tar.gz',
+ 'clean_midi': 'http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz',
+ 'maestro_v1': 'https://storage.googleapis.com/magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0-midi.zip',
+ 'maestro_v2': 'https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip',
+ 'geocities': 'https://archive.org/download/archiveteam-geocities-midi-collection-2009/2009.GeoCities.MIDI.ArchiveTeam.zip'
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_music_midi')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Download the Music Midi Datasets.')
+ parser.add_argument('--dataset', type=str, required=True,
+ choices=['lmd_full', 'lmd_matched', 'lmd_aligned', 'clean_midi',
+ 'maestro_v1', 'maestro_v2', 'geocities'],
+ help='The dataset to download.')
+ parser.add_argument('--save-dir', type=str, default=None,
+ help='The directory to save the dataset.'
+ ' By default, it will save to a folder with the same name as the '
+ 'dataset')
+ parser.add_argument('--overwrite', action='store_true',
+ help='Whether to overwrite the directory.')
+ parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
+ help='The temporary path to download the compressed dataset.')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_music_midi')
+def main(args):
+ # Download the data
+ url = _URLS[args.dataset]
+ file_hash = _URL_FILE_STATS[url]
+ target_download_location = os.path.join(args.cache_path, os.path.basename(url))
+ download(url, target_download_location, sha1_hash=file_hash)
+ if args.save_dir is None:
+ save_dir = args.dataset
+ else:
+ save_dir = args.save_dir
+    if not args.overwrite and os.path.exists(save_dir):
+        print('{} found, skip! Turn on --overwrite to force overwrite'.format(save_dir))
+        return
+ print('Extract the data from {} into {}'.format(target_download_location,
+ save_dir))
+    if args.dataset in ('lmd_full', 'lmd_matched', 'lmd_aligned', 'clean_midi'):
+        with tarfile.open(target_download_location) as f:
+            f.extractall(save_dir)
+    elif args.dataset in ('maestro_v1', 'maestro_v2', 'geocities'):
+        with zipfile.ZipFile(target_download_location, 'r') as fobj:
+            fobj.extractall(save_dir)
+    else:
+        raise NotImplementedError
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/pretrain_corpus/README.md b/scripts/datasets/pretrain_corpus/README.md
new file mode 100644
index 0000000000..1f49996bfb
--- /dev/null
+++ b/scripts/datasets/pretrain_corpus/README.md
@@ -0,0 +1,55 @@
+# Pretraining Corpus
+
+We provide a series of shared scripts for downloading/preparing the text corpus for pretraining NLP models.
+This helps create a unified text corpus for studying the performance of different pretraining algorithms.
+When releasing the datasets, we follow the [FAIR principle](https://www.go-fair.org/fair-principles/),
+i.e., the dataset needs to be findable, accessible, interoperable, and reusable.
+
+## BookCorpus
+Unfortunately, we are unable to provide the original [Toronto BookCorpus dataset](https://yknzhu.wixsite.com/mbweb) due to licensing issues.
+
+There are some open source efforts for reproducing the dataset, e.g.,
+ using [soskek/bookcorpus](https://github.com/soskek/bookcorpus) or directly downloading the [preprocessed version](https://drive.google.com/file/d/16KCjV9z_FHm8LgZw05RSuk4EsAWPOP_z/view).
+
+Nevertheless, we use [Project Gutenberg](https://www.gutenberg.org/) as an alternative to the Toronto BookCorpus.
+
+You can use the following command to download and prepare the Gutenberg dataset.
+
+```bash
+python3 prepare_bookcorpus.py --dataset gutenberg
+```
+
+Also, you should follow the [license](https://www.gutenberg.org/wiki/Gutenberg:The_Project_Gutenberg_License) for using the data.
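+
+After the command above finishes, each book is stored as an individual `.txt` file, so the corpus
+can be iterated with plain Python. The sketch below assumes the default output folder `gutenberg`
+(i.e., no `--save_dir` was passed) and simply reports the size of each book:
+
+```python
+import glob
+import os
+
+# Iterate over the extracted Gutenberg books (one .txt file per book)
+for path in sorted(glob.glob(os.path.join('gutenberg', '*.txt'))):
+    with open(path, encoding='utf-8', errors='ignore') as f:
+        text = f.read()
+    print('{}: {} characters'.format(os.path.basename(path), len(text)))
+```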
+
+## Wikipedia
+
+Please install [attardi/wikiextractor](https://github.com/attardi/wikiextractor) for preparing the data.
+
+```bash
+# Download
+python3 prepare_wikipedia.py --mode download --lang en --date latest -o ./
+
+# Properly format the text files
+python3 prepare_wikipedia.py --mode format -i [path-to-wiki.xml.bz2] -o ./
+
+```
+The process of downloading and formatting is time consuming, so we also offer an alternative: downloading the prepared raw text file from our S3 bucket. This raw text file is in English, was dumped on 2020-06-20, and was formatted by the very process above (`--lang en --date 20200620`).
+
+```bash
+python3 prepare_wikipedia.py --mode download_prepared -o ./
+```
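+
+The prepared Wikipedia text stores one article per line (see the `merge` step in
+`prepare_wikipedia.py`), so it can be read back with a few lines of Python. This is only a sketch:
+the `prepared_wikipedia` folder name is what the format step creates under the output directory,
+and the layout of the `download_prepared` archive may differ.
+
+```python
+import glob
+
+# Count the articles in the prepared Wikipedia text files (one article per non-empty line)
+num_articles = 0
+for fname in glob.glob('prepared_wikipedia/*.txt'):
+    with open(fname, encoding='utf-8') as f:
+        num_articles += sum(1 for line in f if line.strip())
+print('Number of articles:', num_articles)
+```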
+### References
+- [NVIDIA/DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT)
+- [attardi/wikiextractor](https://github.com/attardi/wikiextractor)
+
+## OpenWebText
+
+You can download the OpenWebText from [link](https://skylion007.github.io/OpenWebTextCorpus/).
+After downloading and extracting the OpenWebText (i.e., `tar xf openwebtext.tar.xz`), you can use the following command to preprocess the dataset.
+
+```bash
+python3 prepare_openwebtext.py --input openwebtext/ --output prepared_owt --shuffle
+```
+
+In this step, the archived txt files are read directly without being decompressed to disk.
+They are concatenated into a single txt file with the same name as the archive, using empty lines to separate documents.
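+
+Each prepared txt file can later be streamed back as a sequence of documents by splitting on the
+empty separator lines. A minimal reading sketch (the `prepared_owt` folder name matches the
+command above and is otherwise an assumption):
+
+```python
+import glob
+
+def iter_documents(prepared_dir='prepared_owt'):
+    """Yield one document (as a single string) at a time from the prepared txt files."""
+    for fname in sorted(glob.glob('{}/*.txt'.format(prepared_dir))):
+        with open(fname, encoding='utf-8') as f:
+            lines = []
+            for line in f:
+                if line.strip():
+                    lines.append(line.rstrip('\n'))
+                elif lines:
+                    # An empty line marks the end of a document
+                    yield '\n'.join(lines)
+                    lines = []
+            if lines:
+                yield '\n'.join(lines)
+
+# Example: count the documents in the prepared corpus
+print(sum(1 for _ in iter_documents()))
+```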
diff --git a/scripts/datasets/pretrain_corpus/__init__.py b/scripts/datasets/pretrain_corpus/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/pretrain_corpus/prepare_bookcorpus.py b/scripts/datasets/pretrain_corpus/prepare_bookcorpus.py
new file mode 100644
index 0000000000..7e00f73a98
--- /dev/null
+++ b/scripts/datasets/pretrain_corpus/prepare_bookcorpus.py
@@ -0,0 +1,91 @@
+import glob
+import os
+import argparse
+import zipfile
+from gluonnlp.base import get_data_home_dir
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+
+
+_CITATIONS = r"""
+@InProceedings{lahiri:2014:SRW,
+ author = {Lahiri, Shibamouli},
+ title = {{Complexity of Word Collocation Networks: A Preliminary Structural Analysis}},
+ booktitle = {Proceedings of the Student Research Workshop at the 14th Conference of the European Chapter of the Association for Computational Linguistics},
+ month = {April},
+ year = {2014},
+ address = {Gothenburg, Sweden},
+ publisher = {Association for Computational Linguistics},
+ pages = {96--105},
+ url = {http://www.aclweb.org/anthology/E14-3011}
+}
+"""
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'book_corpus.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+# The Gutenberg dataset is downloaded from:
+# https://web.eecs.umich.edu/~lahiri/gutenberg_dataset.html, and
+# is a small subset of the Project Gutenberg corpus
+# The original link for
+# downloading is https://drive.google.com/file/d/0B2Mzhc7popBga2RkcWZNcjlRTGM/edit?usp=sharing
+
+_URLS = {
+ 'gutenberg':
+ 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/Gutenberg.zip',
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_bookcorpus')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Download and Prepare the BookCorpus dataset.')
+ parser.add_argument('--dataset', type=str, choices=['gutenberg'], default='gutenberg')
+ parser.add_argument('--mode', type=str, default='raw', choices=['raw', 'format'],
+ help='Specify the mode for preparing the data.'
+ ' "raw" means to download and extract the books into the output'
+                             ' folder, each file is a book and the filename is the title of the '
+                             'book. "format" means to format the extracted txt files for '
+                             'use in pretraining.')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='The directory to save the dataset. Default is the same as the'
+ ' dataset.')
+ parser.add_argument('--cache-path', type=str,
+ default=os.path.join(get_data_home_dir(), 'book_corpus'),
+ help='The temporary path to download the compressed dataset.')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_bookcorpus')
+def main(args):
+ url = _URLS[args.dataset]
+ file_hash = _URL_FILE_STATS[url]
+ target_download_location = os.path.join(args.cache_path,
+ os.path.basename(url))
+ download(url, target_download_location, sha1_hash=file_hash)
+ save_dir = args.dataset if args.save_dir is None else args.save_dir
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir, exist_ok=True)
+ if args.dataset == 'gutenberg':
+ if args.mode == 'raw':
+ with zipfile.ZipFile(target_download_location) as f:
+ for name in f.namelist():
+ if name.endswith('.txt'):
+ filename = os.path.basename(name)
+                        # Write each book directly as save_dir/<title>.txt (flatten the layout)
+                        with open(os.path.join(save_dir, filename), 'wb') as out_f:
+                            out_f.write(f.read(name))
+ else:
+ # TODO(zheyuye), format for pretraining
+ raise NotImplementedError
+ else:
+ raise NotImplementedError
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/pretrain_corpus/prepare_openwebtext.py b/scripts/datasets/pretrain_corpus/prepare_openwebtext.py
new file mode 100644
index 0000000000..ff3edf75f5
--- /dev/null
+++ b/scripts/datasets/pretrain_corpus/prepare_openwebtext.py
@@ -0,0 +1,106 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare the OpenWebText Dataset Corpus for pre-training. """
+
+import os
+import re
+import time
+import random
+import tarfile
+import argparse
+import functools
+import multiprocessing
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+
+_CITATIONS = r"""
+@misc{Gokaslan2019OpenWeb,
+ title={OpenWebText Corpus},
+ author={Aaron Gokaslan and Vanya Cohen},
+ howpublished = {\url{http://Skylion007.github.io/OpenWebTextCorpus}},
+ year={2019}
+}
+"""
+
+
+@DATA_PARSER_REGISTRY.register('prepare_openwebtext')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Prepare the OpenWebText corpus for pretraining')
+ parser.add_argument("-i", "--input", required=True,
+ help="path to openwebtext dataset")
+ parser.add_argument("-o", "--output", default="openwebtext",
+ help="directory for extracted files")
+ parser.add_argument("--num_process", type=int, default=8,
+ help="number of processes for multiprocessing")
+ parser.add_argument("--shuffle", action="store_true",
+                        help="Whether to shuffle the data order")
+ return parser
+
+
+def extract_files(full_name, output_dir, shuffle=False):
+ """
+ Extract the file and concatenate all the TXT files it archives
+ """
+ if not full_name.endswith(".xz"):
+ return
+ file_prefix = re.split(r'\.|/', full_name)[-2]
+ file_prefix = file_prefix.replace('urlsf_subset', 'openwebtext-prepared-')
+ with open("{}.txt".format(os.path.join(output_dir, file_prefix)), "w") as fp:
+ with tarfile.open(full_name) as t:
+ txt_names = t.getnames()
+ if shuffle:
+ random.shuffle(txt_names)
+ for txt_name in txt_names:
+ f = t.extractfile(txt_name)
+ for line in f.readlines():
+ # skip empty line
+ line = line.strip()
+ if line:
+ fp.write(line.decode() + '\n')
+                    # Write an extra empty line to mark the document separation
+ fp.write('\n')
+
+
+@DATA_MAIN_REGISTRY.register('prepare_openwebtext')
+def main(args):
+ num_process = min(multiprocessing.cpu_count(), args.num_process)
+ if not os.path.exists(args.output):
+ os.makedirs(args.output, exist_ok=True)
+ fnames = sorted(os.listdir(args.input))
+ fnames = [os.path.join(args.input, fname) for fname in fnames]
+ if args.shuffle:
+ random.shuffle(fnames)
+ print('Start extracting {} files with {} cores'.format(len(fnames), num_process))
+ start_time = time.time()
+ with multiprocessing.Pool(num_process) as pool:
+ iter = pool.imap(
+ functools.partial(
+ extract_files,
+ output_dir=args.output,
+ shuffle=args.shuffle),
+ fnames)
+ for f_index, _ in enumerate(iter):
+ if f_index > 0 and f_index % 250 == 0:
+ elapsed = time.time() - start_time
+ print("Extracted {:}, Elapsed: {:}s, ETA: {:}s, ".format(
+ f_index, int(elapsed), int((len(fnames) - f_index) / (f_index / elapsed))))
+
+ print("Done!")
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/pretrain_corpus/prepare_wikipedia.py b/scripts/datasets/pretrain_corpus/prepare_wikipedia.py
new file mode 100644
index 0000000000..481598c22e
--- /dev/null
+++ b/scripts/datasets/pretrain_corpus/prepare_wikipedia.py
@@ -0,0 +1,253 @@
+"""Prepare the Wikipedia dataset that contain cleaned articles of all languages."""
+import os
+import sys
+import glob
+import math
+import time
+import tarfile
+import argparse
+import multiprocessing
+
+from gluonnlp.registry import DATA_MAIN_REGISTRY, DATA_PARSER_REGISTRY
+from gluonnlp.utils.misc import download, load_checksum_stats
+
+_CITATION = """\
+@ONLINE {wikidump,
+ author = "Wikimedia Foundation",
+ title = "Wikimedia Downloads",
+ url = "https://dumps.wikimedia.org"
+}
+"""
+
+# See https://en.wikipedia.org/wiki/List_of_Wikipedias for details
+__LANGUAGES_BANK = [
+ "aa", "ab", "ace", "ady", "af", "ak", "als", "am", "an", "ang", "ar", "arc",
+ "arz", "as", "ast", "atj", "av", "ay", "az", "azb", "ba", "bar", "bat-smg",
+ "bcl", "be", "be-x-old", "bg", "bh", "bi", "bjn", "bm", "bn", "bo", "bpy",
+ "br", "bs", "bug", "bxr", "ca", "cbk-zam", "cdo", "ce", "ceb", "ch", "cho",
+ "chr", "chy", "ckb", "co", "cr", "crh", "cs", "csb", "cu", "cv", "cy", "da",
+ "de", "din", "diq", "dsb", "dty", "dv", "dz", "ee", "el", "eml", "en", "eo",
+ "es", "et", "eu", "ext", "fa", "ff", "fi", "fiu-vro", "fj", "fo", "fr",
+ "frp", "frr", "fur", "fy", "ga", "gag", "gan", "gd", "gl", "glk", "gn",
+ "gom", "gor", "got", "gu", "gv", "ha", "hak", "haw", "he", "hi", "hif",
+ "ho", "hr", "hsb", "ht", "hu", "hy", "ia", "id", "ie", "ig", "ii",
+ "ik", "ilo", "inh", "io", "is", "it", "iu", "ja", "jam", "jbo", "jv", "ka",
+ "kaa", "kab", "kbd", "kbp", "kg", "ki", "kj", "kk", "kl", "km", "kn", "ko",
+ "koi", "krc", "ks", "ksh", "ku", "kv", "kw", "ky", "la", "lad", "lb",
+ "lbe", "lez", "lfn", "lg", "li", "lij", "lmo", "ln", "lo", "lrc", "lt",
+ "ltg", "lv", "mai", "map-bms", "mdf", "mg", "mh", "mhr", "mi", "min", "mk",
+ "ml", "mn", "mr", "mrj", "ms", "mt", "mus", "mwl", "my", "myv", "mzn", "na",
+ "nah", "nap", "nds", "nds-nl", "ne", "new", "ng", "nl", "nn", "no", "nov",
+ "nrm", "nso", "nv", "ny", "oc", "olo", "om", "or", "os", "pa", "pag", "pam",
+ "pap", "pcd", "pdc", "pfl", "pi", "pih", "pl", "pms", "pnb", "pnt", "ps",
+ "pt", "qu", "rm", "rmy", "rn", "ro", "roa-rup", "roa-tara", "ru", "rue",
+ "rw", "sa", "sah", "sat", "sc", "scn", "sco", "sd", "se", "sg", "sh", "si",
+ "simple", "sk", "sl", "sm", "sn", "so", "sq", "sr", "srn", "ss", "st",
+ "stq", "su", "sv", "sw", "szl", "ta", "tcy", "te", "tet", "tg", "th", "ti",
+ "tk", "tl", "tn", "to", "tpi", "tr", "ts", "tt", "tum", "tw", "ty", "tyv",
+ "udm", "ug", "uk", "ur", "uz", "ve", "vec", "vep", "vi", "vls", "vo", "wa",
+ "war", "wo", "wuu", "xal", "xh", "xmf", "yi", "yo", "za", "zea", "zh",
+ "zh-classical", "zh-min-nan", "zh-yue", "zu"]
+
+_BASE_URL_TMPL\
+ = "https://dumps.wikimedia.org/{lang}wiki/{date}/{lang}wiki-{date}-pages-articles.xml.bz2"
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'wikipedia.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+_URLS = {
+ 'wikipedia-en-20200620':
+ 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/pretrain_corpus/wikipedia-en-20200620.tar.gz',
+}
+
+
+def get_url(lang, date):
+ return _BASE_URL_TMPL.format(lang=lang, date=date)
+
+
+def try_import_wikiextractor():
+ try:
+ sys.path.append(_CURR_DIR)
+ import WikiExtractor
+ except ImportError:
+ try:
+ download(
+ 'https://raw.githubusercontent.com/attardi/wikiextractor/master/WikiExtractor.py',
+ path=os.path.join(_CURR_DIR, 'WikiExtractor.py'),
+ sha1_hash='3c4896a837b75c476d23c037e8d6c7fdfd9a29eb')
+ sys.path.append(_CURR_DIR)
+ import WikiExtractor
+ except BaseException:
+ raise ImportError('Cannot import WikiExtractor! You can download the "WikiExtractor.py"'
+ ' in https://github.com/attardi/wikiextractor to {}'
+ .format(_CURR_DIR))
+ return WikiExtractor
+
+
+def get_formatting_list(wiki_path, recursive=False):
+ """
+ get formatting list of file names from extracted content
+ """
+ filenames = []
+ for dirname in glob.glob(os.path.join(wiki_path, '*'), recursive=False):
+ for filename in glob.glob(os.path.join(dirname, 'wiki_*'), recursive=recursive):
+ filenames.append(filename)
+ return filenames
+
+
+def merge(x):
+ """
+ Puts one article per line
+ """
+ file_list, output_filename = x
+ article_lines = []
+ article_open = False
+
+ with open(output_filename, mode='w', newline='\n') as ofile:
+ for filename in file_list:
+ with open(filename, mode='r', newline='\n') as file:
+ for line in file:
+                    if '<doc id=' in line:
+                        article_open = True
+                    elif '</doc>' in line:
+                        article_open = False
+ for oline in article_lines[1:]:
+ if oline != '\n':
+ ofile.write(oline.rstrip() + " ")
+ ofile.write("\n\n")
+ article_lines = []
+ else:
+ if article_open:
+ article_lines.append(line)
+
+
+@DATA_PARSER_REGISTRY.register('prepare_wikipedia')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Download and Prepare the Wikipedia')
+ parser.add_argument('--mode', type=str,
+ default='download+format',
+ choices=['download', 'format', 'download+format', 'download_prepared'],
+ help='Specify the action you want the app to take. '
+ '"download" means to download the Wikipedia dump. '
+ '"format" means to extract the content and '
+ 'format it for pretraining. "download+format" means to combine '
+                             'these two options. '
+                             '"download_prepared" downloads the prepared txt from S3 directly.')
+ parser.add_argument('--lang', type=str, default='en',
+                        help='Language of the wikipedia dump file. '
+                             'Only English and Chinese are supported in the current version.')
+ parser.add_argument('--date', type=str, default='latest',
+ help='Date of the wikipedia dump file. You can choose a date like '
+ '"--date 20200201" or use "--date latest"')
+ parser.add_argument("-i", "--input", default=None,
+ help="path to XML wiki dump file.")
+ parser.add_argument("-o", "--output", default="wikicorpus",
+ help="directory for downloaded or formatted files")
+ parser.add_argument("-b", "--bytes", default="100M",
+ help="maximum bytes per extracted file (default %(default)s)",
+ metavar="n[KMG]")
+ parser.add_argument("--num_process", type=int, default=8,
+ help="number of processes for multiprocessing")
+ parser.add_argument("--num_out_files", type=int, default=1000,
+ help="Number of desired output files, where each is processed"
+ " independently by a worker.")
+ return parser
+
+
+def download_wikicorpus(lang, date, output):
+    """Download the Wikipedia dump.
+
+    lang: the language code, such as en or zh
+    date: the date of the Wikipedia dump in YYYYMMDD format, or 'latest'
+    output: the directory in which to store the downloaded dump
+    """
+ if not os.path.exists(output):
+ os.makedirs(output)
+ if lang not in __LANGUAGES_BANK:
+ raise ValueError('Unsupported language code')
+ language = lang.replace('-', '_')
+ output_file = os.path.join(output, 'download', language, date,
+ 'wikicorpus.xml.bz2')
+ download(get_url(language, date), output_file)
+ return output_file
+
+
+def format_wikicorpus(input, output, bytes, num_process, num_out_files):
+    if input is None:
+        raise ValueError('The input file is not specified.')
+    if not input.endswith('xml.bz2'):
+        raise ValueError('The input file must be a *.xml.bz2 dump.')
+ if not os.path.exists(output):
+ os.makedirs(output)
+
+ # Use WikiExtractor to extract the content
+ WikiExtractor = try_import_wikiextractor()
+ wiki_path = os.path.join(output, 'extracted')
+ sys.argv = ['prog', '-b', bytes, '-o', wiki_path, input]
+ WikiExtractor.main()
+
+ # Merge extracted content into txt files
+ prepared_path = os.path.join(output, 'prepared_wikipedia')
+ if not os.path.exists(prepared_path):
+ os.makedirs(prepared_path)
+ filenames = get_formatting_list(wiki_path, recursive=True)
+ num_files = len(filenames)
+ num_out_files = min(num_out_files, num_files)
+ file_volume = math.ceil(num_files / num_out_files)
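+    # Split the shard filenames into num_out_files chunks so that each output
+    # file (and worker) merges roughly the same number of extracted shards.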
+ splited_files = [filenames[i: i + file_volume] for i in range(0, num_files, file_volume)]
+ num_out_files = len(splited_files)
+ output_files = [
+ os.path.join(
+ prepared_path,
+ "wikipedia-prepared-{}.txt".format(
+ str(i).zfill(4))) for i in range(num_out_files)]
+ print("All prepared raw text will be saved in {} txt files".format(num_out_files))
+ num_process = min(num_process, num_out_files)
+ print('Start preprocessing {} text files with {} cores'.format(num_files, num_process))
+ process_args = [(splited_files[i], output_files[i]) for i in range(num_out_files)]
+
+ start_time = time.time()
+ with multiprocessing.Pool(num_process) as pool:
+ f_read = 0
+ for i, _ in enumerate(pool.imap(merge, process_args)):
+ elapsed = time.time() - start_time
+ f_read += len(splited_files[i])
+            print("prepared {} files, Elapsed: {:.2f}s, ETA: {:.2f}s".format(
+                f_read, elapsed, (num_files - f_read) / (f_read / elapsed)))
+ print("Done preparation within {:.2f} seconds".format(elapsed))
+
+
+@DATA_MAIN_REGISTRY.register('prepare_wikipedia')
+def main(args):
+ num_process = min(multiprocessing.cpu_count(), args.num_process)
+ if args.mode == 'download':
+ download_wikicorpus(args.lang, args.date, args.output)
+ elif args.mode == 'format':
+ format_wikicorpus(args.input, args.output, args.bytes, num_process, args.num_out_files)
+ elif args.mode == 'download+format':
+ downloaded_file = download_wikicorpus(args.lang, args.date, args.output)
+ format_wikicorpus(downloaded_file, args.output, args.bytes, num_process, args.num_out_files)
+ elif args.mode == 'download_prepared':
+ url = _URLS['wikipedia-en-20200620']
+ file_hash = _URL_FILE_STATS[url]
+ target_download_location = os.path.join(args.output,
+ os.path.basename(url))
+ download(url, target_download_location, sha1_hash=file_hash)
+ tar = tarfile.open(target_download_location)
+ names = tar.getnames()
+ print('Start unarchiving raw text files')
+ start_time = time.time()
+ for name in names:
+ tar.extract(name, path=args.output)
+ tar.close()
+ print("Done unarchiving within {:.2f} seconds".format(time.time() - start_time))
+ else:
+ raise NotImplementedError
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == "__main__":
+ cli_main()
diff --git a/scripts/datasets/question_answering/README.md b/scripts/datasets/question_answering/README.md
new file mode 100644
index 0000000000..96e53f03dd
--- /dev/null
+++ b/scripts/datasets/question_answering/README.md
@@ -0,0 +1,101 @@
+# Question Answering
+
+## SQuAD
+The SQuAD datasets are distributed under the [CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/legalcode) license.
+
+Run the following commands to download SQuAD:
+
+```bash
+python3 prepare_squad.py --version 1.1 # Squad 1.1
+python3 prepare_squad.py --version 2.0 # Squad 2.0
+```
+
+For all supported datasets, we also provide a command-line toolkit for downloading them:
+
+```bash
+nlp_data prepare_squad --version 1.1
+nlp_data prepare_squad --version 2.0
+```
+
+The directory structure of the SQuAD dataset will be as follows, where `version` can be 1.1 or 2.0:
+```
+squad
+├── train-v{version}.json
+├── dev-v{version}.json
+```
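+
+As a quick sanity check of the downloaded files, the snippet below is a minimal sketch (not part of the toolkit) that assumes the standard SQuAD json schema (`data` -> `paragraphs` -> `qas`):
+
+```python
+import json
+
+# The path assumes the default --save-path of prepare_squad.py and version 2.0.
+with open('squad/train-v2.0.json', 'r', encoding='utf-8') as f:
+    squad = json.load(f)
+
+# Count the questions across all articles and paragraphs.
+num_questions = sum(len(paragraph['qas'])
+                    for article in squad['data']
+                    for paragraph in article['paragraphs'])
+print('articles:', len(squad['data']), 'questions:', num_questions)
+```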
+
+## SearchQA
+In accordance with the BSD-3-Clause license, we uploaded SearchQA to our S3 bucket and provide links to download the processed txt files. Please check out the [Google Drive link](https://drive.google.com/drive/u/0/folders/1kBkQGooNyG0h8waaOJpgdGtOnlb1S649) to download the raw and split files collected through web search using the scraper from the [GitHub repository](https://github.com/nyu-dl/dl4ir-searchQA).
+
+Download the SearchQA dataset with the Python script or the command-line toolkit:
+
+```bash
+python3 prepare_searchqa.py
+
+# Or download with command-line toolkits
+nlp_data prepare_searchqa
+```
+
+The directory structure of the SearchQA dataset will be as follows:
+```
+searchqa
+├── train.txt
+├── val.txt
+├── test.txt
+```
+
+## TriviaQA
+[TriviaQA](https://nlp.cs.washington.edu/triviaqa/) is an open-domain QA dataset. See more useful scripts in the [official GitHub repository](https://github.com/mandarjoshi90/triviaqa).
+
+Run the following commands to download TriviaQA:
+
+```bash
+python3 prepare_triviaqa.py --version rc # Download TriviaQA version 1.0 for RC (2.5G)
+python3 prepare_triviaqa.py --version unfiltered # Download unfiltered TriviaQA version 1.0 (604M)
+
+# Or download with command-line toolkits
+nlp_data prepare_triviaqa --version rc
+nlp_data prepare_triviaqa --version unfiltered
+```
+
+The directory structure of the TriviaQA (rc and unfiltered) datasets will be as follows:
+```
+triviaqa
+├── triviaqa-rc
+ ├── qa
+ ├── verified-web-dev.json
+ ├── web-dev.json
+ ├── web-train.json
+ ├── web-test-without-answers.json
+ ├── verified-wikipedia-dev.json
+ ├── wikipedia-test-without-answers.json
+ ├── wikipedia-dev.json
+ ├── wikipedia-train.json
+ ├── evidence
+ ├── web
+ ├── wikipedia
+
+├── triviaqa-unfiltered
+ ├── unfiltered-web-train.json
+ ├── unfiltered-web-dev.json
+ ├── unfiltered-web-test-without-answers.json
+```
+
+## HotpotQA
+HotpotQA is distributed under a [CC BY-SA 4.0 license](https://creativecommons.org/licenses/by-sa/4.0/). We only provide the download scripts (run with the following commands); please check out the [GitHub repository](https://github.com/hotpotqa/hotpot) for the details of preprocessing and evaluation.
+
+```bash
+python3 prepare_hotpotqa.py
+
+# Or download with command-line toolkits
+nlp_data prepare_hotpotqa
+```
+
+The directory structure of the HotpotQA dataset will be as follows:
+```
+hotpotqa
+├── hotpot_train_v1.1.json
+├── hotpot_dev_fullwiki_v1.json
+├── hotpot_dev_distractor_v1.json
+├── hotpot_test_fullwiki_v1.json
+```
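+
+As a quick check of the download, the following is a minimal sketch (not part of the toolkit) that assumes the standard HotpotQA layout, i.e. each json file is a list of records with fields such as `question`, `answer`, and `supporting_facts`:
+
+```python
+import json
+
+# The path assumes the default --save-path of prepare_hotpotqa.py.
+with open('hotpotqa/hotpot_dev_distractor_v1.json', 'r', encoding='utf-8') as f:
+    examples = json.load(f)
+
+print('number of examples:', len(examples))
+print('first question:', examples[0]['question'])
+```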
diff --git a/scripts/datasets/question_answering/__init__.py b/scripts/datasets/question_answering/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/question_answering/prepare_hotpotqa.py b/scripts/datasets/question_answering/prepare_hotpotqa.py
new file mode 100644
index 0000000000..f894b91f8a
--- /dev/null
+++ b/scripts/datasets/question_answering/prepare_hotpotqa.py
@@ -0,0 +1,62 @@
+import os
+import argparse
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.base import get_data_home_dir
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'hotpotqa')
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'hotpotqa.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+_CITATIONS = """
+@inproceedings{yang2018hotpotqa,
+ title={{HotpotQA}: A Dataset for Diverse, Explainable Multi-hop Question Answering},
+ author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William W. and Salakhutdinov, Ruslan and Manning, Christopher D.},
+ booktitle={Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
+ year={2018}
+}
+
+"""
+
+_URLS = {
+ 'train': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json',
+ 'dev_fullwiki': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json',
+ 'dev_distractor': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json',
+ 'test_fullwiki': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_test_fullwiki_v1.json',
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_hotpotqa')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Downloading the HotpotQA Dataset.')
+ parser.add_argument('--save-path', type=str, default='hotpotqa')
+ parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
+ help='The path to download the dataset.')
+ parser.add_argument('--overwrite', action='store_true')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_hotpotqa')
+def main(args):
+ if not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
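+    # Download every split into the cache directory and expose it in the save
+    # directory through a symlink, so repeated runs can reuse the cached copy.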
+ for url in _URLS.values():
+ file_name = url[url.rfind('/') + 1:]
+ file_hash = _URL_FILE_STATS[url]
+ download(url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
+ if not os.path.exists(os.path.join(args.save_path, file_name))\
+ or (args.overwrite and args.save_path != args.cache_path):
+ os.symlink(os.path.join(args.cache_path, file_name),
+ os.path.join(args.save_path, file_name))
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/question_answering/prepare_searchqa.py b/scripts/datasets/question_answering/prepare_searchqa.py
new file mode 100644
index 0000000000..f48236a944
--- /dev/null
+++ b/scripts/datasets/question_answering/prepare_searchqa.py
@@ -0,0 +1,61 @@
+import os
+import argparse
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.base import get_data_home_dir
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'searchqa')
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'searchqa.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+_CITATIONS = """
+@article{dunn2017searchqa,
+ title={Searchqa: A new q\&a dataset augmented with context from a search engine},
+ author={Dunn, Matthew and Sagun, Levent and Higgins, Mike and Guney, V Ugur and Cirik, Volkan and Cho, Kyunghyun},
+ journal={arXiv preprint arXiv:1704.05179},
+ year={2017}
+}
+
+"""
+
+_URLS = {
+ 'train': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/train.txt',
+ 'val': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/val.txt',
+ 'test': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/test.txt'
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_searchqa')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Downloading the SearchQA Dataset.')
+ parser.add_argument('--save-path', type=str, default='searchqa')
+ parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
+ help='The path to download the dataset.')
+ parser.add_argument('--overwrite', action='store_true')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_searchqa')
+def main(args):
+ if not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
+ for url in _URLS.values():
+ file_name = url[url.rfind('/') + 1:]
+ file_hash = _URL_FILE_STATS[url]
+ download(url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
+ if not os.path.exists(os.path.join(args.save_path, file_name))\
+ or (args.overwrite and args.save_path != args.cache_path):
+ os.symlink(os.path.join(args.cache_path, file_name),
+ os.path.join(args.save_path, file_name))
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/question_answering/prepare_squad.py b/scripts/datasets/question_answering/prepare_squad.py
new file mode 100644
index 0000000000..777a336609
--- /dev/null
+++ b/scripts/datasets/question_answering/prepare_squad.py
@@ -0,0 +1,81 @@
+import os
+import argparse
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.base import get_data_home_dir
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'squad')
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'squad.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+_CITATIONS = """
+@inproceedings{rajpurkar2016squad,
+ title={Squad: 100,000+ questions for machine comprehension of text},
+ author={Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy},
+ booktitle={EMNLP},
+ year={2016}
+}
+
+@inproceedings{rajpurkar2018know,
+ title={Know What You Don't Know: Unanswerable Questions for SQuAD},
+ author={Rajpurkar, Pranav and Jia, Robin and Liang, Percy},
+ booktitle={ACL},
+ year={2018}
+}
+
+"""
+
+_URLS = {
+ '1.1': {
+ 'train': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json',
+ 'dev': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json',
+ },
+ '2.0': {
+ 'train': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json',
+ 'dev': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json'
+ }
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_squad')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Downloading the SQuAD Dataset.')
+ parser.add_argument('--version', type=str, choices=['1.1', '2.0'], default='1.1',
+ help='Version of the squad dataset.')
+ parser.add_argument('--save-path', type=str, default='squad')
+ parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
+ help='The path to download the dataset.')
+ parser.add_argument('--overwrite', action='store_true')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_squad')
+def main(args):
+ train_url = _URLS[args.version]['train']
+ dev_url = _URLS[args.version]['dev']
+ train_file_name = train_url[train_url.rfind('/') + 1:]
+ dev_file_name = dev_url[dev_url.rfind('/') + 1:]
+    download(train_url, path=os.path.join(args.cache_path, train_file_name),
+             sha1_hash=_URL_FILE_STATS[train_url])
+    download(dev_url, path=os.path.join(args.cache_path, dev_file_name),
+             sha1_hash=_URL_FILE_STATS[dev_url])
+ if not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
+ if not os.path.exists(os.path.join(args.save_path, train_file_name))\
+ or (args.overwrite and args.save_path != args.cache_path):
+ os.symlink(os.path.join(args.cache_path, train_file_name),
+ os.path.join(args.save_path, train_file_name))
+ if not os.path.exists(os.path.join(args.save_path, dev_file_name))\
+ or (args.overwrite and args.save_path != args.cache_path):
+ os.symlink(os.path.join(args.cache_path, dev_file_name),
+ os.path.join(args.save_path, dev_file_name))
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/question_answering/prepare_triviaqa.py b/scripts/datasets/question_answering/prepare_triviaqa.py
new file mode 100644
index 0000000000..d67886fc1b
--- /dev/null
+++ b/scripts/datasets/question_answering/prepare_triviaqa.py
@@ -0,0 +1,77 @@
+import os
+import tarfile
+import argparse
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.base import get_data_home_dir
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'triviaqa')
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'triviaqa.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+_CITATIONS = """
+@InProceedings{JoshiTriviaQA2017,
+ author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
+ title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
+ booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
+ month = {July},
+ year = {2017},
+ address = {Vancouver, Canada},
+ publisher = {Association for Computational Linguistics},
+}
+
+"""
+
+_URLS = {
+ 'rc': 'https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz',
+ 'unfiltered': 'https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz'
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_triviaqa')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Downloading the TriviaQA Dataset.')
+    parser.add_argument('--version', type=str, choices=['rc', 'unfiltered'], default='rc',
+                        help='Version of the TriviaQA dataset.')
+ parser.add_argument('--save-path', type=str, default='triviaqa')
+ parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
+ help='The path to download the dataset.')
+ parser.add_argument('--overwrite', action='store_true')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_triviaqa')
+def main(args):
+
+ def extract(tar_path, target_path):
+ try:
+ tar = tarfile.open(tar_path, "r:gz")
+ file_names = tar.getnames()
+ for file_name in file_names:
+ tar.extract(file_name, target_path)
+ tar.close()
+ except Exception as e:
+ print(e)
+
+    tar_url = _URLS[args.version]
+ file_name = tar_url[tar_url.rfind('/') + 1:]
+ file_hash = _URL_FILE_STATS[tar_url]
+ download(tar_url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
+ if not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
+ if not os.path.exists(os.path.join(args.save_path, file_name))\
+ or (args.overwrite and args.save_path != args.cache_path):
+ os.symlink(os.path.join(args.cache_path, file_name),
+ os.path.join(args.save_path, file_name))
+ extract(os.path.join(args.save_path, file_name), args.save_path)
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/update_download_stats.py b/scripts/datasets/update_download_stats.py
new file mode 100644
index 0000000000..4ab37f9e52
--- /dev/null
+++ b/scripts/datasets/update_download_stats.py
@@ -0,0 +1,122 @@
+import hashlib
+import requests
+import time
+import os
+import copy
+from collections import OrderedDict
+from gluonnlp.cli.data.machine_translation.prepare_wmt\
+ import _PARA_URLS as wmt_para_urls, _MONOLINGUAL_URLS as wmt_mono_urls
+from gluonnlp.cli.data.question_answering.prepare_squad import _URLS as squad_urls
+from gluonnlp.cli.data.question_answering.prepare_triviaqa import _URLS as triviaqa_url
+from gluonnlp.cli.data.question_answering.prepare_hotpotqa import _URLS as hotpotqa_urls
+from gluonnlp.cli.data.question_answering.prepare_searchqa import _URLS as searchqa_urls
+from gluonnlp.cli.data.language_modeling.prepare_lm import _URLS as lm_urls
+from gluonnlp.cli.data.music_generation.prepare_music_midi import _URLS as midi_urls
+from gluonnlp.cli.data.pretrain_corpus.prepare_bookcorpus import _URLS as book_urls
+from gluonnlp.cli.data.general_nlp_benchmark.prepare_glue import SUPERGLUE_TASK2PATH as superglue_urls
+from gluonnlp.cli.data.general_nlp_benchmark.prepare_glue import GLUE_TASK2PATH as glue_urls
+
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_CHECK_SUM_BASE = os.path.join(_CURR_DIR, 'url_checksums')
+
+
+def get_hash_and_size(obj, retries=5, algorithm='sha1', cache=None, save_path=None,
+ verify_ssl=True):
+    """Fetch the hash and file size of every URL contained in the input object."""
+ def _get_hash_and_size(obj, retries, algorithm, cache=None, save_path=None):
+ if isinstance(obj, str):
+ if obj.startswith('http://') or obj.startswith('https://'):
+ url = obj
+ hex_hash = None
+ file_size = None
+ if cache is not None and obj in cache:
+ return obj, cache[obj]
+ while retries + 1 > 0:
+                # Disable pylint warning about catching a too-broad Exception
+ # pylint: disable=W0703
+ try:
+ if algorithm == 'sha1':
+ m = hashlib.sha1()
+ elif algorithm == 'sha256':
+ m = hashlib.sha256()
+ elif algorithm == 'md5':
+ m = hashlib.md5()
+ else:
+ raise NotImplementedError
+ print('Calculating hash of the file downloaded from {}...'.format(url))
+ start = time.time()
+ r = requests.get(url, stream=True, verify=verify_ssl)
+ if r.status_code != 200:
+ raise RuntimeError('Failed downloading url {}'.format(url))
+ f_size = 0
+ for chunk in r.iter_content(chunk_size=10240):
+ if chunk: # filter out keep-alive new chunks
+ m.update(chunk)
+ f_size += len(chunk)
+ hex_hash = m.hexdigest()
+ file_size = f_size
+ end = time.time()
+ print('{}={}, size={}, Time spent={}'.format(algorithm, hex_hash, file_size,
+ end - start))
+ if cache is None:
+ cache = OrderedDict()
+ cache[url] = (hex_hash, file_size)
+ if save_path is not None:
+ with open(save_path, 'a', encoding='utf-8') as of:
+ of.write('{} {} {}\n'.format(url, hex_hash, file_size))
+ break
+ except Exception as e:
+ retries -= 1
+ if retries <= 0:
+ raise e
+ print('download failed due to {}, retrying, {} attempt{} left'
+ .format(repr(e), retries, 's' if retries > 1 else ''))
+ return obj, (hex_hash, file_size)
+ else:
+ return obj
+ elif isinstance(obj, tuple):
+ return tuple((_get_hash_and_size(ele, retries, algorithm, cache, save_path)
+ for ele in obj))
+ elif isinstance(obj, list):
+ return [_get_hash_and_size(ele, retries, algorithm, cache, save_path) for ele in obj]
+ elif isinstance(obj, dict):
+ return {k: _get_hash_and_size(v, retries, algorithm, cache, save_path)
+ for k, v in obj.items()}
+ else:
+ return obj
+ if cache is None:
+ cache = OrderedDict()
+ else:
+ cache = copy.deepcopy(cache)
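+    # Seed the cache with hashes recorded in a previous run so that unchanged
+    # URLs do not need to be downloaded and hashed again.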
+ if save_path is not None and os.path.exists(save_path):
+ with open(save_path, 'r', encoding='utf-8') as f:
+ for line in f:
+ line = line.strip()
+ url, hex_hash, file_size = line.split()
+ cache[url] = (hex_hash, file_size)
+ _get_hash_and_size(obj, retries, algorithm, cache, save_path)
+ return obj, cache
+
+
+if __name__ == '__main__':
+ get_hash_and_size([wmt_para_urls, wmt_mono_urls],
+ save_path=os.path.join(_CHECK_SUM_BASE, 'wmt.txt'))
+ get_hash_and_size(squad_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'squad.txt'))
+ get_hash_and_size(hotpotqa_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'hotpotqa.txt'))
+ get_hash_and_size(triviaqa_url,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'triviaqa.txt'))
+    get_hash_and_size(searchqa_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'searchqa.txt'))
+ get_hash_and_size(lm_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'language_model.txt'))
+ get_hash_and_size(midi_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'music_midi.txt'))
+ get_hash_and_size(book_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'book_corpus.txt'))
+ get_hash_and_size(glue_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'glue.txt'))
+ get_hash_and_size(superglue_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'superglue.txt'))
diff --git a/scripts/datasets/url_checksums/book_corpus.txt b/scripts/datasets/url_checksums/book_corpus.txt
new file mode 100644
index 0000000000..abacb7b93e
--- /dev/null
+++ b/scripts/datasets/url_checksums/book_corpus.txt
@@ -0,0 +1 @@
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/Gutenberg.zip 91e842dc3671ed5a917b7ff6a60f5f87397780e2 461506225
diff --git a/scripts/datasets/url_checksums/glue.txt b/scripts/datasets/url_checksums/glue.txt
new file mode 100644
index 0000000000..f29bb8d9d7
--- /dev/null
+++ b/scripts/datasets/url_checksums/glue.txt
@@ -0,0 +1,14 @@
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4 19096246cd2a06d8fe2d13880d6cec61149f77c7 376971
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8 44f5954391612a8b3d9d65f6d4a824e9ae8d19ce 7439277
+https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt 716e0f67af962f08220b7e97d229b293077ef41f 1047044
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc 506c7a1a5e0dd551ceec2f84070fa1a8c2bc4b41 6222
+https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt 4265196c15cf75620b0b592b8b921f543bda7e6c 441275
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP-clean.zip?alt=media&token=11a647cb-ecd3-49c9-9d31-79f8ca8fe277 d775bd543ee78e3f64892a43ada949daf93e003d 41696084
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5 cc66d8533052de6d7475ac56dfce300751e070a4 802872
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce c22c684daa5cc9fad949d09d10ecedf94a2ce053 312783507
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df c60db4cc8820749e6af9f713f4d55109dd46e8c1 129820157
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601 6700cb1d2536bf512314b01350f9ac382439218e 10627589
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb 2eb8630df898b7d8df14ca9130c1ac1cf79eb376 697150
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf fc9834b5a8af4e1d8412e48bc38b477510a8c2d0 28999
+https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D c137a2020ab489011dc38fde9ee429f4e2c71257 222257
+https://www.dropbox.com/s/ju7d95ifb072q9f/diagnostic-full.tsv?dl=1 2f46c4b80fea8d3ea52a28e05467af3332fa65d9 265530
diff --git a/scripts/datasets/url_checksums/hotpotqa.txt b/scripts/datasets/url_checksums/hotpotqa.txt
new file mode 100644
index 0000000000..17b96cf3c6
--- /dev/null
+++ b/scripts/datasets/url_checksums/hotpotqa.txt
@@ -0,0 +1,4 @@
+http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json 08c42431c22984f362e94de0e635c7b858c3cff0 566426227
+http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json 825b6cfc34a61db41e82bbb14d978d5a834925f8 46320117
+http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json 96a41025612e8cb15989251102dc05efe9647eda 47454698
+http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_test_fullwiki_v1.json b30e4ff0d8b7bd808240e5609410f9c36279ef36 46213747
diff --git a/scripts/datasets/url_checksums/language_model.txt b/scripts/datasets/url_checksums/language_model.txt
new file mode 100644
index 0000000000..f5ce7ef716
--- /dev/null
+++ b/scripts/datasets/url_checksums/language_model.txt
@@ -0,0 +1,6 @@
+https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 3c914d17d80b1459be871a5039ac23e752a53cbe 4475746
+https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip 0aec09a7537b58d4bb65362fee27650eeaba625a 190229076
+http://mattmahoney.net/dc/enwik8.zip d856b1ccd937c51aeb9c342e47666fb8c38e7e72 36445475
+http://mattmahoney.net/dc/text8.zip 6c70299b93b7e1f927b42cd8f6ac1a31547c7a2e 31344016
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz 4df859766482e12264a5a9d9fb7f0e276020447d 1792209805
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt aa2322a3da82ef628011336c9b5c6059e4f56c3f 9507106
diff --git a/scripts/datasets/url_checksums/mirror/wmt.json b/scripts/datasets/url_checksums/mirror/wmt.json
new file mode 100644
index 0000000000..fa695f6bd9
--- /dev/null
+++ b/scripts/datasets/url_checksums/mirror/wmt.json
@@ -0,0 +1,48 @@
+{
+ "http://www.statmt.org/europarl/v7/cs-en.tgz" : "datasets/third_party_mirror/cs-en-28bad3e096923694fb776b6cd6ba1079546a9e58.tgz",
+ "http://www.statmt.org/europarl/v7/de-en.tgz" : "datasets/third_party_mirror/de-en-53bb5408d22977c89284bd755717e6bbb5b12bc5.tgz",
+ "http://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz" : "datasets/third_party_mirror/training-parallel-ep-v8-2f5c2c2c98b72921474a3f1837dc5b61dd44ba88.tgz",
+ "http://www.statmt.org/europarl/v9/training/europarl-v9.cs-en.tsv.gz" : "datasets/third_party_mirror/europarl-v9.cs-en.tsv-e36a1bfe634379ec813b399b57a38093df2349ef.gz",
+ "http://www.statmt.org/europarl/v9/training/europarl-v9.de-en.tsv.gz" : "datasets/third_party_mirror/europarl-v9.de-en.tsv-d553d0c8189642c1c7ae6ed3c265c847e432057c.gz",
+ "http://www.statmt.org/europarl/v9/training/europarl-v9.fi-en.tsv.gz" : "datasets/third_party_mirror/europarl-v9.fi-en.tsv-c5d2f6aad04e88dda6ad11a110f4ca24150edca3.gz",
+ "http://www.statmt.org/europarl/v9/training/europarl-v9.lt-en.tsv.gz" : "datasets/third_party_mirror/europarl-v9.lt-en.tsv-a6343d8fc158f44714ea7d01c0eb65b34640841d.gz",
+ "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz" : "datasets/third_party_mirror/training-parallel-commoncrawl-1c0ad85f0ebaf1d543acb009607205f5dae6627d.tgz",
+ "http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz" : "datasets/third_party_mirror/training-parallel-nc-v9-c7ae7f50cd45c2f3014d78ddba25a4a8a851e27a.tgz",
+ "http://www.statmt.org/wmt15/training-parallel-nc-v10.tgz" : "datasets/third_party_mirror/training-parallel-nc-v10-6c3c45b0f34d5e84a4d0b75a5edcca226ba7d6c2.tgz",
+ "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz" : "datasets/third_party_mirror/training-parallel-nc-v11-f51a1f03908e790d23d10001e92e09ce9555a790.tgz",
+ "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz" : "datasets/third_party_mirror/training-parallel-nc-v12-d98afc59e1d753485530b377ff65f1f891d3bced.tgz",
+ "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz" : "datasets/third_party_mirror/training-parallel-nc-v13-cbaa7834e58d36f228336e3caee6a9056029ff5d.tgz",
+ "http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.de-en.tsv.gz" : "datasets/third_party_mirror/news-commentary-v14.de-en.tsv-c1fd94c7c9ff222968cbd45100bdd8dbeb5ab2aa.gz",
+ "http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-zh.tsv.gz" : "datasets/third_party_mirror/news-commentary-v14.en-zh.tsv-4ca5c01deeba5425646d42f9598d081cd662908b.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.cs-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.cs-en.tsv-6e094d218dfd8f987fa1a18ea7b4cb127cfb1763.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.cs-pl.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.cs-pl.tsv-dc93d346d151bf73e4165d6db425b903fc21a5b0.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.de-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.de-en.tsv-e141c55c43a474e06c259c3fa401288b39cd4315.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.es-pt.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.es-pt.tsv-c3bd398d57471ee4ab33323393977b8d475a368c.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.fi-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.fi-en.tsv-5668b004567ca286d1aad9c2b45862a441d79667.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.gu-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.gu-en.tsv-95b9f15b6a86bfed6dc9bc91597368fd334f436e.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.hi-ne.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.hi-ne.tsv-6d63908950c72bc8cc69ca470deccff11354afc2.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.kk-en.tsv-56ee1e450ef98fe92ea2116c3ce7acc7c7c42b39.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.lt-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.lt-en.tsv-b8829928686727165eec6c591d2875d12d7c0cfe.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.ru-en.tsv-16d8d231fdf6347b4cc7834654adec80153ff7a4.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.zh-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.zh-en.tsv-5829097ff7dd61752f29fb306b04d790a1a1cfd7.gz",
+ "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00" : "datasets/third_party_mirror/UNv1.0.en-ru-98c4e01e16070567d27da0ab4fe401f309dd3678.tar.gz.00",
+ "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01" : "datasets/third_party_mirror/UNv1.0.en-ru-86c6013dc88f353d2d6e591928e7549060fcb949.tar.gz.01",
+ "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02" : "datasets/third_party_mirror/UNv1.0.en-ru-bf6b18a33c8cafa6889fd463fa8a2850d8877d35.tar.gz.02",
+ "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00" : "datasets/third_party_mirror/UNv1.0.en-zh-1bec5f10297512183e483fdd4984d207700657d1.tar.gz.00",
+ "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01" : "datasets/third_party_mirror/UNv1.0.en-zh-15df2968bc69ef7662cf3029282bbb62cbf107b1.tar.gz.01",
+ "http://data.statmt.org/wmt17/translation-task/rapid2016.tgz" : "datasets/third_party_mirror/rapid2016-8b173ce0bc77f2a1a57c8134143e3b5ae228a6e2.tgz",
+ "http://data.statmt.org/wmt19/translation-task/dev.tgz" : "datasets/third_party_mirror/dev-451ce2cae815c8392212ccb3f54f5dcddb9b2b9e.tgz",
+ "http://data.statmt.org/wmt19/translation-task/test.tgz" : "datasets/third_party_mirror/test-ce02a36fb2cd41abfa19d36eb8c8d50241ed3346.tgz",
+ "http://data.statmt.org/news-crawl/de/news.2007.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2007.de.shuffled.deduped-9d746b9df345f764e6e615119113c70e3fb0858c.gz",
+ "http://data.statmt.org/news-crawl/de/news.2008.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2008.de.shuffled.deduped-185a24e8833844486aee16cb5decf9a64da1c101.gz",
+ "http://data.statmt.org/news-crawl/de/news.2009.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2009.de.shuffled.deduped-9f7645fc6467de88f4205d94f483194838bad8ce.gz",
+ "http://data.statmt.org/news-crawl/de/news.2010.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2010.de.shuffled.deduped-f29b761194e9606f086102cfac12813931575818.gz",
+ "http://data.statmt.org/news-crawl/de/news.2011.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2011.de.shuffled.deduped-613b16e7a1cb8559dd428525a4c3b42c8a4dc278.gz",
+ "http://data.statmt.org/news-crawl/de/news.2012.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2012.de.shuffled.deduped-1bc419364ea3fe2f9ba4236947c012d4198d9282.gz",
+ "http://data.statmt.org/news-crawl/de/news.2013.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2013.de.shuffled.deduped-3edd84a7f105907608371c81babc7a9078f40aac.gz",
+ "http://data.statmt.org/news-crawl/de/news.2014.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2014.de.shuffled.deduped-1466c67b330c08ab5ab7d48e666c1d3a0bb4e479.gz",
+ "http://data.statmt.org/news-crawl/de/news.2015.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2015.de.shuffled.deduped-2c6d5ec9f8fe51e9eb762be8ff7107c6116c00c4.gz",
+ "http://data.statmt.org/news-crawl/de/news.2016.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2016.de.shuffled.deduped-e7d235c5d28e36dcf6382f1aa12c6ff37d4529bb.gz",
+ "http://data.statmt.org/news-crawl/de/news.2017.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2017.de.shuffled.deduped-f70b4a67bc04c0fdc2ec955b737fa22681e8c038.gz",
+ "http://data.statmt.org/news-crawl/de/news.2018.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2018.de.shuffled.deduped-43f8237de1e219276c0682255def13aa2cb80e35.gz"
+}
\ No newline at end of file
diff --git a/scripts/datasets/url_checksums/music_midi.txt b/scripts/datasets/url_checksums/music_midi.txt
new file mode 100644
index 0000000000..84394518ea
--- /dev/null
+++ b/scripts/datasets/url_checksums/music_midi.txt
@@ -0,0 +1,7 @@
+http://hog.ee.columbia.edu/craffel/lmd/lmd_full.tar.gz 330b3c67f24f9280f81e1f7ab12749087dd83f08 1768163879
+http://hog.ee.columbia.edu/craffel/lmd/lmd_matched.tar.gz 218b7c82ecb230a6679053e48e87714f0bd4836f 1407072670
+http://hog.ee.columbia.edu/craffel/lmd/lmd_aligned.tar.gz 9873e84dd5a531ba3623e0a24ce33a81681cba80 272169548
+http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz ae47e29dfc18d7779d95697a6461d759504c7a1c 234283029
+https://storage.googleapis.com/magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0-midi.zip e189d8a0b6769f3be576a036da840adafe489327 46579421
+https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip 13808bf9503c72371d38e9705e93ce8623b21c01 59243107
+https://archive.org/download/archiveteam-geocities-midi-collection-2009/2009.GeoCities.MIDI.ArchiveTeam.zip 493880759c648dd96167a2f4d394421e6fa33874 437506993
diff --git a/scripts/datasets/url_checksums/searchqa.txt b/scripts/datasets/url_checksums/searchqa.txt
new file mode 100644
index 0000000000..12ba03a7d5
--- /dev/null
+++ b/scripts/datasets/url_checksums/searchqa.txt
@@ -0,0 +1,3 @@
+s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/train.txt c7e1eb8c34d0525547b91e18b3f8f4d855e35c16 1226681217
+s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/test.txt 08a928e0f8c129d5b3ca43bf46df117e38be0c27 332064988
+s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/val.txt c2f65d6b83c26188d5998ab96bc6a38c1a127fcc 170835902
diff --git a/scripts/datasets/url_checksums/squad.txt b/scripts/datasets/url_checksums/squad.txt
new file mode 100644
index 0000000000..ee6f52e66f
--- /dev/null
+++ b/scripts/datasets/url_checksums/squad.txt
@@ -0,0 +1,4 @@
+https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json 1faea1252438a64f9718412a55036b786cfcc636 30288272
+https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json e1621aae0683b346ee9743bd5609266ba0cc34fc 4854279
+https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json ceb2acdea93b9d82ab1829c7b1e03bee9e302c99 42123633
+https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json 53ebaeb15bc5cab36645150f6f65d074348e2f3d 4370528
diff --git a/scripts/datasets/url_checksums/superglue.txt b/scripts/datasets/url_checksums/superglue.txt
new file mode 100644
index 0000000000..897bb2e490
--- /dev/null
+++ b/scripts/datasets/url_checksums/superglue.txt
@@ -0,0 +1,10 @@
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/CB.zip c16fa0a46f0f888d59767851c44d8db397896fe5 75482
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/COPA.zip ef110b215d7ff95a2fd2d0133f0959d324e9eec3 43986
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/MultiRC.zip 05bfcb1da7ea06742266f24503342fc20b2ab88a 1116225
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/RTE.zip 66105efeccc3fc54f9c5539de4c2d393d5bb4d36 750920
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WiC.zip 5b95487a3690abc718bc173ccd35bf084c43b10a 396213
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip 829ec3dd532284281cc19bacf9cded6c11d3452d 32751
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-b.zip 8c8874dcace4942dd00cf9f76c2537ea0e2026eb 33950
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-g.zip 949909079262bc4f6fb66bd889707aa71218975f 10413
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/BoolQ.zip 90bf152c8012869d326260709404ce5111a76b46 4118001
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/ReCoRD.zip af2825be511efa8fbc7813756e768efffb8fcc11 51757880
diff --git a/scripts/datasets/url_checksums/triviaqa.txt b/scripts/datasets/url_checksums/triviaqa.txt
new file mode 100644
index 0000000000..e31be83f58
--- /dev/null
+++ b/scripts/datasets/url_checksums/triviaqa.txt
@@ -0,0 +1,2 @@
+https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz aa7d8c01d4a5e563caaeb648e8c1f506e353ebd6 2665779500
+https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz 670ba904b286865e25bb67ebd31c25e7c74c18ae 632549060
diff --git a/scripts/datasets/url_checksums/wikipedia.txt b/scripts/datasets/url_checksums/wikipedia.txt
new file mode 100644
index 0000000000..2f4c117a9e
--- /dev/null
+++ b/scripts/datasets/url_checksums/wikipedia.txt
@@ -0,0 +1 @@
+https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/pretrain_corpus/wikipedia-en-20200620.tar.gz 1e1d77c31622744aaa45ff5bfbfca397154d9186 5068070627
diff --git a/scripts/datasets/url_checksums/wmt.txt b/scripts/datasets/url_checksums/wmt.txt
new file mode 100644
index 0000000000..195fdf1a6a
--- /dev/null
+++ b/scripts/datasets/url_checksums/wmt.txt
@@ -0,0 +1,58 @@
+http://www.statmt.org/europarl/v7/cs-en.tgz 28bad3e096923694fb776b6cd6ba1079546a9e58 62062621
+http://www.statmt.org/europarl/v7/de-en.tgz 53bb5408d22977c89284bd755717e6bbb5b12bc5 197785698
+http://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz 2f5c2c2c98b72921474a3f1837dc5b61dd44ba88 246201434
+http://www.statmt.org/europarl/v9/training/europarl-v9.cs-en.tsv.gz e36a1bfe634379ec813b399b57a38093df2349ef 68176874
+http://www.statmt.org/europarl/v9/training/europarl-v9.de-en.tsv.gz d553d0c8189642c1c7ae6ed3c265c847e432057c 204454328
+http://www.statmt.org/europarl/v9/training/europarl-v9.fi-en.tsv.gz c5d2f6aad04e88dda6ad11a110f4ca24150edca3 194574376
+http://www.statmt.org/europarl/v9/training/europarl-v9.lt-en.tsv.gz a6343d8fc158f44714ea7d01c0eb65b34640841d 64351345
+https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-cs.bicleaner07.tmx.gz 201fc692d4e730cc63e0b1274f98769eeab2faad 957135146
+https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-de.bicleaner07.tmx.gz 7930ac4d7aa1d17467edc04a45f3ed2ffe809a30 9091373722
+https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-fi.bicleaner07.tmx.gz 2485ce022a8027a4cec60eed0e35b989d2302e32 726455593
+https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-lt.bicleaner07.tmx.gz 926dfcd0aba9cc46e6e1a099047a49ee01745d63 286088883
+https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz 6a4c43a2fac39153f2320984a0f13bf4266696d8 667981874
+http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz 1c0ad85f0ebaf1d543acb009607205f5dae6627d 918311367
+http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz c7ae7f50cd45c2f3014d78ddba25a4a8a851e27a 80418416
+http://www.statmt.org/wmt15/training-parallel-nc-v10.tgz 6c3c45b0f34d5e84a4d0b75a5edcca226ba7d6c2 125136590
+http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz f51a1f03908e790d23d10001e92e09ce9555a790 75178032
+http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz d98afc59e1d753485530b377ff65f1f891d3bced 168591139
+http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz cbaa7834e58d36f228336e3caee6a9056029ff5d 113157482
+http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.de-en.tsv.gz c1fd94c7c9ff222968cbd45100bdd8dbeb5ab2aa 39390551
+http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-zh.tsv.gz 4ca5c01deeba5425646d42f9598d081cd662908b 36696769
+http://data.statmt.org/wikititles/v1/wikititles-v1.cs-en.tsv.gz 6e094d218dfd8f987fa1a18ea7b4cb127cfb1763 5112423
+http://data.statmt.org/wikititles/v1/wikititles-v1.cs-pl.tsv.gz dc93d346d151bf73e4165d6db425b903fc21a5b0 3525297
+http://data.statmt.org/wikititles/v1/wikititles-v1.de-en.tsv.gz e141c55c43a474e06c259c3fa401288b39cd4315 17919359
+http://data.statmt.org/wikititles/v1/wikititles-v1.es-pt.tsv.gz c3bd398d57471ee4ab33323393977b8d475a368c 7916897
+http://data.statmt.org/wikititles/v1/wikititles-v1.fi-en.tsv.gz 5668b004567ca286d1aad9c2b45862a441d79667 5101486
+http://data.statmt.org/wikititles/v1/wikititles-v1.gu-en.tsv.gz 95b9f15b6a86bfed6dc9bc91597368fd334f436e 177183
+http://data.statmt.org/wikititles/v1/wikititles-v1.hi-ne.tsv.gz 6d63908950c72bc8cc69ca470deccff11354afc2 184765
+http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz 56ee1e450ef98fe92ea2116c3ce7acc7c7c42b39 1575037
+http://data.statmt.org/wikititles/v1/wikititles-v1.lt-en.tsv.gz b8829928686727165eec6c591d2875d12d7c0cfe 1725255
+http://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz 16d8d231fdf6347b4cc7834654adec80153ff7a4 20299017
+http://data.statmt.org/wikititles/v1/wikititles-v1.zh-en.tsv.gz 5829097ff7dd61752f29fb306b04d790a1a1cfd7 12974754
+https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00 98c4e01e16070567d27da0ab4fe401f309dd3678 1073741824
+https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01 86c6013dc88f353d2d6e591928e7549060fcb949 1073741824
+https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02 bf6b18a33c8cafa6889fd463fa8a2850d8877d35 306221588
+https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00 1bec5f10297512183e483fdd4984d207700657d1 1073741824
+https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01 15df2968bc69ef7662cf3029282bbb62cbf107b1 312943879
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/casia2015.zip b432394685e4c53797e1ac86851f8a013aef27a2 98159063
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/casict2011.zip 769a9a86c24e9507dbf520b950b9026120cb041e 166957775
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/datum2015.zip 6d94cc8d296dd4268ed0a10fa3a419267280363e 100118018
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/datum2017.zip 480fa06760b2dbe7c9a9bd7c3fd5e5b22b860a45 37389573
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/neu2017.zip 532b56ba62f6cffccdc85f4316468873ca739bd1 148681171
+http://data.statmt.org/wmt17/translation-task/rapid2016.tgz 8b173ce0bc77f2a1a57c8134143e3b5ae228a6e2 163416042
+https://s3-eu-west-1.amazonaws.com/tilde-model/rapid2019.de-en.zip aafe431338abb98fc20951b2d6011223a1b91311 111888392
+http://data.statmt.org/wmt19/translation-task/dev.tgz 451ce2cae815c8392212ccb3f54f5dcddb9b2b9e 38654961
+http://data.statmt.org/wmt19/translation-task/test.tgz ce02a36fb2cd41abfa19d36eb8c8d50241ed3346 3533424
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt.tar.gz 88c2f4295169e9f0a9834bf8bff87e3fd4c04055 709032378
+http://data.statmt.org/news-crawl/de/news.2007.de.shuffled.deduped.gz 9d746b9df345f764e6e615119113c70e3fb0858c 90104365
+http://data.statmt.org/news-crawl/de/news.2008.de.shuffled.deduped.gz 185a24e8833844486aee16cb5decf9a64da1c101 308205291
+http://data.statmt.org/news-crawl/de/news.2009.de.shuffled.deduped.gz 9f7645fc6467de88f4205d94f483194838bad8ce 317590378
+http://data.statmt.org/news-crawl/de/news.2010.de.shuffled.deduped.gz f29b761194e9606f086102cfac12813931575818 170405229
+http://data.statmt.org/news-crawl/de/news.2011.de.shuffled.deduped.gz 613b16e7a1cb8559dd428525a4c3b42c8a4dc278 661772046
+http://data.statmt.org/news-crawl/de/news.2012.de.shuffled.deduped.gz 1bc419364ea3fe2f9ba4236947c012d4198d9282 854369573
+http://data.statmt.org/news-crawl/de/news.2013.de.shuffled.deduped.gz 3edd84a7f105907608371c81babc7a9078f40aac 1987692337
+http://data.statmt.org/news-crawl/de/news.2014.de.shuffled.deduped.gz 1466c67b330c08ab5ab7d48e666c1d3a0bb4e479 2018482812
+http://data.statmt.org/news-crawl/de/news.2015.de.shuffled.deduped.gz 2c6d5ec9f8fe51e9eb762be8ff7107c6116c00c4 1772843312
+http://data.statmt.org/news-crawl/de/news.2016.de.shuffled.deduped.gz e7d235c5d28e36dcf6382f1aa12c6ff37d4529bb 1276921550
+http://data.statmt.org/news-crawl/de/news.2017.de.shuffled.deduped.gz f70b4a67bc04c0fdc2ec955b737fa22681e8c038 1863251604
+http://data.statmt.org/news-crawl/de/news.2018.de.shuffled.deduped.gz 43f8237de1e219276c0682255def13aa2cb80e35 2000806230
diff --git a/scripts/index.rst b/scripts/index.rst
deleted file mode 100644
index ca7d84ac00..0000000000
--- a/scripts/index.rst
+++ /dev/null
@@ -1,93 +0,0 @@
-Model Zoo
-=========
-
-.. container:: cards
-
- .. card::
- :title: Word Embedding
- :link: word_embeddings/index.html
-
- Mapping words to vectors.
-
- .. card::
- :title: Language Modeling
- :link: language_model/index.html
-
- Learning the distribution and representation of sequences of words.
-
- .. card::
- :title: Machine Translation
- :link: machine_translation/index.html
-
- From "Hello" to "Bonjour".
-
- .. card::
- :title: Text Classification
- :link: text_classification/index.html
-
- Categorizing documents.
-
- .. card::
- :title: Sentiment Analysis
- :link: sentiment_analysis/index.html
-
- Classifying polarity of emotions and opinions.
-
- .. card::
- :title: Parsing
- :link: parsing/index.html
-
- Dependency parsing.
-
- .. card::
- :title: Natural Language Inference
- :link: natural_language_inference/index.html
-
- Determining if the premise semantically entails the hypothesis.
-
- .. card::
- :title: Text Generation
- :link: text_generation/index.html
-
- Generating text from language models.
-
- .. card::
- :title: BERT
- :link: bert/index.html
-
- Transferring pre-trained language representations to language understanding tasks.
-
- .. card::
- :title: Named Entity Recognition
- :link: ner/index.html
-
- Locating and classifying named entity mentioned in unstructured texts.
-
- .. card::
- :title: Intent Classification and Slot Labeling
- :link: intent_cls_slot_labeling/index.html
-
- Predicting the intent of the query and extracting semantic concepts in the query.
-
- .. card::
- :title: Model Conversion
- :link: model_zoo/conversion_tools/index.html
-
- Converting NLP models from other frameworks to GluonNLP.
-
-.. toctree::
- :hidden:
- :maxdepth: 1
-
- word_embeddings/index
- language_model/index
- machine_translation/index
- text_classification/index
- sentiment_analysis/index
- natural_language_inference/index
- text_generation/index
- parsing/index
- bert/index
- ner/index
- intent_cls_slot_labeling/index
- conversion_tools/index
diff --git a/scripts/intent_cls_slot_labeling/finetune_icsl.py b/scripts/intent_cls_slot_labeling/finetune_icsl.py
deleted file mode 100644
index 75c4aa3a14..0000000000
--- a/scripts/intent_cls_slot_labeling/finetune_icsl.py
+++ /dev/null
@@ -1,461 +0,0 @@
-"""
-Intent Classification and Slot Labelling with BERT
-
-=========================================================================================
-
-This example shows how to implement finetune a model with pre-trained BERT parameters for
-joint intent classification and slot labelling, with Gluon NLP Toolkit.
-
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation,arguments-differ,unused-variable,missing-docstring,wrong-import-order
-import os
-import sys
-import time
-import argparse
-import random
-import numpy as np
-import pandas as pd
-from tqdm import tqdm
-import mxnet as mx
-from mxnet import gluon
-from mxnet.gluon import nn, Block
-from seqeval.metrics import f1_score as ner_f1_score
-import gluonnlp as nlp
-from gluonnlp.data import BERTTokenizer, ATISDataset, SNIPSDataset
-
-nlp.utils.check_version('0.7.0')
-
-class BERTForICSL(Block):
- """Model
-
- """
- def __init__(self, bert, num_intent_classes, num_slot_classes, dropout_prob,
- prefix=None, params=None):
- """
-
- Parameters
- ----------
- bert : Block
- num_intent_classes : int
- num_slot_classes : int
- dropout_prob : float
- prefix : None or str
- params : None or ParamDict
- """
- super(BERTForICSL, self).__init__(prefix=prefix, params=params)
- self.bert = bert
- with self.name_scope():
- self.intent_classifier = nn.HybridSequential()
- with self.intent_classifier.name_scope():
- self.intent_classifier.add(nn.Dropout(rate=dropout_prob))
- self.intent_classifier.add(nn.Dense(units=num_intent_classes, flatten=False))
- self.slot_tagger = nn.HybridSequential()
- with self.slot_tagger.name_scope():
- self.slot_tagger.add(nn.Dropout(rate=dropout_prob))
- self.slot_tagger.add(nn.Dense(units=num_slot_classes, flatten=False))
-
- def forward(self, inputs, valid_length):
- """
-
- Parameters
- ----------
- inputs : NDArray
- The input sentences, has shape (batch_size, seq_length)
- valid_length : NDArray
- The valid length of the sentences
-
- Returns
- -------
- intent_scores : NDArray
- Shape (batch_size, num_classes)
- slot_scores : NDArray
- Shape (batch_size, seq_length, num_tag_types)
- """
- token_types = mx.nd.zeros_like(inputs)
- encoded_states, pooler_out = self.bert(inputs, token_types, valid_length)
- intent_scores = self.intent_classifier(pooler_out)
- slot_scores = self.slot_tagger(encoded_states)
- return intent_scores, slot_scores
-
-
-class IDSLSubwordTransform():
- """Transform the word_tokens/tags by the subword tokenizer
-
- """
- def __init__(self, subword_vocab, subword_tokenizer, slot_vocab, cased=False):
- """
-
- Parameters
- ----------
- subword_vocab : Vocab
- subword_tokenizer : Tokenizer
- cased : bool
- Whether to convert all characters to lower
- """
- self._subword_vocab = subword_vocab
- self._subword_tokenizer = subword_tokenizer
- self._slot_vocab = slot_vocab
- self._cased = cased
- self._slot_pad_id = self._slot_vocab['O']
-
-
- def __call__(self, word_tokens, tags, intent_ids):
- """ Transform the word_tokens/tags by the subword tokenizer
-
- Parameters
- ----------
- word_tokens : List[str]
- tags : List[str]
- intent_ids : np.ndarray
-
- Returns
- -------
- subword_ids : np.ndarray
- subword_mask : np.ndarray
- selected : np.ndarray
- padded_tag_ids : np.ndarray
- intent_label : int
- length : int
- """
- subword_ids = []
- subword_mask = []
- selected = []
- padded_tag_ids = []
- intent_label = intent_ids[0]
- ptr = 0
- for token, tag in zip(word_tokens, tags):
- if not self._cased:
- token = token.lower()
- token_sw_ids = self._subword_vocab[self._subword_tokenizer(token)]
- subword_ids.extend(token_sw_ids)
- subword_mask.extend([1] + [0] * (len(token_sw_ids) - 1))
- selected.append(ptr)
- padded_tag_ids.extend([self._slot_vocab[tag]] +
- [self._slot_pad_id] * (len(token_sw_ids) - 1))
- ptr += len(token_sw_ids)
- length = len(subword_ids)
- if len(subword_ids) != len(padded_tag_ids):
- print(word_tokens)
- print(tags)
- print(subword_ids)
- print(padded_tag_ids)
- return np.array(subword_ids, dtype=np.int32),\
- np.array(subword_mask, dtype=np.int32),\
- np.array(selected, dtype=np.int32),\
- np.array(padded_tag_ids, dtype=np.int32),\
- intent_label,\
- length
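-
-# For illustration, the alignment produced by IDSLSubwordTransform on a tiny, hypothetical
-# example (the exact subword split depends on the BERT vocabulary):
-#
-#   words        : ['long',      'beach']
-#   tags         : ['B-fromloc', 'I-fromloc']
-#   subwords     : ['long',      'be', '##ach']
-#   subword_mask : [1,           1,    0]           # 1 marks the first subword of each word
-#   selected     : [0, 1]                           # positions of the first subwords
-#   padded tags  : ['B-fromloc', 'I-fromloc', 'O']  # non-first subwords get the 'O' pad tag
-#
-# At evaluation time only the positions in `selected` are scored, so the slot F1 is computed on
-# word-level tags even though the model operates on subwords.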
-
-
-def parse_args():
- arg_parser = argparse.ArgumentParser(
- description='Train a BERT-based model for joint intent detection and slot filling on '
- 'ATIS/SNIPS dataset.',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- arg_parser.add_argument('--seed', type=int, default=123)
- arg_parser.add_argument('--dataset', choices=['atis', 'snips'], default='atis')
- arg_parser.add_argument('--bert-model', type=str, default='bert_12_768_12',
- help='Name of the BERT model')
- arg_parser.add_argument('--cased', action='store_true',
- help='Whether to use the cased model trained on book_corpus_wiki_en.'
-                                 ' Otherwise, use the uncased model.')
- arg_parser.add_argument('--dropout-prob', type=float, default=0.1,
- help='Dropout probability for the last layer')
- arg_parser.add_argument('--batch-size', type=int, default=32, help='Batch size for training')
-    arg_parser.add_argument('--epochs', type=int, default=40, help='Number of training epochs')
- arg_parser.add_argument('--optimizer', type=str, default='bertadam',
- help='Optimization algorithm to use')
- arg_parser.add_argument('--learning-rate', type=float, default=5e-5,
- help='Learning rate for optimization')
- arg_parser.add_argument('--wd', type=float, default=0.0,
- help='Weight decay')
- arg_parser.add_argument('--warmup-ratio', type=float, default=0.1,
- help='Warmup ratio for learning rate scheduling')
- arg_parser.add_argument('--slot-loss-mult', type=float, default=1.0,
- help='Multiplier for the slot loss.')
- arg_parser.add_argument('--save-dir', type=str, default='saved_model')
- arg_parser.add_argument('--gpu', type=int, default=None,
- help='Number (index) of GPU to run on, e.g. 0.'
- ' If not specified, uses CPU.')
- args = arg_parser.parse_args()
- return args
-
-
-
-def print_sample(dataset, sample_id):
- """ Print sample in the dataset
-
- Parameters
- ----------
- dataset : SimpleDataset
- sample_id: int
-
- Returns
- -------
- """
- word_tokens, tags, intent_ids = dataset[sample_id]
- print('Sample #ID: {} Intent: {}'.format(sample_id,
- [dataset.intent_vocab.idx_to_token[ele]
- for ele in intent_ids]))
- df = pd.DataFrame(list(zip(word_tokens, tags)))
- df.index.name = None
- print('Sequence:')
- print(df.to_string(header=False))
-
-
-def evaluation(ctx, data_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab):
- """ Evaluate the trained model
-
- Parameters
- ----------
- ctx : Context
- data_loader : DataLoader
- net : Block
- intent_pred_loss : Loss
- slot_pred_loss : Loss
- slot_vocab : Vocab
-
- Returns
- -------
- avg_intent_loss : float
- avg_slot_loss : float
- intent_acc : float
- slot_f1 : float
- pred_slots : list
- gt_slots : list
- """
- nsample = 0
- nslot = 0
- avg_intent_loss = 0
- avg_slot_loss = 0
- correct_intent = 0
- pred_slots = []
- gt_slots = []
- for token_ids, mask, selected, slot_ids, intent_label, valid_length in data_loader:
- token_ids = mx.nd.array(token_ids, ctx=ctx).astype(np.int32)
- mask = mx.nd.array(mask, ctx=ctx).astype(np.float32)
- slot_ids = mx.nd.array(slot_ids, ctx=ctx).astype(np.int32)
- intent_label = mx.nd.array(intent_label, ctx=ctx).astype(np.int32)
- valid_length = mx.nd.array(valid_length, ctx=ctx).astype(np.float32)
- batch_nslot = mask.sum().asscalar()
- batch_nsample = token_ids.shape[0]
- # Forward network
- intent_scores, slot_scores = net(token_ids, valid_length)
- intent_loss = intent_pred_loss(intent_scores, intent_label)
- slot_loss = slot_pred_loss(slot_scores, slot_ids, mask.expand_dims(axis=-1))
- avg_intent_loss += intent_loss.sum().asscalar()
- avg_slot_loss += slot_loss.sum().asscalar()
- pred_slot_ids = mx.nd.argmax(slot_scores, axis=-1).astype(np.int32)
- correct_intent += (mx.nd.argmax(intent_scores, axis=-1).astype(np.int32)
- == intent_label).sum().asscalar()
- for i in range(batch_nsample):
- ele_valid_length = int(valid_length[i].asscalar())
- ele_sel = selected[i].asnumpy()[:ele_valid_length]
- ele_gt_slot_ids = slot_ids[i].asnumpy()[ele_sel]
- ele_pred_slot_ids = pred_slot_ids[i].asnumpy()[ele_sel]
- ele_gt_slot_tokens = [slot_vocab.idx_to_token[v] for v in ele_gt_slot_ids]
- ele_pred_slot_tokens = [slot_vocab.idx_to_token[v] for v in ele_pred_slot_ids]
- gt_slots.append(ele_gt_slot_tokens)
- pred_slots.append(ele_pred_slot_tokens)
- nsample += batch_nsample
- nslot += batch_nslot
- avg_intent_loss /= nsample
- avg_slot_loss /= nslot
- intent_acc = correct_intent / float(nsample)
- slot_f1 = ner_f1_score(pred_slots, gt_slots)
- return avg_intent_loss, avg_slot_loss, intent_acc, slot_f1, pred_slots, gt_slots
-
-
-
-def train(args):
- ctx = mx.cpu() if args.gpu is None else mx.gpu(args.gpu)
- dataset_name = 'book_corpus_wiki_en_cased' if args.cased else 'book_corpus_wiki_en_uncased'
- bert_model, bert_vocab = nlp.model.get_model(name=args.bert_model,
- dataset_name=dataset_name,
- pretrained=True,
- ctx=ctx,
- use_pooler=True,
- use_decoder=False,
- use_classifier=False,
- dropout=args.dropout_prob,
- embed_dropout=args.dropout_prob)
- tokenizer = BERTTokenizer(bert_vocab, lower=not args.cased)
- if args.dataset == 'atis':
- train_data = ATISDataset('train')
- dev_data = ATISDataset('dev')
- test_data = ATISDataset('test')
- intent_vocab = train_data.intent_vocab
- slot_vocab = train_data.slot_vocab
- elif args.dataset == 'snips':
- train_data = SNIPSDataset('train')
- dev_data = SNIPSDataset('dev')
- test_data = SNIPSDataset('test')
- intent_vocab = train_data.intent_vocab
- slot_vocab = train_data.slot_vocab
- else:
- raise NotImplementedError
- print('Dataset {}'.format(args.dataset))
- print(' #Train/Dev/Test = {}/{}/{}'.format(len(train_data), len(dev_data), len(test_data)))
- print(' #Intent = {}'.format(len(intent_vocab)))
- print(' #Slot = {}'.format(len(slot_vocab)))
-    # Display an example
-    print('Display a sample:')
- print_sample(test_data, 1)
- print('-' * 80)
-
- idsl_transform = IDSLSubwordTransform(subword_vocab=bert_vocab,
- subword_tokenizer=tokenizer,
- slot_vocab=slot_vocab,
- cased=args.cased)
- train_data_bert = train_data.transform(idsl_transform, lazy=False)
- dev_data_bert = dev_data.transform(idsl_transform, lazy=False)
- test_data_bert = test_data.transform(idsl_transform, lazy=False)
- # Construct the DataLoader
- batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0), # Subword ID
- nlp.data.batchify.Pad(pad_val=0), # Subword Mask
- nlp.data.batchify.Pad(pad_val=0), # Beginning of subword
- nlp.data.batchify.Pad(pad_val=0), # Tag IDs
- nlp.data.batchify.Stack(), # Intent Label
- nlp.data.batchify.Stack()) # Valid Length
- train_batch_sampler = nlp.data.sampler.SortedBucketSampler(
- [len(ele) for ele in train_data_bert],
- batch_size=args.batch_size,
- mult=20,
- shuffle=True)
- train_loader = gluon.data.DataLoader(dataset=train_data_bert,
- num_workers=4,
- batch_sampler=train_batch_sampler,
- batchify_fn=batchify_fn)
- dev_loader = gluon.data.DataLoader(dataset=dev_data_bert,
- num_workers=4,
- batch_size=args.batch_size,
- batchify_fn=batchify_fn,
- shuffle=False)
- test_loader = gluon.data.DataLoader(dataset=test_data_bert,
- num_workers=4,
- batch_size=args.batch_size,
- batchify_fn=batchify_fn,
- shuffle=False)
-
- # Build the network and loss functions
- intent_pred_loss = gluon.loss.SoftmaxCELoss()
- slot_pred_loss = gluon.loss.SoftmaxCELoss(batch_axis=[0, 1])
-
- net = BERTForICSL(bert_model, num_intent_classes=len(intent_vocab),
- num_slot_classes=len(slot_vocab), dropout_prob=args.dropout_prob)
- net.slot_tagger.initialize(ctx=ctx, init=mx.init.Normal(0.02))
- net.intent_classifier.initialize(ctx=ctx, init=mx.init.Normal(0.02))
- net.hybridize()
- intent_pred_loss.hybridize()
- slot_pred_loss.hybridize()
-
- # Build the trainer
- trainer = gluon.Trainer(net.collect_params(), args.optimizer,
- {'learning_rate': args.learning_rate, 'wd': args.wd},
- update_on_kvstore=False)
-
- step_num = 0
- num_train_steps = int(len(train_batch_sampler) * args.epochs)
- num_warmup_steps = int(num_train_steps * args.warmup_ratio)
- best_dev_sf1 = -1
- for epoch_id in range(args.epochs):
- avg_train_intent_loss = 0.0
- avg_train_slot_loss = 0.0
- nsample = 0
- nslot = 0
- ntoken = 0
- train_epoch_start = time.time()
- for token_ids, mask, _, slot_ids, intent_label, valid_length\
- in tqdm(train_loader, file=sys.stdout):
- ntoken += valid_length.sum().asscalar()
- token_ids = mx.nd.array(token_ids, ctx=ctx).astype(np.int32)
- mask = mx.nd.array(mask, ctx=ctx).astype(np.float32)
- slot_ids = mx.nd.array(slot_ids, ctx=ctx).astype(np.int32)
- intent_label = mx.nd.array(intent_label, ctx=ctx).astype(np.int32)
- valid_length = mx.nd.array(valid_length, ctx=ctx).astype(np.float32)
- batch_nslots = mask.sum().asscalar()
- batch_nsample = token_ids.shape[0]
-
- # Set learning rate warm-up
- step_num += 1
- if step_num < num_warmup_steps:
- new_lr = args.learning_rate * step_num / num_warmup_steps
- else:
- offset = ((step_num - num_warmup_steps) * args.learning_rate /
- (num_train_steps - num_warmup_steps))
- new_lr = args.learning_rate - offset
- trainer.set_learning_rate(new_lr)
-
- with mx.autograd.record():
- intent_scores, slot_scores = net(token_ids, valid_length)
- intent_loss = intent_pred_loss(intent_scores, intent_label)
- slot_loss = slot_pred_loss(slot_scores, slot_ids, mask.expand_dims(axis=-1))
- intent_loss = intent_loss.mean()
- slot_loss = slot_loss.sum() / batch_nslots
- loss = intent_loss + args.slot_loss_mult * slot_loss
- loss.backward()
- trainer.update(1.0)
- avg_train_intent_loss += intent_loss.asscalar() * batch_nsample
- avg_train_slot_loss += slot_loss.asscalar() * batch_nslots
- nsample += batch_nsample
- nslot += batch_nslots
- train_epoch_end = time.time()
- avg_train_intent_loss /= nsample
- avg_train_slot_loss /= nslot
- print('[Epoch {}] train intent/slot = {:.3f}/{:.3f}, #token per second={:.0f}'.format(
- epoch_id, avg_train_intent_loss, avg_train_slot_loss,
- ntoken / (train_epoch_end - train_epoch_start)))
- avg_dev_intent_loss, avg_dev_slot_loss, dev_intent_acc,\
- dev_slot_f1, dev_pred_slots, dev_gt_slots\
- = evaluation(ctx, dev_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab)
- print('[Epoch {}] dev intent/slot = {:.3f}/{:.3f},'
- ' slot f1 = {:.2f}, intent acc = {:.2f}'.format(epoch_id, avg_dev_intent_loss,
- avg_dev_slot_loss,
- dev_slot_f1 * 100,
- dev_intent_acc * 100))
- if dev_slot_f1 > best_dev_sf1:
- best_dev_sf1 = dev_slot_f1
- avg_test_intent_loss, avg_test_slot_loss, test_intent_acc, \
- test_slot_f1, test_pred_slots, test_gt_slots \
- = evaluation(ctx, test_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab)
- print('[Epoch {}] test intent/slot = {:.3f}/{:.3f},'
- ' slot f1 = {:.2f}, intent acc = {:.2f}'.format(epoch_id, avg_test_intent_loss,
- avg_test_slot_loss,
- test_slot_f1 * 100,
- test_intent_acc * 100))
- if not os.path.exists(args.save_dir):
- os.makedirs(args.save_dir)
- net.save_parameters(os.path.join(args.save_dir, 'best_valid.params'))
- print('Evaluate the best model:')
- net.load_parameters(os.path.join(args.save_dir, 'best_valid.params'))
- avg_test_intent_loss, avg_test_slot_loss, test_intent_acc, \
- test_slot_f1, test_pred_slots, test_gt_slots \
- = evaluation(ctx, test_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab)
- print('Best validation model --> Slot F1={:.2f}, Intent acc={:.2f}'
- .format(test_slot_f1 * 100, test_intent_acc * 100))
- with open(os.path.join(args.save_dir, 'test_error.txt'), 'w') as of:
- of.write('{} {}\n'.format(test_slot_f1, test_intent_acc))
-
-if __name__ == '__main__':
- args = parse_args()
- np.random.seed(args.seed)
- mx.random.seed(args.seed)
- random.seed(args.seed)
- train(args)
diff --git a/scripts/intent_cls_slot_labeling/index.rst b/scripts/intent_cls_slot_labeling/index.rst
deleted file mode 100644
index 7b46da1bff..0000000000
--- a/scripts/intent_cls_slot_labeling/index.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-Intent Classification and Slot Labeling
----------------------------------------
-
-:download:`Download scripts `
-
-Reference:
-- Devlin, Jacob, et al. "`Bert: Pre-training of deep bidirectional transformers for language understanding. `_" arXiv preprint arXiv:1810.04805 (2018).
-- Chen, Qian, et al. "`BERT for Joint Intent Classification and Slot Filling. `_" arXiv preprint arXiv:1902.10909 (2019).
-
-Joint Intent Classification and Slot Labelling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Intent classification and slot labelling are two essential problems in Natural Language Understanding (NLU).
-In *intent classification*, the agent needs to detect the intention that the speaker's utterance conveys. For example, when the speaker says "Book a flight from Long Beach to Seattle", the intention is to book a flight ticket.
-In *slot labelling*, the agent needs to extract the semantic entities that are related to the intent. In our previous example,
-"Long Beach" and "Seattle" are two semantic constituents related to the flight, i.e., the origin and the destination.
-
-Essentially, *intent classification* can be viewed as a sequence classification problem and *slot labelling* can be viewed as a
-sequence tagging problem similar to Named-entity Recognition (NER). Due to their inner correlation, these two tasks are usually
-trained jointly with a multi-task objective function.
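-
-As a rough sketch of this multi-task objective (mirroring the ``--slot-loss-mult`` option of the
-training script; the ``joint_loss`` helper below is purely illustrative), the sentence-level intent
-loss and the token-level slot loss are combined into a weighted sum:
-
-.. code-block:: python
-
-    from mxnet import gluon
-
-    intent_loss_fn = gluon.loss.SoftmaxCELoss()                 # over intent classes, per sentence
-    slot_loss_fn = gluon.loss.SoftmaxCELoss(batch_axis=[0, 1])  # over slot tags, per token
-
-    def joint_loss(intent_scores, intent_label, slot_scores, slot_ids, mask, slot_loss_mult=1.0):
-        """Weighted sum of intent and slot cross-entropy; `mask` zeroes out padding subwords."""
-        l_intent = intent_loss_fn(intent_scores, intent_label).mean()
-        l_slot = slot_loss_fn(slot_scores, slot_ids, mask.expand_dims(axis=-1)).sum() / mask.sum()
-        return l_intent + slot_loss_mult * l_slot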
-
-Here's one example from the ATIS dataset; it uses the `IOB2 format `__.
-
-+-----------+--------------------------+--------------+
-| Sentence | Tags | Intent Label |
-+===========+==========================+==============+
-| are | O | atis_flight |
-+-----------+--------------------------+--------------+
-| there | O | |
-+-----------+--------------------------+--------------+
-| any | O | |
-+-----------+--------------------------+--------------+
-| flight | O | |
-+-----------+--------------------------+--------------+
-| from | O | |
-+-----------+--------------------------+--------------+
-| long | B-fromloc.city_name | |
-+-----------+--------------------------+--------------+
-| beach | I-fromloc.city_name | |
-+-----------+--------------------------+--------------+
-| to | O | |
-+-----------+--------------------------+--------------+
-| columbus | B-toloc.city_name | |
-+-----------+--------------------------+--------------+
-| on | O | |
-+-----------+--------------------------+--------------+
-| wednesday | B-depart_date.day_name | |
-+-----------+--------------------------+--------------+
-| april | B-depart_date.month_name | |
-+-----------+--------------------------+--------------+
-| sixteen | B-depart_date.day_number | |
-+-----------+--------------------------+--------------+
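-
-For reference, the slot F1 reported below is a span-level metric (computed with ``seqeval``):
-contiguous ``B-``/``I-`` tags are grouped into labelled spans, and a predicted span counts as
-correct only if both its label and its boundaries match the reference. A minimal sketch of this
-grouping (illustrative only, not the ``seqeval`` implementation):
-
-.. code-block:: python
-
-    def iob2_to_spans(tags):
-        """Group IOB2 tags into (label, start, end) spans, with `end` exclusive."""
-        spans, start, label = [], None, None
-        for i, tag in enumerate(tags + ['O']):  # the sentinel 'O' flushes the last open span
-            if tag.startswith('B-') or tag == 'O' or (tag.startswith('I-') and tag[2:] != label):
-                if label is not None:
-                    spans.append((label, start, i))
-                start, label = (i, tag[2:]) if tag != 'O' else (None, None)
-            # a well-formed 'I-<label>' simply extends the current span
-        return spans
-
-    tags = ['O', 'B-fromloc.city_name', 'I-fromloc.city_name', 'O', 'B-toloc.city_name']
-    print(iob2_to_spans(tags))
-    # [('fromloc.city_name', 1, 3), ('toloc.city_name', 4, 5)]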
-
-
-
-In this example, we demonstrate how to use GluonNLP to fine-tune a pretrained BERT model for joint intent classification and slot labelling.
-We use two datasets, `ATIS `__ and `SNIPS `__.
-
-The training script requires the seqeval and tqdm packages:
-
-.. code-block:: console
-
- $ pip3 install seqeval --user
- $ pip3 install tqdm --user
-
-For the ATIS dataset, use the following command to run the experiment:
-
-.. code-block:: console
-
- $ python finetune_icsl.py --gpu 0 --dataset atis
-
-It produces the final slot labelling F1 = `95.83%` and intent classification accuracy = `98.66%`
-
-For the SNIPS dataset, use the following command to run the experiment:
-
-.. code-block:: console
-
- $ python finetune_icsl.py --gpu 0 --dataset snips
-
-It produces the final slot labelling F1 = `96.06%` and intent classification accuracy = `98.71%`
-
-Also, we train the models with three random seeds and report the mean/std.
-
-For ATIS
-
-+--------------------------------------------------------------------------------------------+----------------+-------------+
-| Models | Intent Acc (%) | Slot F1 (%) |
-+============================================================================================+================+=============+
-| `Intent Gating & self-attention, EMNLP 2018 `__ | 98.77 | 96.52 |
-+--------------------------------------------------------------------------------------------+----------------+-------------+
-| `BLSTM-CRF + ELMo, AAAI 2019, `__ | 97.42 | 95.62 |
-+--------------------------------------------------------------------------------------------+----------------+-------------+
-| `Joint BERT, Arxiv 2019, `__ | 97.5 | 96.1 |
-+--------------------------------------------------------------------------------------------+----------------+-------------+
-| Ours | 98.66±0.00 | 95.88±0.04 |
-+--------------------------------------------------------------------------------------------+----------------+-------------+
-
-For SNIPS
-
-+--------------------------------------------------------------------+----------------+-------------+
-| Models | Intent Acc (%) | Slot F1 (%) |
-+====================================================================+================+=============+
-| `BLSTM-CRF + ELMo, AAAI 2019 `__ | 99.29 | 93.90 |
-+--------------------------------------------------------------------+----------------+-------------+
-| `Joint BERT, Arxiv 2019 `__ | 98.60 | 97.00 |
-+--------------------------------------------------------------------+----------------+-------------+
-| Ours | 98.81±0.13 | 95.94±0.10 |
-+--------------------------------------------------------------------+----------------+-------------+
diff --git a/scripts/language_model/__init__.py b/scripts/language_model/__init__.py
deleted file mode 100644
index a747f8c58b..0000000000
--- a/scripts/language_model/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Language Model example."""
diff --git a/scripts/language_model/cache_language_model.py b/scripts/language_model/cache_language_model.py
deleted file mode 100644
index a8acaa6771..0000000000
--- a/scripts/language_model/cache_language_model.py
+++ /dev/null
@@ -1,211 +0,0 @@
-"""
-Neural Cache Language Model
-===========================
-This example shows how to build a neural cache language model based on
-a pre-trained word-level language model on WikiText-2 with Gluon NLP Toolkit.
-
-We implement the neural cache language model proposed in the following work.
-@article{grave2016improving,
- title={Improving neural language models with a continuous cache},
- author={Grave, Edouard and Joulin, Armand and Usunier, Nicolas},
- journal={ICLR},
- year={2017}
-}
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import time
-import math
-import os
-import sys
-import mxnet as mx
-import gluonnlp as nlp
-
-curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.append(os.path.join(curr_path, '..', '..'))
-
-nlp.utils.check_version('0.7.0')
-
-parser = argparse.ArgumentParser(description=
- 'MXNet Neural Cache Language Model on Wikitext-2.')
-parser.add_argument('--bptt', type=int, default=2000,
- help='sequence length')
-parser.add_argument('--model_name', type=str, default='awd_lstm_lm_1150',
- help='name of the pre-trained language model')
-parser.add_argument('--gpus', type=str,
-                    help='list of gpus to run, e.g. 0 or 0,2,5. Empty means using cpu '
-                         '(using a single gpu is suggested).')
-parser.add_argument('--window', type=int, default=2000,
- help='cache window length')
-parser.add_argument('--theta', type=float, default=0.662,
-                    help='the scalar that controls the flatness of the cache distribution '
-                         'used to predict the next word')
-parser.add_argument('--lambdas', type=float, default=0.1279,
-                    help='linear interpolation weight between the cache '
-                         'distribution and the vocabulary distribution')
-parser.add_argument('--path_to_params_file', type=str, default=None,
- help='path to the saved params file of user pre-trained model, '
- 'including the params file, e.g., ~/.mxnet/models/awd_lstm_lm_1150.params')
-args = parser.parse_args()
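-
-# For reference (following Grave et al., 2017, cited above), the cache model interpolates the
-# base LM distribution with a distribution over the last `window` hidden states. Schematically,
-# with h_t the current hidden state and (h_i, x_{i+1}) the cached state/next-word pairs:
-#
-#     p(w | history) = (1 - lambdas) * p_vocab(w | history)
-#                      + lambdas * sum_i 1[x_{i+1} = w] * softmax_i(theta * h_t . h_i)
-#
-# so `--theta` controls how peaked the cache distribution is and `--lambdas` is the linear
-# interpolation weight; both are simply forwarded to `nlp.model.train.get_cache_model` /
-# `CacheCell` below.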
-
-###############################################################################
-# Load vocabulary
-###############################################################################
-
-context = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
- [mx.gpu(int(x)) for x in args.gpus.split(',')]
-
-print(args)
-
-_, vocab = nlp.model.get_model(name=args.model_name,
- dataset_name='wikitext-2',
- pretrained=False,
- ctx=context)
-ntokens = len(vocab)
-
-###############################################################################
-# Build the cache model and load pre-trained language model
-###############################################################################
-
-
-if not args.path_to_params_file:
- cache_cell = nlp.model.train.get_cache_model(name=args.model_name,
- dataset_name='wikitext-2',
- window=args.window,
- theta=args.theta,
- lambdas=args.lambdas,
- ctx=context)
-else:
- model, _ = nlp.model.get_model(name=args.model_name,
- dataset_name='wikitext-2',
- pretrained=False,
- ctx=context)
- cache_cell = nlp.model.train.CacheCell(model, ntokens, args.window, args.theta, args.lambdas)
- cache_cell.load_parameters(args.path_to_params_file, ctx=context)
-
-###############################################################################
-# Load data
-###############################################################################
-
-val_dataset, test_dataset = \
- [nlp.data.WikiText2(segment=segment,
-                        skip_empty=False, bos=None, eos='<eos>')
- for segment in ['val', 'test']]
-
-val_batch_size = 1
-val_batchify = nlp.data.batchify.CorpusBatchify(vocab, val_batch_size)
-val_data = val_batchify(val_dataset)
-test_batch_size = 1
-test_batchify = nlp.data.batchify.CorpusBatchify(vocab, test_batch_size)
-test_data = test_batchify(test_dataset)
-
-###############################################################################
-# Training
-###############################################################################
-
-
-def detach(hidden):
- """Transfer hidden states into new states, to detach them from the history.
- Parameters
- ----------
- hidden : NDArray
- The hidden states
- Returns
-    -------
-    hidden : NDArray
- The detached hidden states
- """
- if isinstance(hidden, (tuple, list)):
- hidden = [detach(h) for h in hidden]
- else:
- hidden = hidden.detach()
- return hidden
-
-
-def get_batch(data_source, i, seq_len=None):
- """Get mini-batches of the dataset.
-
- Parameters
- ----------
- data_source : NDArray
-        The dataset to draw mini-batches from.
- i : int
- The index of the batch, starting from 0.
- seq_len : int
- The length of each sample in the batch.
-
- Returns
- -------
- data: NDArray
- The context
- target: NDArray
- The words to predict
- """
- seq_len = min(seq_len if seq_len else args.bptt, len(data_source) - 1 - i)
- data = data_source[i:i+seq_len]
- target = data_source[i+1:i+1+seq_len]
- return data, target
-
-
-def evaluate(data_source, batch_size, ctx=None):
- """Evaluate the model on the dataset with cache model.
-
- Parameters
- ----------
- data_source : NDArray
-        The dataset that the model is evaluated on.
- batch_size : int
- The size of the mini-batch.
- ctx : mx.cpu() or mx.gpu()
- The context of the computation.
-
- Returns
- -------
- loss: float
- The loss on the dataset
- """
- total_L = 0
- hidden = cache_cell.\
- begin_state(func=mx.nd.zeros, batch_size=batch_size, ctx=context[0])
- next_word_history = None
- cache_history = None
- for i in range(0, len(data_source) - 1, args.bptt):
- if i > 0:
- print('Batch %d/%d, ppl %f'%
- (i, len(data_source), math.exp(total_L/i)))
- data, target = get_batch(data_source, i)
- data = data.as_in_context(ctx)
- target = target.as_in_context(ctx)
- L = 0
- outs, next_word_history, cache_history, hidden = \
- cache_cell(data, target, next_word_history, cache_history, hidden)
- for out in outs:
- L += (-mx.nd.log(out)).asscalar()
- total_L += L / data.shape[1]
- hidden = detach(hidden)
- return total_L / len(data_source)
-
-
-if __name__ == '__main__':
- start_pipeline_time = time.time()
- final_val_L = evaluate(val_data, val_batch_size, context[0])
- final_test_L = evaluate(test_data, test_batch_size, context[0])
- print('Best validation loss %.2f, val ppl %.2f' % (final_val_L, math.exp(final_val_L)))
- print('Best test loss %.2f, test ppl %.2f' % (final_test_L, math.exp(final_test_L)))
- print('Total time cost %.2fs' % (time.time()-start_pipeline_time))
diff --git a/scripts/language_model/conversion_utils/compare_transformerxl_pytorch_gluon_model.py b/scripts/language_model/conversion_utils/compare_transformerxl_pytorch_gluon_model.py
deleted file mode 100644
index 065f1bcb4a..0000000000
--- a/scripts/language_model/conversion_utils/compare_transformerxl_pytorch_gluon_model.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Script for model comparison between TF and Gluon."""
-
-import argparse
-import glob
-import logging
-import os
-import pickle
-import re
-import sys
-
-import mxnet as mx
-import numpy as np
-import tensorflow as tf
-import torch
-from absl import flags
-
-import gluonnlp as nlp
-import transformers
-from utils import read_tf_checkpoint, to_gluon_kwargs
-
-
-def get_kwargs_and_corpus(args):
- # Infer model config
- with open(os.path.join(args.tf_data_dir, 'cache.pkl'), 'rb') as f:
- corpus = pickle.load(f, encoding='latin1')
- tf_checkpoint_file = os.path.expanduser(
- os.path.join(args.tf_checkpoint_dir, args.tf_model_prefix))
- tf_tensors = read_tf_checkpoint(tf_checkpoint_file)
- kwargs, _ = to_gluon_kwargs(tf_tensors)
- return kwargs, corpus
-
-
-def get_data(args):
- record_info_dir = os.path.join(args.tf_data_dir, 'tfrecords')
- assert os.path.exists(record_info_dir)
- record_info_file = glob.glob(os.path.join(record_info_dir, "record_info*json"))[0]
- eval_split, batch_size, tgt_len = re.search(r'record_info-(\w+)\.bsz-(\d+)\.tlen-(\d+).json',
- record_info_file).groups()
- batch_size, tgt_len = int(batch_size), int(tgt_len)
-
- num_core_per_host = 1
- num_hosts = 1
- eval_input_fn, eval_record_info = data_utils.get_input_fn(
- record_info_dir=record_info_dir, split=eval_split, per_host_bsz=batch_size, tgt_len=tgt_len,
- num_core_per_host=num_core_per_host, num_hosts=num_hosts, use_tpu=False)
-
- ##### Create computational graph
- eval_set = eval_input_fn({"batch_size": batch_size, "data_dir": record_info_dir})
- input_feed, label_feed = eval_set.make_one_shot_iterator().get_next()
-
- # Extract first two batches
- sess = tf.Session()
- np_features, np_labels = [], []
- for i in range(2):
- feature_i, label_i = sess.run((input_feed, label_feed))
- np_features.append(feature_i[:1]) # force batch_size of 1
- np_labels.append(label_i[:1])
-
- return np_features, np_labels, 1, tgt_len
-
-
-def compare_transformerxl(args, kwargs, corpus):
- # Data
- np_features, np_labels, batch_size, tgt_len = get_data(args)
-
- # Models
- model_p = transformers.TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
- model_p.crit.keep_order = True
- model_p.transformer.output_attentions = False # no change of default; breaks model if changed
- model_p.transformer.output_hidden_states = True
-
- with open(args.gluon_vocab_file, 'r') as f:
- vocab = nlp.Vocab.from_json(f.read())
- ctx = mx.gpu()
- model = TransformerXL(vocab_size=len(vocab), clamp_len=model_p.transformer.clamp_len, **kwargs)
- model.initialize(ctx=ctx)
- model.load_parameters(args.gluon_parameter_file, ignore_extra=False)
- model.hybridize()
-
- # Computation
- assert len(np_features) == 2
- mems = model.begin_mems(batch_size, model_p.config.mem_len, context=ctx)
- mems_p = None
- for batch in range(2):
- print('Batch {}'.format(batch))
-
- features_nd = mx.nd.array(np_features[batch], ctx=ctx)
- labels_nd = mx.nd.array(np_labels[batch], ctx=ctx)
- features_p = torch.tensor(np_features[batch], dtype=torch.long)
- labels_p = torch.tensor(np_labels[batch], dtype=torch.long)
-
- loss, mems, last_hidden = model(features_nd, labels_nd, mems)
-
- loss_p, _, mems_p, all_hidden_p = model_p(features_p, mems=mems_p, labels=labels_p)
-
- for i in range(kwargs['num_layers']):
- a_b = mems_p[i][:, 0].numpy() - mems[i][0].asnumpy()
- max_error = a_b.max()
- argmax_error = a_b.argmax()
- stdev = np.std(a_b)
- print('Layer {i}: Maximum error {err:.2e} at position {pos}. stdev={stdev:.2e}'.format(
- i=i, err=max_error, pos=np.unravel_index(argmax_error, shape=a_b.shape),
- stdev=stdev))
- a_b = loss_p.detach().numpy()[0] - loss.asnumpy()[0]
- max_error = a_b.max()
- argmax_error = a_b.argmax()
- stdev = np.std(a_b)
- print('Loss: Maximum error {err:.2e} at position {pos}. stdev={stdev:.2e}'.format(
- i=i, err=max_error, pos=np.unravel_index(argmax_error, shape=a_b.shape), stdev=stdev))
- assert max_error < 5e-5
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(
-        description='Comparison script for Tensorflow and Gluon Transformer-XL model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--transformer-xl-repo', type=str, required=True,
- help='Path to https://github.com/kimiyoung/transformer-xl repo.')
- parser.add_argument('--tf-checkpoint-dir', type=str, required=True,
- help='Path to Tensorflow checkpoint folder.')
- parser.add_argument(
- '--tf-model-prefix', type=str, required=True, help='Prefix of the checkpoint files. '
- 'For example model.ckpt-0 or model.ckpt-1191000')
- parser.add_argument(
- '--tf-data-dir', type=str, required=True, help='Path to TransformerXL data folder. '
- 'The folder should contain the tfrecords directory as well as the cache.pkl file. '
- 'tfrecords can be created with the TransformerXL data_utils.py script.')
- parser.add_argument('--gluon-parameter-file', type=str, required=True,
- help='gluon parameter file name.')
- parser.add_argument('--gluon-vocab-file', type=str, required=True,
- help='gluon vocab file corresponding to --gluon_parameter_file.')
- parser.add_argument('--debug', action='store_true', help='debugging mode')
- args = parser.parse_args()
- logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
- logging.info(args)
-
- # Load stuff required for unpickling
- sys.path.append(os.path.join((args.transformer_xl_repo), 'tf'))
- import vocabulary # pylint: disable=unused-import
- import data_utils
-
- # Infer correct tf flags
- kwargs, corpus = get_kwargs_and_corpus(args)
- tf_argv = [
- 'train.py',
- '--n_layer=' + str(kwargs['num_layers']),
- '--d_model=' + str(kwargs['units']),
- '--d_embed=' + str(kwargs['embed_size']),
- '--n_head=' + str(kwargs['num_heads']),
- '--d_head=' + str(kwargs['units'] // kwargs['num_heads']),
- '--d_inner=' + str(kwargs['hidden_size']),
- '--dropout=0.0',
- '--dropatt=0.0',
- '--same_length=True',
- '--model_dir=' + args.tf_checkpoint_dir,
- '--proj_share_all_but_first=True',
- '--untie_r=True',
- '--div_val=' + str(kwargs['embed_div_val']),
- ]
- tf_flags = flags.FLAGS(tf_argv, known_only=True)
-
- sys.path.append(os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
- from transformer import TransformerXL
-
- compare_transformerxl(args, kwargs, corpus)
diff --git a/scripts/language_model/conversion_utils/compare_xlnet_pytorch_gluon_model.py b/scripts/language_model/conversion_utils/compare_xlnet_pytorch_gluon_model.py
deleted file mode 100644
index f21374e896..0000000000
--- a/scripts/language_model/conversion_utils/compare_xlnet_pytorch_gluon_model.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# coding: utf-8
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Script for model comparison between TF and Gluon."""
-
-import argparse
-import logging
-import os
-import sys
-
-import mxnet as mx
-import numpy as np
-import torch
-
-import gluonnlp as nlp
-import transformers
-
-
-def compare_xlnet(args):
- batch_size, qlen, mlen = 2, 16, 100
-
- model_p = transformers.XLNetLMHeadModel.from_pretrained(
- 'xlnet-base-cased'
- if args.model_name == 'xlnet_cased_L-12_H-768_A-12' else 'xlnet-large-cased', dropout=0)
- model_p.transformer.attentions = False # no change of default
- model_p.transformer.output_hidden_states = True
- model_p.transformer.mem_len = mlen
-
- if args.model_name == 'xlnet_cased_L-12_H-768_A-12':
- kwargs = {
- 'hidden_size': 3072,
- 'units': 768,
- 'activation': 'approx_gelu',
- 'num_heads': 12,
- 'num_layers': 12,
- 'vocab_size': 32000
- }
- elif args.model_name == 'xlnet_cased_L-24_H-1024_A-16':
- kwargs = {
- 'hidden_size': 4096,
- 'units': 1024,
- 'activation': 'approx_gelu',
- 'num_heads': 16,
- 'num_layers': 24,
- 'vocab_size': 32000
- }
-
- with open(args.gluon_vocab_file, 'r') as f:
- vocab = nlp.Vocab.from_json(f.read())
- ctx = mx.cpu()
- assert kwargs['vocab_size'] == len(vocab)
- clamp_len = model_p.transformer.clamp_len if model_p.transformer.clamp_len > 0 else None
- model = XLNet(clamp_len=clamp_len, **kwargs)
- model.initialize(ctx=ctx)
- model.load_parameters(args.gluon_parameter_file, ignore_extra=False)
- model.hybridize()
-
- # Computation
- mems = model.begin_mems(batch_size, mlen, context=mx.cpu())
- x = mx.nd.ones(shape=(batch_size, qlen))
- token_types = mx.nd.ones(shape=(batch_size, qlen))
- output, new_mems = model(x, token_types, mems)
-
- x_p = torch.tensor(x.asnumpy(), dtype=torch.long)
- mems_p = [torch.tensor(mems_i.transpose((1, 0, 2)).asnumpy()) for mems_i in mems]
- token_types_p = torch.tensor(token_types.asnumpy(), dtype=torch.long)
- output_p, new_mems_p, hids_p = model_p(x_p, token_type_ids=token_types_p, mems=mems_p)
-
- for i in range(kwargs['num_layers']):
- a, b = new_mems[i][:, -qlen:].asnumpy(), hids_p[i].detach().numpy()
- assert np.all(np.isclose(a, b, atol=1e-5))
- assert np.all(np.isclose(output.asnumpy(), output_p.detach().numpy(), atol=5e-5))
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(
-        description='Comparison script for PyTorch and Gluon XLNet model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--model-name', type=str, required=True,
- choices=['xlnet_cased_L-12_H-768_A-12',
- 'xlnet_cased_L-24_H-1024_A-16'], help='Model name')
- parser.add_argument('--gluon-parameter-file', type=str, required=True,
- help='gluon parameter file name.')
- parser.add_argument('--gluon-vocab-file', type=str, required=True,
- help='gluon vocab file corresponding to --gluon_parameter_file.')
- parser.add_argument('--debug', action='store_true', help='debugging mode')
- args = parser.parse_args()
- logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
- logging.info(args)
- sys.path.append(os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
- from transformer import XLNet
-
- compare_xlnet(args)
diff --git a/scripts/language_model/conversion_utils/convert_transformer_xl.py b/scripts/language_model/conversion_utils/convert_transformer_xl.py
deleted file mode 100644
index e5bea1545a..0000000000
--- a/scripts/language_model/conversion_utils/convert_transformer_xl.py
+++ /dev/null
@@ -1,272 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import logging
-import os
-import pickle
-import re
-import sys
-
-import mxnet as mx
-import numpy as np
-
-import gluonnlp as nlp
-from utils import _split_dict, get_hash, to_gluon_kwargs, read_tf_checkpoint
-
-
-def to_gluon_vocab(corpus):
- """Convert a TransformerXL corpus object to a GluonNLP Vocab."""
- # Clean up latin-1 mis-encoding of words
- idx2sym = [w.encode('latin-1').decode('utf-8') for w in corpus.vocab.idx2sym]
- sym2idx = {sym: idx for idx, sym in enumerate(idx2sym)}
-
- special_tokens = dict(unknown_token=None, padding_token=None, bos_token=None)
- if hasattr(corpus.vocab, 'unk_idx'):
- special_tokens['unknown_token'] = idx2sym[corpus.vocab.unk_idx]
-    elif '<unk>' in sym2idx:
-        special_tokens['unknown_token'] = '<unk>'
-    elif '<UNK>' in sym2idx:
-        special_tokens['unknown_token'] = '<UNK>'
-
-    # Discover special tokens
-    if ['<eos>'] == corpus.vocab.special:
-        if '<eos>' in sym2idx:  # Only include if special token is actually used
-            special_tokens['eos_token'] = '<eos>'
-    elif '<S>' in sym2idx:
-        # Special case for model trained on Google 1 Billion Word LM dataset
-        special_tokens['eos_token'] = '<S>'
- elif corpus.vocab.special:
- raise NotImplementedError('Provided TransformerXL cache.pkl uses an unknown special token. '
- 'You must extend the `to_gluon_vocab` method to support it.')
- else:
- special_tokens['eos_token'] = None
-
- counter = nlp.data.count_tokens(sym2idx.keys())
- vocab = nlp.vocab.Vocab(counter, token_to_idx=sym2idx, **special_tokens)
- return vocab
-
-
-def set_params(model, tf_tensors, kwargs, tie_r):
- # Drop optimizer params
- _, tf_tensors = _split_dict(lambda k, v: k.endswith('Adam'), tf_tensors)
- _, tf_tensors = _split_dict(lambda k, v: k.endswith('Adam_1'), tf_tensors)
- del tf_tensors['global_step']
- del tf_tensors['beta1_power']
- del tf_tensors['beta2_power']
-
- loaded = set() # Cache of processed parameters
-
- if 'embed_cutoffs' in kwargs: # Adaptive Embedding and Softmax
- # Embedding
- for name, param in model._net.embedding._collect_params_with_prefix().items():
- purpose, i, postfix = re.match(r'([a-zA-Z]*)(\d*)(.*)', name).groups()
- if purpose == 'embedding':
- assert postfix == '_weight'
- tf_param = tf_tensors.pop(
- 'transformer/adaptive_embed/cutoff_{}/lookup_table'.format(i))
- elif purpose == 'projection':
- assert postfix == '_weight'
- tf_param = tf_tensors.pop('transformer/adaptive_embed/cutoff_{}/proj_W'.format(i)).T
- else:
- raise RuntimeError('Embedding had unexpected parameter: {}'.format(name))
-
- param.set_data(mx.nd.array(tf_param))
- loaded.add(param)
-
- # Softmax
- for name, param in model._net.crit._collect_params_with_prefix().items():
- if param in loaded:
- continue # Some parameters are shared between Embedding and Softmax
-
- purpose, i, postfix = re.match(r'([a-zA-Z]*)(\d*)(.*)', name).groups()
- if purpose == 'outembedding':
- if postfix == '_weight':
- tf_param = tf_tensors.pop(
- 'transformer/adaptive_softmax/cutoff_{}/lookup_table'.format(i))
- elif postfix == '_bias':
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/cutoff_{}/b'.format(i))
- else:
- raise RuntimeError('Softmax had unexpected parameter: {}'.format(name))
- elif purpose == 'outprojection':
- assert postfix == '_weight'
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/cutoff_{}/proj'.format(i)).T
- elif purpose == 'cluster':
- if postfix == '.weight':
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/cutoff_0/cluster_W')
- elif postfix == '.bias':
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/cutoff_0/cluster_b')
- else:
- raise RuntimeError('Softmax had unexpected parameter: {}'.format(name))
- else:
- raise RuntimeError('Softmax had unexpected parameter: {}'.format(name))
-
- param.set_data(mx.nd.array(tf_param))
- loaded.add(param)
- else: # Non-adaptive, (possibly) projected embedding and softmax
- # Embedding
- tf_param = tf_tensors.pop('transformer/adaptive_embed/lookup_table')
- model._net.embedding.embedding_weight.set_data(mx.nd.array(tf_param))
- loaded.add(model._net.embedding.embedding_weight)
- if kwargs['embed_size'] != kwargs['units']:
- tf_param = tf_tensors.pop('transformer/adaptive_embed/proj_W')
- model._net.embedding.projection_weight.set_data(mx.nd.array(tf_param))
- loaded.add(model._net.embedding.projection_weight)
- assert len(model._net.embedding.collect_params().keys()) == 2
- else:
- assert len(model._net.embedding.collect_params().keys()) == 1
-
- # Softmax
- for name, param in model._net.crit._collect_params_with_prefix().items():
- if param in loaded:
- continue # Some parameters are shared between Embedding and Softmax
-
- purpose, i, postfix = re.match(r'([a-zA-Z]*)(\d*)(.*)', name).groups()
- if purpose == 'outembedding':
- if postfix == '_weight':
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/lookup_table')
- elif postfix == '_bias':
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/bias')
- else:
- raise RuntimeError('Softmax had unexpected parameter: {}'.format(name))
- elif purpose == 'outprojection':
- assert postfix == '_weight'
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/proj').T
- else:
- raise RuntimeError('Softmax had unexpected parameter: {}'.format(name))
-
- param.set_data(mx.nd.array(tf_param))
- loaded.add(param)
-
- tf_r_r_bias = tf_tensors.pop('transformer/r_r_bias')
- tf_r_w_bias = tf_tensors.pop('transformer/r_w_bias')
- for layer_i in range(kwargs['num_layers']):
- # Attention Cell
- attention_cell = model._net.transformer_cells[layer_i].attention_cell
- # TODO(leezu): Duplicate tied parameters until parameter sharing
- # support is improved in Gluon 2. (It is currently impossible to share
- # only subsets of parameters between Blocks due to name clashes between
- # the non-shared parameters (due to same prefix))
- attention_cell.query_key_bias.set_data(
- mx.nd.array(tf_r_w_bias if tie_r else tf_r_w_bias[layer_i]))
- attention_cell.query_emb_bias.set_data(
- mx.nd.array(tf_r_r_bias if tie_r else tf_r_r_bias[layer_i]))
- tf_param = np.split(
- tf_tensors.pop('transformer/layer_{}/rel_attn/qkv/kernel'.format(layer_i)).T, 3, axis=0)
- attention_cell.proj_query.weight.set_data(mx.nd.array(tf_param[0]))
- attention_cell.proj_key.weight.set_data(mx.nd.array(tf_param[1]))
- attention_cell.proj_value.weight.set_data(mx.nd.array(tf_param[2]))
- tf_param = tf_tensors.pop('transformer/layer_{}/rel_attn/r/kernel'.format(layer_i))
- attention_cell.proj_emb.weight.set_data(mx.nd.array(tf_param.T))
-
- # Projection
- tf_param = tf_tensors.pop('transformer/layer_{}/rel_attn/o/kernel'.format(layer_i))
- model._net.transformer_cells[layer_i].proj.weight.set_data(mx.nd.array(tf_param.T))
-
- # Layer Norm
- tf_param = tf_tensors.pop('transformer/layer_{}/rel_attn/LayerNorm/beta'.format(layer_i))
- model._net.transformer_cells[layer_i].layer_norm.beta.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('transformer/layer_{}/rel_attn/LayerNorm/gamma'.format(layer_i))
- model._net.transformer_cells[layer_i].layer_norm.gamma.set_data(mx.nd.array(tf_param))
-
- # FFN
- ffn = model._net.transformer_cells[layer_i].ffn
- tf_param = tf_tensors.pop('transformer/layer_{}/ff/LayerNorm/beta'.format(layer_i))
- ffn.layer_norm.beta.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('transformer/layer_{}/ff/LayerNorm/gamma'.format(layer_i))
- ffn.layer_norm.gamma.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('transformer/layer_{}/ff/layer_1/kernel'.format(layer_i))
- ffn.ffn_1.weight.set_data(mx.nd.array(tf_param.T))
- tf_param = tf_tensors.pop('transformer/layer_{}/ff/layer_1/bias'.format(layer_i))
- ffn.ffn_1.bias.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('transformer/layer_{}/ff/layer_2/kernel'.format(layer_i))
- ffn.ffn_2.weight.set_data(mx.nd.array(tf_param.T))
- tf_param = tf_tensors.pop('transformer/layer_{}/ff/layer_2/bias'.format(layer_i))
- ffn.ffn_2.bias.set_data(mx.nd.array(tf_param))
-
-
-def convert_transformerxl(args):
- # Load tf model and vocab
- with open(args.cache_pkl, 'rb') as f:
- corpus = pickle.load(f, encoding='latin1')
- vocab = to_gluon_vocab(corpus)
- tf_checkpoint_file = os.path.expanduser(
- os.path.join(args.tf_checkpoint_dir, args.tf_model_prefix))
- tf_tensors = read_tf_checkpoint(tf_checkpoint_file)
-
- # Initialize Gluon model
- kwargs, tie_r = to_gluon_kwargs(tf_tensors)
- model = TransformerXL(vocab_size=len(vocab), **kwargs)
- model.initialize(init=mx.init.Normal(0.02))
-
- # Shape inference based on forward pass
- batch_size, seq_len = 2, 16
- mem_length = 100
- mems = model.begin_mems(batch_size, mem_length, context=mx.cpu())
- x = mx.nd.ones(shape=(batch_size, seq_len))
- model(x, x, mems)
-
- # Convert parameters
- set_params(model, tf_tensors, kwargs, tie_r)
-
- # Serialization
- tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp'))
- with open(tmp_file_path, 'w') as f:
- f.write(vocab.to_json())
- hash_full, hash_short = get_hash(tmp_file_path)
- gluon_vocab_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.vocab'))
- with open(gluon_vocab_path, 'w') as f:
- f.write(vocab.to_json())
- logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full)
- model.save_parameters(tmp_file_path)
- hash_full, hash_short = get_hash(tmp_file_path)
- os.remove(tmp_file_path)
- gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params'))
- logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full)
- model.save_parameters(gluon_param_path)
- mx.nd.waitall()
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(
- description='Conversion script for Tensorflow Transformer-XL model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--transformer-xl-repo', type=str, required=True,
- help='Path to https://github.com/kimiyoung/transformer-xl repo.')
- parser.add_argument('--tf-checkpoint-dir', type=str, required=True,
- help='Path to Tensorflow checkpoint folder.')
- parser.add_argument(
- '--tf-model-prefix', type=str, required=True, help='Prefix of the checkpoint files. '
- 'For example model.ckpt-0 or model.ckpt-1191000')
- parser.add_argument('--cache-pkl', type=str, required=True,
- help='Path to TransformerXL cache.pkl file.')
- parser.add_argument('--out-dir', type=str, required=True,
- help='Path to output folder. The folder must exist.')
- parser.add_argument('--debug', action='store_true', help='debugging mode')
- args = parser.parse_args()
- logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
- logging.info(args)
-
- # Load stuff required for unpickling
- sys.path.append(os.path.join((args.transformer_xl_repo), 'tf'))
- import vocabulary # pylint: disable=unused-import
- import data_utils # pylint: disable=unused-import
-
- sys.path.append(os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
- from transformer import TransformerXL
-
- convert_transformerxl(args)
diff --git a/scripts/language_model/conversion_utils/convert_xlnet.py b/scripts/language_model/conversion_utils/convert_xlnet.py
deleted file mode 100644
index 1b9a7da37a..0000000000
--- a/scripts/language_model/conversion_utils/convert_xlnet.py
+++ /dev/null
@@ -1,194 +0,0 @@
-# coding: utf-8
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import json
-import logging
-import os
-import sys
-
-import mxnet as mx
-import numpy as np
-
-import gluonnlp as nlp
-from utils import _split_dict, get_hash, read_tf_checkpoint
-
-
-def set_params(model, tf_tensors, kwargs, tie_r):
- # Drop optimizer params
- _, tf_tensors = _split_dict(lambda k, v: k.endswith('Adam'), tf_tensors)
- _, tf_tensors = _split_dict(lambda k, v: k.endswith('Adam_1'), tf_tensors)
- del tf_tensors['global_step']
-
- # Embedding
- tf_param = tf_tensors.pop('model/transformer/word_embedding/lookup_table')
- model._net.word_embed.weight.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('model/transformer/mask_emb/mask_emb')
- model._net.mask_embed.set_data(mx.nd.array(tf_param))
-
- tf_rel_segment_emb = tf_tensors.pop('model/transformer/seg_embed')
-
- tf_r_r_bias = tf_tensors.pop('model/transformer/r_r_bias')
- tf_r_w_bias = tf_tensors.pop('model/transformer/r_w_bias')
- tf_r_s_bias = tf_tensors.pop('model/transformer/r_s_bias')
- for layer_i in range(kwargs['num_layers']):
- # Attention Cell
- attention_cell = model._net.transformer_cells[layer_i].attention_cell
- # TODO(leezu): Duplicate tied parameters until parameter sharing
- # support is improved in Gluon 2. (It is currently impossible to share
- # only subsets of parameters between Blocks due to name clashes between
- # the non-shared parameters (due to same prefix))
- attention_cell.query_key_bias.set_data(
- mx.nd.array(tf_r_w_bias if tie_r else tf_r_w_bias[layer_i]))
- attention_cell.query_emb_bias.set_data(
- mx.nd.array(tf_r_r_bias if tie_r else tf_r_r_bias[layer_i]))
- attention_cell.query_seg_bias.set_data(
- mx.nd.array(tf_r_s_bias if tie_r else tf_r_s_bias[layer_i]))
- shape = (kwargs['units'], kwargs['units'])
- tf_param = tf_tensors.pop('model/transformer/layer_{}/rel_attn/q/kernel'.format(layer_i))
- attention_cell.proj_query.weight.set_data(mx.nd.array(tf_param.reshape(shape).T))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/rel_attn/k/kernel'.format(layer_i))
- attention_cell.proj_key.weight.set_data(mx.nd.array(tf_param.reshape(shape).T))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/rel_attn/v/kernel'.format(layer_i))
- attention_cell.proj_value.weight.set_data(mx.nd.array(tf_param.reshape(shape).T))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/rel_attn/r/kernel'.format(layer_i))
- attention_cell.proj_emb.weight.set_data(mx.nd.array(tf_param.reshape(shape).T))
- attention_cell.seg_emb.set_data(mx.nd.array(tf_rel_segment_emb[layer_i]))
-
- # Projection
- tf_param = tf_tensors.pop('model/transformer/layer_{}/rel_attn/o/kernel'.format(layer_i))
- model._net.transformer_cells[layer_i].proj.weight.set_data(
- mx.nd.array(tf_param.reshape(shape))) # o kernel should not be transposed
-
- # Layer Norm
- tf_param = tf_tensors.pop(
- 'model/transformer/layer_{}/rel_attn/LayerNorm/beta'.format(layer_i))
- model._net.transformer_cells[layer_i].layer_norm.beta.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop(
- 'model/transformer/layer_{}/rel_attn/LayerNorm/gamma'.format(layer_i))
- model._net.transformer_cells[layer_i].layer_norm.gamma.set_data(mx.nd.array(tf_param))
-
- # FFN
- ffn = model._net.transformer_cells[layer_i].ffn
- tf_param = tf_tensors.pop('model/transformer/layer_{}/ff/LayerNorm/beta'.format(layer_i))
- ffn.layer_norm.beta.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/ff/LayerNorm/gamma'.format(layer_i))
- ffn.layer_norm.gamma.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/ff/layer_1/kernel'.format(layer_i))
- ffn.ffn_1.weight.set_data(mx.nd.array(tf_param.T))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/ff/layer_1/bias'.format(layer_i))
- ffn.ffn_1.bias.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/ff/layer_2/kernel'.format(layer_i))
- ffn.ffn_2.weight.set_data(mx.nd.array(tf_param.T))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/ff/layer_2/bias'.format(layer_i))
- ffn.ffn_2.bias.set_data(mx.nd.array(tf_param))
-
- if 'model/lm_loss/weight' in tf_tensors:
- tf_param = tf_tensors.pop('model/lm_loss/weight')
- model._net.decoder.weight.set_data(tf_param)
- tf_param = tf_tensors.pop('model/lm_loss/bias')
- model._net.decoder.bias.set_data(tf_param)
-
- assert len(tf_tensors.keys()) == 0
-
-
-def convert_xlnet(args):
- # Load vocab
- vocab_file = os.path.join(args.model_dir, 'spiece.model')
-    vocab = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file, cls_token='<cls>',
-                                                   sep_token='<sep>', mask_token='<mask>')
-
- # Load config
- tf_config_names_to_gluon_config_names = {
- 'd_inner': 'hidden_size',
- 'd_model': 'units',
- 'ff_activation': 'activation',
- 'n_head': 'num_heads',
- 'n_layer': 'num_layers',
- 'n_token': 'vocab_size',
- }
- with open(os.path.join(args.model_dir, 'xlnet_config.json'), 'r') as f:
- tf_config = json.load(f)
- assert tf_config['untie_r']
- del tf_config['untie_r']
- del tf_config['d_head']
- assert len(tf_config) == len(tf_config_names_to_gluon_config_names)
- kwargs = {tf_config_names_to_gluon_config_names[k]: v for k, v in tf_config.items()}
- assert len(vocab) == kwargs['vocab_size']
- print(kwargs)
-
- # Load TF model
- tf_checkpoint_file = os.path.expanduser(os.path.join(args.model_dir, 'xlnet_model.ckpt'))
- tf_tensors = read_tf_checkpoint(tf_checkpoint_file)
-
- # Update kwargs
- kwargs['tie_decoder_weight'] = 'model/lm_loss/weight' not in tf_tensors
-
- # Initialize Gluon model
- model = XLNet(**kwargs)
- model.initialize(init=mx.init.Normal(0.02))
- model.hybridize()
-
- # Shape inference based on forward pass
- batch_size, qlen, mlen = 2, 16, 100
- mems = model.begin_mems(batch_size, mlen, context=mx.cpu())
- x = mx.nd.ones(shape=(batch_size, qlen))
- segments = mx.nd.random_normal(shape=(batch_size, qlen, mlen + qlen, 2))
- segments = segments < 0
- model(x, segments, mems)
-
- # Convert parameters
- set_params(model, tf_tensors, kwargs, tie_r=False)
-
- # Serialization
- tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp'))
- with open(tmp_file_path, 'w') as f:
- f.write(vocab.to_json())
- hash_full, hash_short = get_hash(tmp_file_path)
- gluon_vocab_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.vocab'))
- with open(gluon_vocab_path, 'w') as f:
- f.write(vocab.to_json())
- logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full)
- model.save_parameters(tmp_file_path)
- hash_full, hash_short = get_hash(tmp_file_path)
- os.remove(tmp_file_path)
- gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params'))
- logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full)
- model.save_parameters(gluon_param_path)
- mx.nd.waitall()
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Conversion script for the TensorFlow XLNet model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument(
- '--model-dir', type=str, required=True,
- help='Path to folder including the TensorFlow checkpoint `xlnet_model.ckpt`, '
-        'the SentencePiece model `spiece.model` and the model config `xlnet_config.json`')
- parser.add_argument('--out-dir', type=str, required=True,
- help='Path to output folder. The folder must exist.')
- parser.add_argument('--debug', action='store_true', help='debugging mode')
- args = parser.parse_args()
- logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
- logging.info(args)
-
- sys.path.append(os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
- from transformer import XLNet
-
- convert_xlnet(args)
diff --git a/scripts/language_model/conversion_utils/utils.py b/scripts/language_model/conversion_utils/utils.py
deleted file mode 100644
index d9c264c035..0000000000
--- a/scripts/language_model/conversion_utils/utils.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import hashlib
-import itertools
-import re
-
-
-def _filter_dict(func, dictionary):
- return {k: v for k, v in dictionary.items() if func(k, v)}
-
-
-def _split_dict(func, dictionary):
- part_one = _filter_dict(func, dictionary)
- part_two = _filter_dict(lambda *args: not func(*args), dictionary)
- return part_one, part_two
-
-
-def get_hash(filename):
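-    """Compute the sha1 hash of a file and return the full digest and its first 8 characters."""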
- sha1 = hashlib.sha1()
- with open(filename, 'rb') as f:
- while True:
- data = f.read(1048576)
- if not data:
- break
- sha1.update(data)
- return sha1.hexdigest(), str(sha1.hexdigest())[:8]
-
-
-def read_tf_checkpoint(path):
- """read tensorflow checkpoint"""
- from tensorflow.python import pywrap_tensorflow
- tensors = {}
- reader = pywrap_tensorflow.NewCheckpointReader(path)
- var_to_shape_map = reader.get_variable_to_shape_map()
- for key in sorted(var_to_shape_map):
- tensor = reader.get_tensor(key)
- tensors[key] = tensor
- return tensors
-
-
-def to_gluon_kwargs(tf_tensors):
- kwargs = dict()
-
- # Main model
- kwargs['num_layers'] = len(
- set(itertools.chain.from_iterable(re.findall(r'layer_\d*', k) for k in tf_tensors)))
- kwargs['hidden_size'] = tf_tensors['transformer/layer_0/ff/layer_2/kernel'].shape[0]
- kwargs['units'] = tf_tensors['transformer/layer_0/ff/layer_2/kernel'].shape[1]
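-    # r_w_bias is 3-dimensional (num_layers, num_heads, d_head) when a separate bias is
-    # learnt per layer; a 2-dimensional bias is shared (tied) across all layers.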
- tie_r = len(tf_tensors['transformer/r_w_bias'].shape) != 3
- kwargs['num_heads'] = tf_tensors['transformer/r_w_bias'].shape[0 if tie_r else 1]
-
- # Embedding and softmax
- if 'transformer/adaptive_embed/lookup_table' in tf_tensors:
- # Adaptive embedding is not used
- kwargs['embed_size'] = tf_tensors['transformer/adaptive_embed/lookup_table'].shape[1]
- kwargs['tie_input_output_embeddings'] = \
- 'transformer/adaptive_softmax/lookup_table' not in tf_tensors
- kwargs['tie_input_output_projections'] = \
- ['transformer/adaptive_softmax/proj' not in tf_tensors]
- else:
- # Adaptive embedding is used
- lookup_table_selector = 'transformer/adaptive_embed/cutoff_{i}/lookup_table'
- kwargs['embed_cutoffs'] = list(
- itertools.accumulate([
- tf_tensors[lookup_table_selector.format(i=i)].shape[0] for i in range(
- len(_filter_dict(lambda k, v: k.endswith('lookup_table'), tf_tensors)))
- ][:-1]))
- kwargs['embed_size'] = tf_tensors[lookup_table_selector.format(i=0)].shape[1]
- size_of_second = tf_tensors[lookup_table_selector.format(i=1)].shape[1]
- kwargs['embed_div_val'] = kwargs['embed_size'] // size_of_second
- assert kwargs['embed_size'] % size_of_second == 0
- kwargs['tie_input_output_embeddings'] = not bool(
- _filter_dict(
- lambda k, v: k.startswith('transformer/adaptive_softmax/cutoff_') and k.endswith(
- 'lookup_table'), tf_tensors))
- proj_selector = 'transformer/adaptive_softmax/cutoff_{i}/proj'
- kwargs['tie_input_output_projections'] = [
- proj_selector.format(i=i) not in tf_tensors
- for i in range(len(kwargs['embed_cutoffs']) + 1)
- ]
-        if kwargs['embed_size'] == kwargs['units'] and \
- 'transformer/adaptive_embed/cutoff_0/proj_W' not in tf_tensors:
- kwargs['project_same_dim'] = False
-
- # Dropout
- # All pre-trained TransformerXL models from
- # https://github.com/kimiyoung/transformer-xl come without dropout
- kwargs['dropout'] = 0
- kwargs['attention_dropout'] = 0
-
- print(kwargs)
- return kwargs, tie_r
diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst
deleted file mode 100644
index 9a69f347e0..0000000000
--- a/scripts/language_model/index.rst
+++ /dev/null
@@ -1,301 +0,0 @@
-Language Model
---------------
-
-:download:`Download scripts `
-
-Word Language Model
-~~~~~~~~~~~~~~~~~~~~
-
-Reference: Merity, S., et al. "`Regularizing and optimizing LSTM language models `_". ICLR 2018
-
-
-The key features used to reproduce the results for pre-trained models are listed in the following tables.
-
-.. editing URL for the following table: https://bit.ly/2PHSHvc
-
-The dataset used for training the models is wikitext-2.
-
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Model | awd_lstm_lm_1150_wikitext-2 | awd_lstm_lm_600_wikitext-2 | standard_lstm_lm_1500_wikitext-2 | standard_lstm_lm_650_wikitext-2 | standard_lstm_lm_200_wikitext-2 |
-+===============+============================================================================================================================+===========================================================================================================================+=================================================================================================================================+================================================================================================================================+================================================================================================================================+
-| Mode | LSTM | LSTM | LSTM | LSTM | LSTM |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Num_layers | 3 | 3 | 2 | 2 | 2 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Embed size | 400 | 200 | 1500 | 650 | 200 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Hidden size | 1150 | 600 | 1500 | 650 | 200 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Dropout | 0.4 | 0.2 | 0.65 | 0.5 | 0.2 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Dropout_h | 0.2 | 0.1 | 0 | 0 | 0 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Dropout_i | 0.65 | 0.3 | 0 | 0 | 0 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Dropout_e | 0.1 | 0.05 | 0 | 0 | 0 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Weight_drop | 0.5 | 0.2 | 0 | 0 | 0 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Val PPL | 68.71 | 84.89 | 86.51 | 90.96 | 107.59 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Test PPL | 65.62 | 80.67 | 82.29 | 86.91 | 101.64 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Command | [1] | [2] | [3] | [4] | [5] |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Training logs | `log `__ | `log `__ | `log `__ | `log `__ | `log `__ |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-
-For all the above model settings, we set Tied = True and NTASGD = True.
-
-[1] awd_lstm_lm_1150_wikitext-2 (Val PPL 68.71 Test PPL 65.62 )
-
-.. code-block:: console
-
- $ python word_language_model.py --gpu 0 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_1150_wikitext-2
-
-[2] awd_lstm_lm_600_wikitext-2 (Val PPL 84.89 Test PPL 80.67)
-
-.. code-block:: console
-
- $ python word_language_model.py --gpu 0 --emsize 200 --nhid 600 --epochs 750 --dropout 0.2 --dropout_h 0.1 --dropout_i 0.3 --dropout_e 0.05 --weight_drop 0.2 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_600_wikitext-2
-
-[3] standard_lstm_lm_1500_wikitext-2 (Val PPL 86.51 Test PPL 82.29)
-
-.. code-block:: console
-
- $ python word_language_model.py --gpu 0 --emsize 1500 --nhid 1500 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.65 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_1500_wikitext-2
-
-[4] standard_lstm_lm_650_wikitext-2 (Val PPL 90.96 Test PPL 86.91)
-
-.. code-block:: console
-
- $ python word_language_model.py --gpu 0 --emsize 650 --nhid 650 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.5 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_650_wikitext-2
-
-[5] standard_lstm_lm_200_wikitext-2 (Val PPL 107.59 Test PPL 101.64)
-
-.. code-block:: console
-
- $ python word_language_model.py --gpu 0 --emsize 200 --nhid 200 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.2 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_200_wikitext-2
-
-Cache Language Model
-~~~~~~~~~~~~~~~~~~~~~
-
-Reference: Grave, E., et al. "`Improving neural language models with a continuous cache `_". ICLR 2017
-
-The key features used to reproduce the results based on the corresponding pre-trained models are listed in the following tables.
-
-.. editing URL for the following table: https://bit.ly/2NkpklU
-
-The dataset used for training the models is wikitext-2.
-
-+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Model | cache_awd_lstm_lm_1150_wikitext-2 | cache_awd_lstm_lm_600_wikitext-2 | cache_standard_lstm_lm_1500_wikitext-2 | cache_standard_lstm_lm_650_wikitext-2 | cache_standard_lstm_lm_200_wikitext-2 |
-+=====================+===================================================================================================================================+==================================================================================================================================+========================================================================================================================================+=======================================================================================================================================+=======================================================================================================================================+
-| Pre-trained setting | Refer to: awd_lstm_lm_1150_wikitext-2 | Refer to: awd_lstm_lm_600_wikitext-2 | Refer to: standard_lstm_lm_1500_wikitext-2 | Refer to: standard_lstm_lm_650_wikitext-2 | Refer to: standard_lstm_lm_200_wikitext-2 |
-+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Val PPL | 53.41 | 64.51 | 65.54 | 68.47 | 77.51 |
-+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Test PPL | 51.46 | 62.19 | 62.79 | 65.85 | 73.74 |
-+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Command | [1] | [2] | [3] | [4] | [5] |
-+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Training logs | `log `__ | `log `__ | `log `__ | `log `__ | `log `__ |
-+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-
-For all the above model settings, we set lambdas = 0.1279, theta = 0.662, window = 2000, and bptt = 2000.
-
-[1] cache_awd_lstm_lm_1150_wikitext-2 (Val PPL 53.41 Test PPL 51.46)
-
-.. code-block:: console
-
- $ python cache_language_model.py --gpus 0 --model_name awd_lstm_lm_1150
-
-[2] cache_awd_lstm_lm_600_wikitext-2 (Val PPL 64.51 Test PPL 62.19)
-
-.. code-block:: console
-
- $ python cache_language_model.py --gpus 0 --model_name awd_lstm_lm_600
-
-[3] cache_standard_lstm_lm_1500_wikitext-2 (Val PPL 65.54 Test PPL 62.79)
-
-.. code-block:: console
-
- $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_1500
-
-[4] cache_standard_lstm_lm_650_wikitext-2 (Val PPL 68.47 Test PPL 65.85)
-
-.. code-block:: console
-
- $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_650
-
-[5] cache_standard_lstm_lm_200_wikitext-2 (Val PPL 77.51 Test PPL 73.74)
-
-.. code-block:: console
-
- $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_200
-
-Large Scale Word Language Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Reference: Jozefowicz, Rafal, et al. "`Exploring the limits of language modeling `_". arXiv preprint arXiv:1602.02410 (2016).
-
-The key features used to reproduce the results for pre-trained models are listed in the following tables.
-
-.. editing URL for the following table: https://bit.ly/2w28VXS
-
-The dataset used for training the models is Google's 1 billion words dataset.
-
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Model | LSTM-2048-512 |
-+=================+==============================================================================================================================+
-| Mode | LSTMP |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Num layers | 1 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Embed size | 512 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Hidden size | 2048 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Projection size | 512 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Dropout | 0.1 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Learning rate | 0.2 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Num samples | 8192 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Batch size | 128 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Gradient clip | 10.0 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Test perplexity | 43.62 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Num epochs | 50 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Training logs | `log `__ |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Evaluation logs | `log `__ |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-
-[1] LSTM-2048-512 (Test PPL 43.62)
-
-.. code-block:: console
-
- $ python large_word_language_model.py --gpus 0,1,2,3 --clip=10
- $ python large_word_language_model.py --gpus 4 --eval-only --batch-size=1
-
-
-XLNet: Generalized Autoregressive Pretraining for Language Understanding
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Reference: Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., &
-Le, Q. V. "`XLNet: Generalized Autoregressive Pretraining for Language
-Understanding. `_" arXiv preprint
-arXiv:1906.08237 (2019).
-
-
-The following pre-trained XLNet models are available from the **get_model** API:
-
-+-------------------+--------------------------+-----------------------------+
-| | xlnet_cased_l12_h768_a12 | xlnet_cased_l24_h1024_a16 |
-+===================+==========================+=============================+
-| 126gb | ✓ | ✓ |
-+-------------------+--------------------------+-----------------------------+
-
-where **126gb** refers to the 126 GB training corpus used by the XLNet
-paper authors.
-
-.. code-block:: python
-
- import gluonnlp as nlp; import mxnet as mx
- from transformer import get_model, XLNetTokenizer
- model, vocab, tokenizer = get_model('xlnet_cased_l12_h768_a12', dataset_name='126gb', use_decoder=True)
- indices = mx.nd.array([vocab.to_indices(tokenizer('Hello world'))])
- token_types = mx.nd.ones_like(indices)
- mems = model.begin_mems(batch_size=1, mem_len=500, context=indices.context)
- output, new_mems = model(indices, token_types, mems)
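-
-As a rough sketch (assuming the 12-layer model above with ``units=768``; the snippet simply
-continues the example), the returned ``new_mems`` can be fed back as the memory for the next
-segment:
-
-.. code-block:: python
-
-    # Hypothetical continuation: reuse the updated memory for the next text segment.
-    next_indices = mx.nd.array([vocab.to_indices(tokenizer('How are you?'))])
-    next_token_types = mx.nd.ones_like(next_indices)
-    output, new_mems = model(next_indices, next_token_types, new_mems)
-    print(output.shape)  # expected: (1, number of tokens, 768) for xlnet_cased_l12_h768_a12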
-
-Sentence Classification
-~~~~~~~~~~~~~~~~~~~~~~~
-
-GluonNLP provides the following example script for fine-tuning sentence classification tasks with a
-pre-trained XLNet model.
-
-Results using `xlnet_12_768_12`:
-
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-|Task Name |Metrics |Results on Dev Set |log |command |
-+=================+=====================+=======================+============================================================================================================================================+=================================================================================================================================================================+
-| CoLA |Matthew Corr. |59.33 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| SST-2 |Accuracy |94.61 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| MRPC |Accuracy/F1 |89.22/92.20 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| STS-B |Pearson Corr. |89.34 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| QQP |Accuracy |91.31 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| MNLI |Accuracy(m/mm) |87.19/86.45 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| QNLI |Accuracy |88 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| RTE |Accuracy |75.09 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-
-Results using `xlnet_24_1024_16`, following the hyperparameters reported by the paper authors:
-
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-|Task Name |Metrics |Results on Dev Set |log |command |
-+=================+=====================+=======================+============================================================================================================================================+=================================================================================================================================================================+
-| CoLA |Matthew Corr. |67 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| SST-2 |Accuracy |94 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| MRPC |Accuracy/F1 |90.2/93 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| STS-B |Pearson Corr. |91.37 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| QQP |Accuracy |91.94 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| MNLI |Accuracy(m/mm) |89.93/89.91 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| RTE |Accuracy |84.12 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-
-Question Answering on SQuAD
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| Dataset | SQuAD 1.1 | SQuAD 1.1 | SQuAD 2.0 | SQuAD 2.0 |
-+===========+=========================================================================================================================================================+==========================================================================================================================================================+==================================================================================================================================================================================================================================================================================================================+==================================================================================================================================================================================================================================================================================================================+
-| Model | xlnet_12_768_12 | xlnet_24_1024_16 | xlnet_12_768_12 | xlnet_24_1024_16 |
-+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| EM / F1 | 85.50 / 91.77 | 89.08 / 94.52 | 80.47 / 83.22 | 86.08 / 86.69 |
-+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| Log | `log `__ | `log `__ | `log `__ | `log `__ |
-+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| Command | `command `__ | `command `__ | `command `__ | `command `__ |
-+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| Prediction| `predictions.json `__ | `predictions.json `__ | `predictions.json `__ `null_odds.json `__ | `predictions.json `__ `null_odds.json `__ |
-+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-
-For `xlnet_24_1024_16`, we used the hyperparameters reported by the paper authors.
-
-
-To score the predictions on the dev data, you need to download the official evaluation script (`evaluate-v2.0.py `_).
-You can either place the script in the same folder as run_squad.py so that our script runs it automatically,
-or run it yourself with the following commands:
-
-SQuAD 1.1:
-
-.. code-block:: console
-
- $ python evaluate-v2.0.py dev-v2.0.json predictions.json
-
-SQuAD 2.0:
-
-.. code-block:: console
-
- $ python evaluate-v2.0.py dev-v2.0.json predictions.json --na-prob-file null_odds.json
diff --git a/scripts/language_model/large_word_language_model.py b/scripts/language_model/large_word_language_model.py
deleted file mode 100644
index 570b89e7e2..0000000000
--- a/scripts/language_model/large_word_language_model.py
+++ /dev/null
@@ -1,357 +0,0 @@
-"""
-Large Word Language Model
-=========================
-
-This example shows how to build a word-level language model on the Google Billion Words dataset
-with the Gluon NLP Toolkit.
-By using the existing data pipeline tools and building blocks, the process is greatly simplified.
-
-We implement the LSTM 2048-512 language model proposed in the following work.
-
-@article{jozefowicz2016exploring,
- title={Exploring the Limits of Language Modeling},
- author={Jozefowicz, Rafal and Vinyals, Oriol and Schuster, Mike and Shazeer, Noam and Wu, Yonghui},
- journal={arXiv preprint arXiv:1602.02410},
- year={2016}
-}
-
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import time
-import math
-import os
-import sys
-import argparse
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-from gluonnlp.utils import Parallel
-from gluonnlp.model.train.language_model import ParallelBigRNN
-from sampler import LogUniformSampler
-
-curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.append(os.path.join(curr_path, '..', '..'))
-
-nlp.utils.check_version('0.7.0')
-
-###############################################################################
-# Arg parser
-###############################################################################
-parser = argparse.ArgumentParser(description=
- 'Gluon-NLP Big LSTM 2048-512 Language Model on GBW')
-parser.add_argument('--save', type=str, default='model.params',
- help='path to save the final model.')
-parser.add_argument('--emsize', type=int, default=512,
- help='size of word embeddings')
-parser.add_argument('--nhid', type=int, default=2048,
- help='number of hidden units per layer')
-parser.add_argument('--nproj', type=int, default=512,
-                    help='number of projection units per layer. Can differ from emsize')
-parser.add_argument('--nlayers', type=int, default=1,
- help='number of layers')
-parser.add_argument('--from-epoch', type=int, default=None,
- help='start training or testing from the provided epoch')
-parser.add_argument('--epochs', type=int, default=50,
- help='number of epoch for training')
-parser.add_argument('--batch-size', type=int, default=128,
- help='batch size per gpu')
-parser.add_argument('--dropout', type=float, default=0.1,
- help='dropout applied to layers (0 = no dropout)')
-parser.add_argument('--eps', type=float, default=1,
- help='initial history accumulation for adagrad')
-parser.add_argument('--bptt', type=int, default=20,
- help='sequence length')
-parser.add_argument('--k', type=int, default=8192,
- help='number of noise samples for estimation')
-parser.add_argument('--gpus', type=str,
-                    help='list of GPUs to use, e.g. 0 or 0,2,5. Empty means using the CPU.')
-parser.add_argument('--log-interval', type=int, default=1000,
- help='report interval')
-parser.add_argument('--seed', type=int, default=0,
- help='random seed')
-parser.add_argument('--lr', type=float, default=0.2,
- help='initial learning rate')
-parser.add_argument('--clip', type=float, default=1.0,
- help='gradient clipping by global norm.')
-parser.add_argument('--test-mode', action='store_true',
- help='Whether to run through the script with few examples')
-parser.add_argument('--eval-only', action='store_true',
- help='Whether to only run evaluation for the trained model')
-args = parser.parse_args()
-
-segments = ['train', 'test']
-max_nbatch_eval = None
-
-if args.test_mode:
- args.emsize = 200
- args.log_interval = 1
- args.nhid = 200
- args.nlayers = 1
- args.epochs = 20
- max_nbatch_eval = 3
- segments = ['test', 'test']
-
-print(args)
-mx.random.seed(args.seed)
-np.random.seed(args.seed)
-
-context = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
- [mx.gpu(int(x)) for x in args.gpus.split(',')]
-
-os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
-os.environ['MXNET_CPU_PARALLEL_RAND_COPY'] = str(len(context))
-os.environ['MXNET_CPU_WORKER_NTHREADS'] = str(len(context))
-
-###############################################################################
-# Data stream
-###############################################################################
-train_data_stream, test_data_stream = \
-    [nlp.data.GBWStream(segment=segment, skip_empty=True, bos=None, eos='<eos>')
- for segment in segments]
-vocab = train_data_stream.vocab
-ntokens = len(vocab)
-
-# Sampler for generating negative classes during training with importance sampling
-sampler = LogUniformSampler(ntokens, args.k)
-
-# Given a list of (array, context) pairs, load array[i] on context[i]
-def _load(xs):
- ret = []
- for x, ctx in zip(xs, context):
- if isinstance(x, tuple):
- ret.append([y.as_in_context(ctx) for y in x])
- else:
- ret.append(x.as_in_context(ctx))
- return ret
-
-# Transformation applied to each training batch.
-# First, load the data, target and mask onto their target contexts.
-# Second, because the LSTM-2048-512 model decodes with importance sampling during
-# training, sample the negative candidate classes by invoking the log-uniform sampler.
-def _split_and_sample(x, y):
- m = x != vocab[vocab.padding_token] # mask padding
- num_ctx = len(context)
- if num_ctx > 1:
- xs = gluon.utils.split_data(x, num_ctx, batch_axis=1, even_split=True)
- ys = gluon.utils.split_data(y, num_ctx, batch_axis=1, even_split=True)
- ms = gluon.utils.split_data(m, num_ctx, batch_axis=1, even_split=True)
- else:
- xs, ys, ms = [x], [y], [m]
- xs = _load(xs)
- ys = _load(ys)
- ms = _load(ms)
- ss = [sampler(y) for y in ys]
- ss = _load(ss)
- return xs, ys, ms, ss
-
-train_batch_size = args.batch_size * len(context)
-train_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, train_batch_size)
-train_data = train_batchify(train_data_stream)
-train_data = train_data.transform(_split_and_sample)
-
-test_batch_size = args.batch_size
-test_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, test_batch_size)
-test_data = test_batchify(test_data_stream)
-test_data = nlp.data.PrefetchingStream(test_data)
-
-###############################################################################
-# Build the model
-###############################################################################
-
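-# The evaluation model decodes with a full softmax over the vocabulary, whereas the
-# training model approximates it with importance sampling over args.k noise classes.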
-eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid,
- args.nlayers, args.nproj,
- embed_dropout=args.dropout,
- encode_dropout=args.dropout)
-model = nlp.model.language_model.train.BigRNN(ntokens, args.emsize, args.nhid,
- args.nlayers, args.nproj, args.k,
- embed_dropout=args.dropout,
- encode_dropout=args.dropout)
-loss = gluon.loss.SoftmaxCrossEntropyLoss()
-
-###############################################################################
-# Training code
-###############################################################################
-
-def train():
- """Training loop for language model.
- """
- print(model)
- from_epoch = 0
- model.initialize(mx.init.Xavier(factor_type='out'), ctx=context)
- trainer_params = {'learning_rate': args.lr, 'wd': 0, 'eps': args.eps}
- trainer = gluon.Trainer(model.collect_params(), 'adagrad', trainer_params)
- if args.from_epoch:
- from_epoch = args.from_epoch
- checkpoint_name = '%s.%s'%(args.save, format(from_epoch - 1, '02d'))
- model.load_parameters(checkpoint_name)
- trainer.load_states('%s.state'%args.save)
- print('Loaded parameters from checkpoint %s'%(checkpoint_name))
-
- model.hybridize(static_alloc=True, static_shape=True)
- encoder_params = model.encoder.collect_params().values()
- embedding_params = list(model.embedding.collect_params().values())
- parallel_model = ParallelBigRNN(model, loss, args.batch_size)
- parallel = Parallel(len(context), parallel_model)
- for epoch in range(from_epoch, args.epochs):
- sys.stdout.flush()
- total_L = 0.0
- start_epoch_time = time.time()
- start_log_interval_time = time.time()
- hiddens = [model.begin_state(batch_size=args.batch_size,
- func=mx.nd.zeros, ctx=ctx) for ctx in context]
- nbatch = 0
- has_next = True
- train_data_iter = iter(train_data)
- data, target, mask, sample = next(train_data_iter)
-
- while has_next:
- nbatch += 1
- hiddens = detach(hiddens)
- Ls = []
- for _, batch in enumerate(zip(data, target, mask, sample, hiddens)):
- parallel.put(batch)
-
- for _ in range(len(data)):
- hidden, ls = parallel.get()
- # hidden states are ordered by context id
- index = context.index(hidden[0].context)
- hiddens[index] = hidden
- Ls.append(ls)
-
- # prefetch the next batch of data
- try:
- data, target, mask, sample = next(train_data_iter)
- except StopIteration:
- has_next = False
-
- # rescale embedding grad
- for ctx in context:
- x = embedding_params[0].grad(ctx)
- x[:] *= args.batch_size
- encoder_grad = [p.grad(ctx) for p in encoder_params]
- # perform gradient clipping per ctx
- gluon.utils.clip_global_norm(encoder_grad, args.clip)
-
- trainer.step(len(context))
-
- total_L += sum([mx.nd.sum(L).asscalar() / args.bptt for L in Ls])
-
- if nbatch % args.log_interval == 0:
- cur_L = total_L / args.log_interval / len(context)
- ppl = math.exp(cur_L) if cur_L < 100 else float('inf')
- print('[Epoch %d Batch %d] loss %.2f, ppl %.2f, '
- 'throughput %.2f samples/s'
- %(epoch, nbatch, cur_L, ppl,
- train_batch_size*args.log_interval/(time.time()-start_log_interval_time)))
- total_L = 0.0
- start_log_interval_time = time.time()
- sys.stdout.flush()
-
- end_epoch_time = time.time()
- print('Epoch %d took %.2f seconds.'%(epoch, end_epoch_time - start_epoch_time))
- mx.nd.waitall()
- checkpoint_name = '%s.%s'%(args.save, format(epoch, '02d'))
- model.save_parameters(checkpoint_name)
- trainer.save_states('%s.state'%args.save)
-
-def detach(hidden):
- if isinstance(hidden, (tuple, list)):
- hidden = [detach(h) for h in hidden]
- else:
- hidden = hidden.detach()
- return hidden
-
-def test(data_stream, batch_size, ctx=None):
- """Evaluate the model on the dataset.
-
- Parameters
- ----------
- data_stream : DataStream
- The dataset to evaluate on.
- batch_size : int
- The size of the mini-batch.
- ctx : mx.cpu() or mx.gpu()
- The context of the computation.
-
- Returns
- -------
- loss: float
- The loss on the dataset
- """
- total_L = 0.0
- ntotal = 0
- nbatch = 0
- hidden = eval_model.begin_state(batch_size=batch_size, func=mx.nd.zeros, ctx=ctx)
- start_time = time.time()
- for data, target in data_stream:
- data = data.as_in_context(ctx)
- target = target.as_in_context(ctx)
- mask = data != vocab[vocab.padding_token]
- output, hidden = eval_model(data, hidden)
- hidden = detach(hidden)
- output = output.reshape((-3, -1))
- L = loss(output, target.reshape(-1,)) * mask.reshape((-1,))
- total_L += L.mean()
- ntotal += mask.mean()
- nbatch += 1
- avg = total_L / ntotal
- if nbatch % args.log_interval == 0:
- avg_scalar = float(avg.asscalar())
- ppl = math.exp(avg_scalar)
- throughput = batch_size*args.log_interval/(time.time()-start_time)
- print('Evaluation batch %d: test loss %.2f, test ppl %.2f, '
- 'throughput = %.2f samples/s'%(nbatch, avg_scalar, ppl, throughput))
- start_time = time.time()
- if max_nbatch_eval and nbatch > max_nbatch_eval:
- print('Quit evaluation early at batch %d'%nbatch)
- break
- return float(avg.asscalar())
-
-def evaluate():
- """ Evaluate loop for the trained model """
- print(eval_model)
- eval_model.initialize(mx.init.Xavier(), ctx=context[0])
- eval_model.hybridize(static_alloc=True, static_shape=True)
- epoch = args.from_epoch if args.from_epoch else 0
- while epoch < args.epochs:
- checkpoint_name = '%s.%s'%(args.save, format(epoch, '02d'))
- if not os.path.exists(checkpoint_name):
- print('Wait for a new checkpoint...')
- # check again after 600 seconds
- time.sleep(600)
- continue
- eval_model.load_parameters(checkpoint_name)
- print('Loaded parameters from checkpoint %s'%(checkpoint_name))
- start_epoch_time = time.time()
- final_test_L = test(test_data, test_batch_size, ctx=context[0])
- end_epoch_time = time.time()
- print('[Epoch %d] test loss %.2f, test ppl %.2f'%
- (epoch, final_test_L, math.exp(final_test_L)))
- print('Epoch %d took %.2f seconds.'%(epoch, end_epoch_time - start_epoch_time))
- sys.stdout.flush()
- epoch += 1
-
-if __name__ == '__main__':
- if args.eval_only:
- evaluate()
- else:
- train()
diff --git a/scripts/language_model/model/XLNet_classifier.py b/scripts/language_model/model/XLNet_classifier.py
deleted file mode 100644
index 18f91526b1..0000000000
--- a/scripts/language_model/model/XLNet_classifier.py
+++ /dev/null
@@ -1,90 +0,0 @@
-"""Model for sentence (pair) classification task/ regression with XLnet.
-"""
-from mxnet.gluon import Block
-from mxnet.gluon import nn
-import mxnet as mx
-
-
-class XLNetClassifier(Block):
- """XLNet Classifier
- """
- def __init__(self, xl, units=768, num_classes=2, dropout=0.0,
- prefix=None, params=None):
- super(XLNetClassifier, self).__init__(prefix=prefix, params=params)
- self.xlnet = xl
- self._units = units
- with self.name_scope():
- self.classifier = nn.HybridSequential(prefix=prefix)
- if dropout:
- self.classifier.add(nn.Dropout(rate=dropout))
- self.classifier.add(nn.Dense(units=num_classes, flatten=False))
- self.pooler = nn.Dense(units=units, flatten=False, activation='tanh', prefix=prefix)
-
- def __call__(self, inputs, token_types, valid_length=None, mems=None):
- # pylint: disable=arguments-differ
- """Generate the unnormalized score for the given the input sequences.
-
- Parameters
- ----------
- inputs : NDArray or Symbol, shape (batch_size, seq_length)
- Input words for the sequences.
- token_types : NDArray or Symbol, shape (batch_size, seq_length)
- Token types for the sequences, used to indicate whether the word belongs to the
- first sentence or the second one.
- valid_length : NDArray or Symbol, or None, shape (batch_size)
- Valid length of the sequence. This is used to mask the padded tokens.
-
- Returns
- -------
- outputs : NDArray or Symbol
- Shape (batch_size, num_classes)
- """
- return super(XLNetClassifier, self).__call__(inputs, token_types, valid_length, mems)
-
- def _apply_pooling(self, sequence, valid_length):
- """Generate the representation given the inputs.
-
- This is used for pre-training or fine-tuning an XLNet model.
- """
- F = mx.ndarray
- index = F.contrib.arange_like(sequence, axis=0, ctx=sequence.context).expand_dims(1)
- valid_length_rs = valid_length.reshape((-1, 1)) - 1
- gather_index = F.concat(index, valid_length_rs).T
- cls_states = F.gather_nd(sequence, gather_index)
- return self.pooler(cls_states)
-
- def _padding_mask(self, inputs, valid_length):
- F = mx.ndarray
- valid_length = valid_length.astype(inputs.dtype)
- steps = F.contrib.arange_like(inputs, axis=1)
- ones = F.ones_like(steps)
- mask = F.broadcast_lesser(F.reshape(steps, shape=(1, -1)),
- F.reshape(valid_length, shape=(-1, 1)))
- mask = F.broadcast_mul(F.expand_dims(mask, axis=1),
- F.broadcast_mul(ones, F.reshape(ones, shape=(-1, 1))))
- return mask
-
- def forward(self, inputs, token_types, valid_length=None, mems=None):
- # pylint: disable=arguments-differ
- """Generate the unnormalized score for the given the input sequences.
-
- Parameters
- ----------
- inputs : NDArray or Symbol, shape (batch_size, seq_length)
- Input words for the sequences.
- token_types : NDArray or Symbol, shape (batch_size, seq_length)
- Token types for the sequences, used to indicate whether the word belongs to the
- first sentence or the second one.
- valid_length : NDArray or None, shape (batch_size)
- Valid length of the sequence. This is used to mask the padded tokens.
-
- Returns
- -------
- outputs : NDArray
- Shape (batch_size, num_classes)
- """
- attention_mask = self._padding_mask(inputs, valid_length).astype('float32')
- output, _ = self.xlnet(inputs, token_types, mems, attention_mask)
- output = self._apply_pooling(output, valid_length.astype('float32'))
- pooler_out = self.pooler(output)
- return self.classifier(pooler_out)
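The index arithmetic in `_apply_pooling` above simply picks, for every example, the hidden state of the last valid token (XLNet appends its [CLS]-style token at the end of the sequence) before feeding it to the pooler. A minimal NumPy sketch of the same selection, assuming a (batch_size, seq_length, units) output layout:

import numpy as np

def last_valid_token(sequence, valid_length):
    # Illustrative equivalent of the gather_nd construction in
    # XLNetClassifier._apply_pooling: pick the hidden state of the last
    # valid token of every example. Shapes are assumed, not taken from
    # the deleted module.
    batch_idx = np.arange(sequence.shape[0])
    return sequence[batch_idx, valid_length - 1]

seq = np.random.randn(2, 5, 4)           # (batch, seq_len, units)
lengths = np.array([3, 5])
pooled = last_valid_token(seq, lengths)  # shape (2, 4)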
diff --git a/scripts/language_model/model/qa.py b/scripts/language_model/model/qa.py
deleted file mode 100644
index 73619efff6..0000000000
--- a/scripts/language_model/model/qa.py
+++ /dev/null
@@ -1,345 +0,0 @@
-"""XLNetForQA models."""
-
-import mxnet as mx
-from mxnet.gluon import HybridBlock, Block, loss, nn
-
-
-class PoolerStartLogits(HybridBlock):
- """ Compute SQuAD start_logits from sequence hidden states."""
- def __init__(self, prefix=None, params=None):
- super(PoolerStartLogits, self).__init__(prefix=prefix, params=params)
- self.dense = nn.Dense(1, flatten=False)
-
- def __call__(self, hidden_states, p_masks=None):
- # pylint: disable=arguments-differ
- return super(PoolerStartLogits, self).__call__(hidden_states, p_masks)
-
- def hybrid_forward(self, F, hidden_states, p_mask):
- """Get start logits from the model output.
-
- Parameters
- ----------
- hidden_states : NDArray, shape (batch_size, seq_length, hidden_size)
- p_mask : NDArray or None, shape(batch_size, seq_length)
-
- Returns
- -------
- x : NDarray, shape(batch_size, seq_length)
- Masked start logits.
- """
- # pylint: disable=arguments-differ
- x = self.dense(hidden_states).squeeze(-1)
- if p_mask is not None:
- x = x * (1 - p_mask) - 1e30 * p_mask
- return x
-
-
-class PoolerEndLogits(HybridBlock):
- """ Compute SQuAD end_logits from sequence hidden states and start token hidden state."""
- def __init__(self, units=768, is_eval=False, prefix=None, params=None):
- super(PoolerEndLogits, self).__init__(prefix=prefix, params=params)
- self._eval = is_eval
- self._hsz = units
- with self.name_scope():
- self.dense_0 = nn.Dense(units, activation='tanh', flatten=False)
- self.dense_1 = nn.Dense(1, flatten=False)
- self.layernorm = nn.LayerNorm(epsilon=1e-12, in_channels=units)
-
- def __call__(self,
- hidden_states,
- start_states=None,
- start_positions=None,
- p_masks=None):
- # pylint: disable=arguments-differ
- return super(PoolerEndLogits,
- self).__call__(hidden_states, start_states,
- start_positions, p_masks)
-
- def hybrid_forward(self, F, hidden_states, start_states, start_positions, p_mask):
- # pylint: disable=arguments-differ
- """Get end logits from the model output and start states or start positions.
-
- Parameters
- ----------
- hidden_states : NDArray, shape (batch_size, seq_length, hidden_size)
- start_states : NDArray, shape (batch_size, seq_length, start_n_top, hidden_size)
- Used during inference
- start_positions : NDArray, shape (batch_size)
- Ground-truth start positions used during training.
- p_mask : NDArray or None, shape(batch_size, seq_length)
-
- Returns
- -------
- x : NDarray, shape(batch_size, seq_length)
- Masked end logits.
- """
- if not self._eval:
- start_states = F.gather_nd(
- hidden_states,
- F.concat(
- F.contrib.arange_like(hidden_states,
- axis=0).expand_dims(1),
- start_positions.expand_dims(
- 1)).transpose()) # shape(bsz, hsz)
- start_states = start_states.expand_dims(1)
- start_states = F.broadcast_like(
- start_states, hidden_states) # shape (bsz, slen, hsz)
- x = self.dense_0(F.concat(hidden_states, start_states, dim=-1))
- x = self.layernorm(x)
- x = self.dense_1(x).squeeze(-1)
- if p_mask is not None and self._eval:
- p_mask = p_mask.expand_dims(-1)
- p_mask = F.broadcast_like(p_mask, x)
- if p_mask is not None:
- x = x * (1 - p_mask) - 1e30 * p_mask
- return x
-
-
-class XLNetPoolerAnswerClass(HybridBlock):
- """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """
- def __init__(self, units=768, dropout=0.1, prefix=None, params=None):
- super(XLNetPoolerAnswerClass, self).__init__(prefix=prefix,
- params=params)
- with self.name_scope():
- self._units = units
- self.dense_0 = nn.Dense(units,
- in_units=2 * units,
- activation='tanh',
- use_bias=True,
- flatten=False)
- self.dense_1 = nn.Dense(1,
- in_units=units,
- use_bias=False,
- flatten=False)
- self._dropout = nn.Dropout(dropout)
-
- def __call__(self, hidden_states, start_states=None, cls_index=None):
- # pylint: disable=arguments-differ
- return super(XLNetPoolerAnswerClass,
- self).__call__(hidden_states, start_states, cls_index)
-
- def hybrid_forward(self, F, hidden_states, start_states, cls_index):
- # pylint: disable=arguments-differ
- """Get answerability logits from the model output and start states.
-
- Parameters
- ----------
- hidden_states : NDArray, shape (batch_size, seq_length, hidden_size)
- start_states : NDArray, shape (batch_size, hidden_size)
- Typically weighted average hidden_states along second dimension.
- cls_index : NDArray, shape (batch_size)
- Index of [CLS] token in sequence.
-
- Returns
- -------
- x : NDarray, shape(batch_size,)
- CLS logits.
- """
- index = F.contrib.arange_like(hidden_states,
- axis=0).expand_dims(1)
- valid_length_rs = cls_index.reshape((-1, 1)) - 1
- gather_index = F.transpose(F.concat(index, valid_length_rs), axes=(1, 0))
- cls_token_state = F.gather_nd(hidden_states, gather_index)
-
- x = self.dense_0(F.concat(start_states, cls_token_state, dim=-1))
- x = self._dropout(x)
- x = self.dense_1(x).squeeze(-1)
- return x
-
-
-class XLNetForQA(Block):
- """Model for SQuAD task with XLNet.
-
- Parameters
- ----------
- xlnet_base: XLNet Block
- start_top_n : int
- Number of start position candidates during inference.
- end_top_n : int
- Number of end position candidates for each start position during inference.
- is_eval : Bool
- If set to True, do inference.
- prefix : str or None
- See document of `mx.gluon.Block`.
- params : ParameterDict or None
- See document of `mx.gluon.Block`.
- """
- def __init__(self,
- xlnet_base,
- start_top_n=None,
- end_top_n=None,
- is_eval=False,
- units=768,
- prefix=None,
- params=None):
- super(XLNetForQA, self).__init__(prefix=prefix, params=params)
- with self.name_scope():
- self.xlnet = xlnet_base
- self.start_top_n = start_top_n
- self.end_top_n = end_top_n
- self.loss = loss.SoftmaxCELoss()
- self.start_logits = PoolerStartLogits()
- self.end_logits = PoolerEndLogits(units=units, is_eval=is_eval)
- self.eval = is_eval
- self.answer_class = XLNetPoolerAnswerClass(units=units)
- self.cls_loss = loss.SigmoidBinaryCrossEntropyLoss()
-
- def __call__(self,
- inputs,
- token_types,
- valid_length=None,
- label=None,
- p_mask=None,
- is_impossible=None,
- mems=None):
- #pylint: disable=arguments-differ
- """Generate the unnormalized score for the given the input sequences."""
- valid_length = [] if valid_length is None else valid_length
- return super(XLNetForQA,
- self).__call__(inputs, token_types, valid_length, p_mask,
- label, is_impossible, mems)
-
- def _padding_mask(self, inputs, valid_length, left_pad=False):
- F = mx.ndarray
- if left_pad:
- # left padding
- valid_length_start = valid_length.astype('int64')
- steps = F.contrib.arange_like(inputs, axis=1) + 1
- ones = F.ones_like(steps)
- mask = F.broadcast_greater(
- F.reshape(steps, shape=(1, -1)),
- F.reshape(valid_length_start, shape=(-1, 1)))
- mask = F.broadcast_mul(
- F.expand_dims(mask, axis=1),
- F.broadcast_mul(ones, F.reshape(ones, shape=(-1, 1))))
- else:
- # right padding
- valid_length = valid_length.astype(inputs.dtype)
- steps = F.contrib.arange_like(inputs, axis=1)
- ones = F.ones_like(steps)
- mask = F.broadcast_lesser(F.reshape(steps, shape=(1, -1)),
- F.reshape(valid_length, shape=(-1, 1)))
- mask = F.broadcast_mul(
- F.expand_dims(mask, axis=1),
- F.broadcast_mul(ones, F.reshape(ones, shape=(-1, 1))))
- return mask
-
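For right padding, `_padding_mask` above builds a (batch_size, seq_length, seq_length) attention mask whose entry [b, q, k] is 1 exactly when key position k falls inside the valid region of example b; the query axis is only broadcast. A NumPy sketch of the same tensor, under the right-padding assumption:

import numpy as np

def right_padding_mask(seq_length, valid_length):
    # Illustrative equivalent of XLNetForQA._padding_mask (right padding):
    # mask[b, q, k] == 1.0 iff k < valid_length[b]; every query row of one
    # example gets the same key mask.
    key_is_valid = np.arange(seq_length)[None, :] < valid_length[:, None]  # (batch, seq_len)
    return np.repeat(key_is_valid[:, None, :], seq_length, axis=1).astype('float32')

mask = right_padding_mask(seq_length=4, valid_length=np.array([2, 4]))
# mask[0] rows are all [1, 1, 0, 0]; mask[1] rows are all [1, 1, 1, 1]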
- def forward(self, inputs, token_types, valid_length, p_mask, label,
- is_impossible, mems):
- # pylint: disable=arguments-differ
- """Generate the unnormalized score for the given the input sequences.
-
- Parameters
- ----------
- inputs : NDArray, shape (batch_size, seq_length)
- Input words for the sequences.
- token_types : NDArray, shape (batch_size, seq_length)
- Token types for the sequences, used to indicate whether the word belongs to the
- first sentence or the second one.
- valid_length : NDArray or None, shape (batch_size,)
- Valid length of the sequence. This is used to mask the padded tokens.
- p_mask : NDArray or None, shape (batch_size, seq_length)
- We do not want special tokens (e.g., [SEP], [PAD]) and question tokens to be
- included in the answer. Set to 1 to mask the token.
- label : NDArray, shape (batch_size, 1)
- Ground-truth label (start/end position) for loss computation.
- is_impossible : NDArray or None, shape (batch_size, 1)
- Ground-truth label (is impossible) for loss computation. Set to None for SQuAD 1.1.
- mems : NDArray
- We do not use memory (a Transformer-XL component) during fine-tuning.
-
- Returns
- -------
- For training we have:
- total_loss : list of NDArray
- Specifically, we have a span loss (batch_size, ) and a cls_loss (batch_size, )
- total_loss_sum : NDArray
-
- For inference we have:
- start_top_log_probs : NDArray, shape (batch_size, start_n_top, )
- start_top_index : NDArray, shape (batch_size, start_n_top)
- end_top_log_probs : NDArray, shape (batch_size, start_n_top * end_n_top)
- end_top_index : NDArray, shape (batch_size, start_n_top * end_n_top)
- cls_logits : NDArray or None, shape (batch_size, )
- """
- if isinstance(valid_length, list) and len(valid_length) == 0:
- valid_length = None
- attention_mask = self._padding_mask(inputs,
- valid_length).astype('float32')
- output, _ = self.xlnet(inputs, token_types, mems, attention_mask)
- start_logits = self.start_logits(output,
- p_masks=p_mask) # shape (bsz, slen)
- bsz, slen, hsz = output.shape
- if not self.eval:
- # training
- start_positions, end_positions = label
- end_logit = self.end_logits(output,
- start_positions=start_positions,
- p_masks=p_mask)
- span_loss = (self.loss(start_logits, start_positions) +
- self.loss(end_logit, end_positions)) / 2
-
- total_loss = [span_loss]
-
- # get cls loss
- start_log_probs = mx.nd.softmax(start_logits, axis=-1)
- start_states = mx.nd.batch_dot(output,
- start_log_probs.expand_dims(-1),
- transpose_a=True).squeeze(-1)
-
- cls_logits = self.answer_class(output, start_states,
- valid_length)
- cls_loss = self.cls_loss(cls_logits, is_impossible)
- total_loss.append(0.5 * cls_loss)
- total_loss_sum = span_loss + 0.5 * cls_loss
- return total_loss, total_loss_sum
- else:
- #inference
- start_log_probs = mx.nd.log_softmax(start_logits,
- axis=-1) # shape (bsz, slen)
- start_top_log_probs, start_top_index = mx.ndarray.topk(
- start_log_probs, k=self.start_top_n, axis=-1,
- ret_typ='both') # shape (bsz, start_n_top)
- index = mx.nd.concat(*[
- mx.nd.arange(bsz, ctx=start_log_probs.context).expand_dims(1)
- ] * self.start_top_n).reshape(bsz * self.start_top_n, 1)
- start_top_index_rs = start_top_index.reshape((-1, 1))
- gather_index = mx.nd.concat(
- index, start_top_index_rs).T #shape(2, bsz * start_n_top)
- start_states = mx.nd.gather_nd(output, gather_index).reshape(
- (bsz, self.start_top_n, hsz)) #shape (bsz, start_n_top, hsz)
-
- start_states = start_states.expand_dims(1)
- start_states = mx.nd.broadcast_to(
- start_states, (bsz, slen, self.start_top_n,
- hsz)) # shape (bsz, slen, start_n_top, hsz)
- hidden_states_expanded = output.expand_dims(2)
- hidden_states_expanded = mx.ndarray.broadcast_to(
- hidden_states_expanded, shape=start_states.shape
- ) # shape (bsz, slen, start_n_top, hsz)
- end_logits = self.end_logits(
- hidden_states_expanded,
- start_states=start_states,
- p_masks=p_mask) # shape (bsz, slen, start_n_top)
- end_log_probs = mx.nd.log_softmax(
- end_logits, axis=1) # shape (bsz, slen, start_n_top)
- # Note that end_top_index and end_top_log_probs have shape (bsz, end_n_top, start_n_top),
- # so for each start position there are end_n_top candidate end positions along the second dim.
- end_top_log_probs, end_top_index = mx.ndarray.topk(
- end_log_probs, k=self.end_top_n, axis=1,
- ret_typ='both') # shape (bsz, end_n_top, start_n_top)
- end_top_log_probs = end_top_log_probs.reshape(
- (-1, self.start_top_n * self.end_top_n))
- end_top_index = end_top_index.reshape(
- (-1, self.start_top_n * self.end_top_n))
-
- start_probs = mx.nd.softmax(start_logits, axis=-1)
- start_states = mx.nd.batch_dot(output,
- start_probs.expand_dims(-1),
- transpose_a=True).squeeze(-1)
- cls_logits = self.answer_class(output, start_states,
- valid_length)
-
- outputs = (start_top_log_probs, start_top_index, end_top_log_probs,
- end_top_index, cls_logits)
- return outputs
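At inference time, the forward pass above returns the top start_n_top start positions per example and, for each of them, end_n_top end positions; the final answer selection (including the SQuAD 2.0 null-answer score) happens in `xlnet_qa_evaluate.predict_extended`, which is not part of this file. Purely as a sketch of the idea, candidate spans for one example could be ranked by the summed start/end log-probabilities, working on the un-flattened (end_n_top, start_n_top) layout produced just before the final reshape:

import numpy as np

def rank_spans(start_top_log_probs, start_top_index,
               end_top_log_probs, end_top_index, max_answer_length=64):
    # start_top_log_probs/start_top_index: shape (start_n_top,)
    # end_top_log_probs/end_top_index:     shape (end_n_top, start_n_top)
    # Illustrative only; the real post-processing lives in
    # xlnet_qa_evaluate.predict_extended.
    candidates = []
    for s_rank, (s_logp, s_idx) in enumerate(zip(start_top_log_probs, start_top_index)):
        for e_logp, e_idx in zip(end_top_log_probs[:, s_rank], end_top_index[:, s_rank]):
            if e_idx < s_idx or e_idx - s_idx + 1 > max_answer_length:
                continue  # drop malformed or over-long spans
            candidates.append((float(s_logp + e_logp), int(s_idx), int(e_idx)))
    return sorted(candidates, reverse=True)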
diff --git a/scripts/language_model/run_glue.py b/scripts/language_model/run_glue.py
deleted file mode 100644
index 7f9041f836..0000000000
--- a/scripts/language_model/run_glue.py
+++ /dev/null
@@ -1,658 +0,0 @@
-"""
-Sentence Pair Classification with XLNet
-"""
-import io
-import os
-import time
-import argparse
-import random
-import logging
-import warnings
-from functools import partial
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-from gluonnlp.data.classification import get_task
-from gluonnlp.data.bert.glue import truncate_seqs_equal, concat_sequences
-from model.XLNet_classifier import XLNetClassifier
-from transformer import model
-
-parser = argparse.ArgumentParser(
- description='XLNet fine-tune examples for classification/regression tasks.',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
-# Training config
-parser.add_argument('--epochs', type=int, default=3, help='number of epochs.')
-parser.add_argument('--training_steps',
- type=int,
- help='If specified, epochs will be ignored.')
-parser.add_argument(
- '--batch_size',
- type=int,
- default=128,
- help='Batch size. Number of examples per gpu in a minibatch.')
-
-parser.add_argument(
- '--accumulate',
- type=int,
- default=None,
- help=
- 'The number of batches for gradient accumulation to simulate a large batch size. '
- 'Default is None')
-
-parser.add_argument('--dev_batch_size',
- type=int,
- default=32,
- help='Batch size for dev set and test set')
-
-parser.add_argument('--dropout', type=float, default=0.1, help='dropout')
-parser.add_argument('--attention_dropout',
- type=float,
- default=0.1,
- help='attention dropout')
-parser.add_argument('--log_interval',
- type=int,
- default=10,
- help='report interval')
-parser.add_argument(
- '--early_stop',
- type=int,
- default=None,
- help='Whether to perform early stopping based on the metric on dev set. '
- 'The provided value is the patience. ')
-
-# Optimizer config
-parser.add_argument('--optimizer', type=str, default='Adam', help='')
-parser.add_argument('--lr',
- type=float,
- default=3e-5,
- help='Initial learning rate')
-parser.add_argument('--lr_decay',
- type=str,
- choices=['linear'],
- default='linear',
- help='lr schedule')
-parser.add_argument('--epsilon',
- type=float,
- default=1e-6,
- help='Small value to avoid division by 0')
-parser.add_argument(
- '--warmup_ratio',
- type=float,
- default=0,
- help='ratio of warmup steps that linearly increase the learning rate from 0 to its target value')
-
-# task specific & data preprocessing
-parser.add_argument('--gpu',
- type=int,
- default=None,
- help='Number of gpus for finetuning.')
-parser.add_argument('--task_name',
- default='MRPC',
- choices=['MRPC', 'QNLI', 'RTE', 'STS-B', 'CoLA',
- 'MNLI', 'WNLI', 'SST', 'XNLI', 'LCQMC', 'ChnSentiCorp'],
- type=str,
- help='The name of the task to fine-tune.')
-
-parser.add_argument(
- '--model_name',
- type=str,
- default='xlnet_cased_l12_h768_a12',
- choices=['xlnet_cased_l24_h1024_a16', 'xlnet_cased_l12_h768_a12'],
- help='The name of pre-trained XLNet model to fine-tune')
-
-parser.add_argument('--dataset',
- type=str,
- default='126gb',
- help='The dataset the XLNet model was pre-trained on.')
-parser.add_argument('--max_len',
- type=int,
- default=128,
- help='Maximum length of the sentence pairs')
-
-parser.add_argument(
- '--round_to', type=int, default=None,
- help='The length of padded sequences will be rounded up to be a multiple of this argument. '
- 'When round_to is set to 8, training throughput may increase for mixed precision '
- 'training on GPUs with tensor cores.')
-
-parser.add_argument(
- '--only_inference',
- action='store_true',
- help=
- 'If set, we skip training and only perform inference on dev and test data.'
-)
-
-# Initializing config
-parser.add_argument('--seed', type=int, default=2, help='Random seed')
-
-# I/O config
-parser.add_argument(
- '--output_dir',
- type=str,
- default='./output_dir',
- help='The output directory where the model params will be written.')
-parser.add_argument(
- '--model_parameters',
- type=str,
- default=None,
- help='A parameter file for the model that is loaded into the model'
- ' before training/inference. It is different from the parameter'
- ' file written after the model is trained.')
-
-args = parser.parse_args()
-
-
-def split_array(arr, num_of_splits):
- """split an array into equal pieces"""
- # TODO Replace this function with gluon.utils.split_data() once targeting MXNet 1.7
- size = arr.shape[0]
- if size < num_of_splits:
- return [arr[i:i + 1] for i in range(size)]
- slice_len, rest = divmod(size, num_of_splits)
- div_points = [0] + [(slice_len * index + min(index, rest) + slice_len +
- (index < rest)) for index in range(num_of_splits)]
- slices = [
- arr[div_points[i]:div_points[i + 1]] for i in range(num_of_splits)
- ]
- return slices
-
-
-def split_and_load(arrs, _ctxs):
- """split and load arrays to a list of contexts"""
- # TODO Replace split_array() with gluon.utils.split_data() once targeting MXNet 1.7
- assert isinstance(arrs, (list, tuple))
- # split and load
- loaded_arrs = [[
- i.as_in_context(ctx)
- for i, ctx in zip(split_array(arr, len(_ctxs)), _ctxs)
- ] for arr in arrs]
- return zip(*loaded_arrs)
-
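For reference, `split_array` above splits along the batch axis as evenly as possible, handing the first `size % num_of_splits` slices one extra row each. A quick usage sketch with made-up shapes, relying on the `split_array` defined above:

import numpy as np

batch = np.zeros((10, 128))                  # e.g. (batch_size, seq_length)
pieces = split_array(batch, num_of_splits=3)
print([p.shape[0] for p in pieces])          # -> [4, 3, 3]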
-
-def convert_examples_to_features(example,
- tokenizer=None,
- truncate_length=512,
- cls_token=None,
- sep_token=None,
- class_labels=None,
- label_alias=None,
- vocab=None,
- is_test=False):
- #pylint: disable=redefined-outer-name
- """convert glue examples into necessary features"""
- assert vocab
- if not is_test:
- label_dtype = 'int32' if class_labels else 'float32'
- # get the label
- label = example[-1]
- example = example[:-1]
- #create label maps if classification task
- if class_labels:
- label_map = {}
- for (i, l) in enumerate(class_labels):
- label_map[l] = i
- if label_alias:
- for key in label_alias:
- label_map[key] = label_map[label_alias[key]]
- label = label_map[label]
- label = np.array([label], dtype=label_dtype)
-
- # tokenize raw text
- tokens_raw = [tokenizer(l) for l in example]
- # truncate to the truncate_length,
- tokens_trun = truncate_seqs_equal(tokens_raw, truncate_length)
- # concatenate the sequences with special tokens; cls_token is added to the end in XLNet
- special_tokens = [[sep_token]] * len(tokens_trun) + [[cls_token]]
- tokens, segment_ids, _ = concat_sequences(tokens_trun, special_tokens)
- # convert the token to ids
- input_ids = vocab[tokens]
- valid_length = len(input_ids)
- if not is_test:
- return input_ids, valid_length, segment_ids, label
- else:
- return input_ids, valid_length, segment_ids
-
-
-def preprocess_data(_tokenizer,
- _task,
- batch_size,
- dev_batch_size,
- max_len,
- _vocab):
- """Train/eval Data preparation function."""
- label_dtype = 'int32' if _task.class_labels else 'float32'
- truncate_length = max_len - 3 if _task.is_pair else max_len - 2
- trans = partial(convert_examples_to_features,
- tokenizer=_tokenizer,
- truncate_length=truncate_length,
- cls_token=_vocab.cls_token,
- sep_token=_vocab.sep_token,
- class_labels=_task.class_labels,
- label_alias=_task.label_alias,
- vocab=_vocab)
-
- # data train
- # task.dataset_train returns (segment_name, dataset)
- train_tsv = _task.dataset_train()[1]
- data_train = list(map(trans, train_tsv))
- data_train = mx.gluon.data.SimpleDataset(data_train)
- data_train_len = data_train.transform(
- lambda _, valid_length, segment_ids, label: valid_length, lazy=False)
-
- # bucket sampler for training
- pad_val = _vocab[_vocab.padding_token]
- batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to), # input
- nlp.data.batchify.Stack(), # length
- nlp.data.batchify.Pad(axis=0, pad_val=4, round_to=args.round_to), # segment
- nlp.data.batchify.Stack(label_dtype)) # label
- batch_sampler = nlp.data.sampler.FixedBucketSampler(data_train_len,
- batch_size=batch_size,
- num_buckets=10,
- ratio=0,
- shuffle=True)
- # data loader for training
- loader_train = gluon.data.DataLoader(dataset=data_train,
- num_workers=4,
- batch_sampler=batch_sampler,
- batchify_fn=batchify_fn)
-
- # data dev. For MNLI, more than one dev set is available
- dev_tsv = _task.dataset_dev()
- dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv]
- loader_dev_list = []
- for segment, data in dev_tsv_list:
- data_dev = mx.gluon.data.SimpleDataset(list(map(trans, data)))
- loader_dev = mx.gluon.data.DataLoader(data_dev,
- batch_size=dev_batch_size,
- num_workers=4,
- shuffle=False,
- batchify_fn=batchify_fn)
- loader_dev_list.append((segment, loader_dev))
-
- # batchify for data test
- test_batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),
- nlp.data.batchify.Stack(),
- nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to))
-
- # transform for data test
- test_trans = partial(convert_examples_to_features,
- tokenizer=_tokenizer,
- truncate_length=max_len,
- cls_token=_vocab.cls_token,
- sep_token=_vocab.sep_token,
- class_labels=None,
- is_test=True,
- vocab=_vocab)
-
- # data test. For MNLI, more than one test set is available
- test_tsv = _task.dataset_test()
- test_tsv_list = test_tsv if isinstance(test_tsv, list) else [test_tsv]
- loader_test_list = []
- for segment, data in test_tsv_list:
- data_test = mx.gluon.data.SimpleDataset(list(map(test_trans, data)))
- loader_test = mx.gluon.data.DataLoader(data_test,
- batch_size=dev_batch_size,
- num_workers=4,
- shuffle=False,
- batchify_fn=test_batchify_fn)
- loader_test_list.append((segment, loader_test))
- return loader_train, loader_dev_list, loader_test_list, len(data_train)
-
-
-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-logging.captureWarnings(True)
-handler = logging.FileHandler('log_{0}.txt'.format(args.task_name))
-handler.setLevel(logging.INFO)
-handler2 = logging.StreamHandler()
-handler2.setLevel(logging.INFO)
-formatter = logging.Formatter(
- '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-handler.setFormatter(formatter)
-handler2.setFormatter(formatter)
-logger.addHandler(handler)
-logger.addHandler(handler2)
-logging.info(args)
-
-log_interval = args.log_interval * args.accumulate if args.accumulate else args.log_interval
-
-if args.accumulate:
- logging.info('Using gradient accumulation. Effective batch size = ' \
- 'batch_size * accumulate = %d', args.accumulate * args.batch_size)
-
-# random seed
-np.random.seed(args.seed)
-random.seed(args.seed)
-mx.random.seed(args.seed)
-
-num_workers = 0
-ctxs = [mx.cpu(0)] if not args.gpu else [mx.gpu(i) for i in range(args.gpu)]
-
-task = get_task(args.task_name)
-
-# model and loss
-if args.only_inference and not args.model_parameters:
- warnings.warn('model_parameters is not set. '
- 'Randomly initialized model will be used for inference.')
-
-get_pretrained = True
-
-get_model_params = {
- 'name': args.model_name,
- 'dataset_name': args.dataset,
- 'pretrained': get_pretrained,
- 'ctx': ctxs,
- 'use_decoder': False,
- 'dropout': args.dropout,
- 'attention_dropout': args.attention_dropout
-}
-
-xlnet_base, vocab, tokenizer = model.get_model(**get_model_params)
-# initialize the rest of the parameters
-initializer = mx.init.Normal(0.02)
-
-do_regression = not task.class_labels
-if do_regression:
- num_classes = 1
- loss_function = gluon.loss.L2Loss()
-else:
- num_classes = len(task.class_labels)
- loss_function = gluon.loss.SoftmaxCELoss()
-# reuse the XLNetClassifier class with num_classes=1 for regression
-model = XLNetClassifier(xlnet_base,
- units=xlnet_base._net._units,
- dropout=0.1,
- num_classes=num_classes)
-
-num_ctxes = len(ctxs)
-
-# initialize classifier
-if not args.model_parameters:
- model.classifier.initialize(init=initializer, ctx=ctxs)
- model.pooler.initialize(init=initializer, ctx=ctxs)
-
-# load checkpointing
-output_dir = args.output_dir
-
-if args.model_parameters:
- logging.info('loading model params from %s', args.model_parameters)
- nlp.utils.load_parameters(model,
- args.model_parameters,
- ctx=ctxs,
- cast_dtype=True)
-
-nlp.utils.mkdir(output_dir)
-
-logging.debug(model)
-model.hybridize(static_alloc=True)
-loss_function.hybridize(static_alloc=True)
-
-logging.info('processing dataset...')
-train_data, dev_data_list, test_data_list, num_train_examples = preprocess_data(
- tokenizer, task, args.batch_size, args.dev_batch_size, args.max_len, vocab)
-
-
-def test(loader_test, segment):
- """Inference function on the test dataset."""
- logging.info('Now we are doing testing on %s with %s.', segment, ctxs)
-
- tic = time.time()
- results = []
- for _, seqs in enumerate(loader_test):
- #input_ids, valid_length, segment_ids = seqs
- data_list = list(split_and_load(seqs, ctxs))
- out_list = []
- for splited_data in data_list:
- input_ids, valid_length, segment_ids = splited_data
- out = model(input_ids, segment_ids, valid_length=valid_length)
- out_list.append(out)
- out_list = np.vstack([o.asnumpy() for o in out_list])
- if not task.class_labels:
- # regression task
- for result in out_list.reshape(-1).tolist():
- results.append('{:.3f}'.format(result))
- else:
- # classification task
- out = out_list.reshape(-1, out_list.shape[-1])
- indices = out.argmax(axis=-1)
- for index in indices:
- results.append(task.class_labels[int(index)])
-
- mx.nd.waitall()
- toc = time.time()
- logging.info('Time cost=%.2fs, throughput=%.2f samples/s', toc - tic,
- args.dev_batch_size * len(loader_test) / (toc - tic))
- # write result to a file.
- segment = segment.replace('_mismatched', '-mm')
- segment = segment.replace('_matched', '-m')
- segment = segment.replace('SST', 'SST-2')
- filename = args.task_name + segment.replace('test', '') + '.tsv'
- test_path = os.path.join(args.output_dir, filename)
- with io.open(test_path, 'w', encoding='utf-8') as f:
- f.write(u'index\tprediction\n')
- for i, pred in enumerate(results):
- f.write(u'%d\t%s\n' % (i, str(pred)))
-
-
-def log_metric(metric, is_training=True):
- prefix = 'training' if is_training else 'validation'
- metric_nm, metric_val = metric.get()
- if not isinstance(metric_nm, list):
- metric_nm, metric_val = [metric_nm], [metric_val]
- logging_str = prefix + ' metrics:' + ','.join(
- [i + ':%.4f' for i in metric_nm])
- logging.info(logging_str, *metric_val)
- return metric_nm, metric_val
-
-
-def log_train(batch_id, batch_num, step_loss, _log_interval, epoch_id,
- learning_rate):
- """Generate and print out the log message for training. """
- train_str = '[Epoch %d Batch %d/%d] loss=%.4f, lr=%.7f'
- logging.info(train_str, epoch_id + 1, batch_id + 1, batch_num,
- step_loss / _log_interval, learning_rate)
-
-
-def log_eval(batch_id, batch_num, step_loss, _log_interval):
- """Generate and print out the log message for inference. """
- eval_str = '[Batch %d/%d] loss=%.4f'
- logging.info(eval_str, batch_id + 1, batch_num, step_loss / _log_interval)
-
-
-def train(metric):
- """Training function."""
- if not args.only_inference:
- logging.info('Now we are doing XLNet classification training on %s!',
- ctxs)
-
- all_model_params = model.collect_params()
- optimizer_params = {
- 'learning_rate': args.lr,
- 'epsilon': args.epsilon,
- 'wd': 0
- }
- trainer = gluon.Trainer(all_model_params,
- args.optimizer,
- optimizer_params,
- update_on_kvstore=False)
-
- step_size = args.batch_size * args.accumulate if args.accumulate else args.batch_size
- num_train_steps = int(num_train_examples / step_size * args.epochs)
- epoch_number = args.epochs
- if args.training_steps:
- num_train_steps = args.training_steps
- epoch_number = 9999
- logging.info('training steps=%d', num_train_steps)
- warmup_ratio = args.warmup_ratio
- num_warmup_steps = int(num_train_steps * warmup_ratio)
- step_num = 0
-
- # Do not apply weight decay on LayerNorm and bias terms
- for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
- v.wd_mult = 0.0
- # Collect differentiable parameters
- params = [p for p in all_model_params.values() if p.grad_req != 'null']
-
- # Set grad_req if gradient accumulation is required
- if args.accumulate and args.accumulate > 1:
- for p in params:
- p.grad_req = 'add'
- # track best eval score
- metric_history = []
- best_metric = None
- patience = args.early_stop
-
- tic = time.time()
- finish_flag = False
- for epoch_id in range(epoch_number):
- if args.early_stop and patience == 0:
- logging.info('Early stopping at epoch %d', epoch_id)
- break
- if finish_flag:
- break
- if not args.only_inference:
- metric.reset()
- step_loss = 0
- tic = time.time()
- all_model_params.zero_grad()
- for batch_id, seqs in enumerate(train_data):
- new_lr = args.lr
- # learning rate schedule
- if step_num < num_warmup_steps:
- new_lr = args.lr * step_num / num_warmup_steps
- elif args.lr_decay == 'linear':
- non_warmup_steps = step_num - num_warmup_steps
- offset = non_warmup_steps / (num_train_steps -
- num_warmup_steps)
- new_lr = max(0, args.lr - offset * args.lr)
- trainer.set_learning_rate(new_lr)
- batch_loss = []
- # forward and backward
- with mx.autograd.record():
- data_list = list(split_and_load(seqs, ctxs))
- for splited_data in data_list:
- input_ids, valid_length, segment_ids, label = splited_data
- out = model(input_ids,
- segment_ids,
- valid_length=valid_length)
- ls = loss_function(out, label).mean() / len(ctxs)
- batch_loss.append(ls)
- if args.accumulate:
- ls = ls / args.accumulate
- ls.backward()
- # update
- if not args.accumulate or (batch_id +
- 1) % args.accumulate == 0:
- trainer.allreduce_grads()
- nlp.utils.clip_grad_global_norm(params, 1)
- trainer.update(args.accumulate if args.accumulate else 1,
- ignore_stale_grad=True)
- step_num += 1
- if args.accumulate and args.accumulate > 1:
- # set grad to zero for gradient accumulation
- all_model_params.zero_grad()
- if batch_id == 0 and epoch_id == 0:
- toc = time.time()
- logging.info(
- 'Time cost for the first forward-backward =%.2fs',
- toc - tic)
- batch_loss = sum([ls.asscalar() for ls in batch_loss])
- step_loss += batch_loss
- if (batch_id + 1) % (args.log_interval) == 0:
- log_train(batch_id, len(train_data), step_loss,
- args.log_interval, epoch_id,
- trainer.learning_rate)
- step_loss = 0
- if step_num >= num_train_steps:
- logging.info('Finish training step: %d', step_num)
- finish_flag = True
- break
-
- mx.nd.waitall()
-
- # inference on dev data
- for segment, dev_data in dev_data_list:
- metric_nm, metric_val = evaluate(dev_data, metric, segment)
- if best_metric is None or metric_val >= best_metric:
- best_metric = metric_val
- patience = args.early_stop
- else:
- if args.early_stop is not None:
- patience -= 1
- metric_history.append((epoch_id, metric_nm, metric_val))
-
- if not args.only_inference:
- # save params
- ckpt_name = 'model_xlnet_{0}_{1}.params'.format(
- args.task_name, epoch_id)
- params_saved = os.path.join(output_dir, ckpt_name)
- nlp.utils.save_parameters(model, params_saved)
- logging.info('params saved in: %s', params_saved)
- toc = time.time()
- logging.info('Time cost=%.2fs', toc - tic)
- tic = toc
-
- if not args.only_inference:
- # we choose the best model based on metric[0],
- # assuming higher score stands for better model quality
- metric_history.sort(key=lambda x: x[2][0], reverse=True)
- epoch_id, metric_nm, metric_val = metric_history[0]
- ckpt_name = 'model_xlnet_{0}_{1}.params'.format(
- args.task_name, epoch_id)
- params_saved = os.path.join(output_dir, ckpt_name)
- nlp.utils.load_parameters(model, params_saved)
- metric_str = 'Best model at epoch {}. Validation metrics:'.format(
- epoch_id + 1)
- metric_str += ','.join([i + ':%.4f' for i in metric_nm])
- logging.info(metric_str, *metric_val)
-
- # inference on test data
- for segment, test_data in test_data_list:
- test(test_data, segment)
- print('finish test!')
-
-
-def evaluate(loader_dev, metric, segment):
- """Evaluate the model on validation dataset."""
- logging.info('Now we are doing evaluation on %s with %s.', segment, ctxs)
- metric.reset()
- step_loss = 0
- tic = time.time()
- out_list = []
- label_list = []
- for batch_id, seqs in enumerate(loader_dev):
- batch_loss = []
- # forward and backward
- data_list = list(split_and_load(seqs, ctxs))
- for splited_data in data_list:
- input_ids, valid_length, segment_ids, label = splited_data
- out = model(input_ids, segment_ids, valid_length=valid_length)
- batch_loss.append(loss_function(out, label).mean() / len(ctxs))
- if not do_regression:
- label = label.reshape((-1))
- out_list.append(out.as_in_context(mx.cpu(0)))
- label_list.append(label.as_in_context(mx.cpu(0)))
-
- batch_loss = sum([ls.asscalar() for ls in batch_loss])
- step_loss += batch_loss
- if (batch_id + 1) % (args.log_interval) == 0:
- log_eval(batch_id, len(loader_dev), step_loss, args.log_interval)
- step_loss = 0
-
- label_list = mx.nd.concat(*label_list, dim=0)
- out_list = mx.nd.concat(*out_list, dim=0)
- metric.update([label_list], [out_list])
- metric_nm, metric_val = log_metric(metric, is_training=False)
- mx.nd.waitall()
- toc = time.time()
- logging.info('Time cost=%.2fs, throughput=%.2f samples/s', toc - tic,
- args.dev_batch_size * len(loader_dev) / (toc - tic))
- return metric_nm, metric_val
-
-
-if __name__ == '__main__':
- train(task.metrics)
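The learning-rate schedule embedded in `train()` above (linear warmup for `warmup_ratio * num_train_steps` steps, then linear decay to zero, assuming `--lr_decay linear`) can be read as a standalone function; a minimal sketch with hypothetical names:

def warmup_linear_decay(step_num, base_lr, num_train_steps, warmup_ratio):
    # Linear warmup followed by linear decay, mirroring the logic in train().
    # step_num counts completed optimizer updates, as in the script above.
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    if step_num < num_warmup_steps:
        return base_lr * step_num / num_warmup_steps
    offset = (step_num - num_warmup_steps) / (num_train_steps - num_warmup_steps)
    return max(0.0, base_lr - offset * base_lr)

# e.g. lr halfway through training with 10% warmup
lr = warmup_linear_decay(step_num=500, base_lr=3e-5, num_train_steps=1000, warmup_ratio=0.1)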
diff --git a/scripts/language_model/run_squad.py b/scripts/language_model/run_squad.py
deleted file mode 100644
index ab57edf7c4..0000000000
--- a/scripts/language_model/run_squad.py
+++ /dev/null
@@ -1,721 +0,0 @@
-"""
-Question Answering with XLNet
-"""
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-
-import os
-import time
-import argparse
-import random
-import logging
-import warnings
-import json
-import collections
-import pickle
-import sys
-import itertools
-import subprocess
-import multiprocessing as mp
-from functools import partial
-import numpy as np
-import mxnet as mx
-import gluonnlp as nlp
-from gluonnlp.data import SQuAD
-from gluonnlp.data.bert.glue import concat_sequences
-from gluonnlp.data.bert.squad import get_doc_spans, \
- check_is_max_context, convert_squad_examples, align_position2doc_spans
-from gluonnlp.data.xlnet.squad import lcs_match, convert_index
-from model.qa import XLNetForQA
-from transformer import model
-from xlnet_qa_evaluate import predict_extended
-parser = argparse.ArgumentParser(description='XLNet QA example. '
- 'We fine-tune the XLNet model on the SQuAD dataset.')
-
-# I/O configuration
-parser.add_argument('--sentencepiece', type=str, default=None,
- help='Path to the sentencepiece .model file for both tokenization and vocab.')
-parser.add_argument('--pretrained_xlnet_parameters', type=str, default=None,
- help='Pre-trained XLNet model parameter file. default is None')
-parser.add_argument('--load_pickle', action='store_true',
- help='Whether to do data preprocessing or to load features from a pickled file')
-parser.add_argument('--dev_dataset_file', default='./output_dir/out.dev', type=str,
- help='Path to dev data features')
-parser.add_argument('--train_dataset_file', default='./output_dir/out.train', type=str,
- help='Path to train data features')
-parser.add_argument('--model_parameters', type=str, default=None, help='Model parameter file')
-parser.add_argument(
- '--output_dir', type=str, default='./output_dir',
- help='The output directory where the model params will be written.'
- ' default is ./output_dir')
-
-# Training configuration
-parser.add_argument('--seed', type=int, default=3, help='Random seed')
-parser.add_argument('--version_2', action='store_true', help='Whether use SQuAD v2.0 dataset')
-parser.add_argument('--model', type=str, default='xlnet_cased_l12_h768_a12',
- choices=['xlnet_cased_l24_h1024_a16', 'xlnet_cased_l12_h768_a12'],
- help='The name of pre-trained XLNet model to fine-tune')
-parser.add_argument('--dataset', type=str, default='126gb', choices=['126gb'],
- help='The dataset the XLNet model was pre-trained on. Currently only 126gb is available')
-parser.add_argument(
- '--uncased', action='store_true', help=
- 'if set, inputs are converted to lower case. Up to 01/04/2020, all released models are cased')
-parser.add_argument('--gpu', type=int, default=None,
- help='Number of gpus to use for finetuning. CPU is used if not set.')
-parser.add_argument('--log_interval', type=int, default=10, help='report interval. default is 10')
-parser.add_argument('--debug', action='store_true',
- help='Run the example in test mode for sanity checks')
-parser.add_argument('--only_predict', action='store_true', help='Whether to predict only.')
-
-# Hyperparameters
-parser.add_argument('--epochs', type=int, default=3, help='number of epochs, default is 3')
-parser.add_argument(
- '--training_steps', type=int, help='training steps. Note that epochs will be ignored '
- 'if training steps are set')
-
-parser.add_argument('--batch_size', type=int, default=32,
- help='Batch size. Number of examples per gpu in a minibatch. default is 32')
-
-parser.add_argument('--test_batch_size', type=int, default=24,
- help='Test batch size. default is 24')
-
-parser.add_argument('--optimizer', type=str, default='bertadam',
- help='optimization algorithm. default is bertadam')
-
-parser.add_argument(
- '--accumulate', type=int, default=None, help='The number of batches for '
- 'gradient accumulation to simulate a large batch size. Default is None')
-
-parser.add_argument('--lr', type=float, default=3e-5,
- help='Initial learning rate. default is 3e-5')
-
-parser.add_argument(
- '--warmup_ratio', type=float, default=0,
- help='ratio of warmup steps that linearly increase learning rate from '
- '0 to target learning rate. default is 0')
-parser.add_argument('--layerwise_decay', type=float, default=0.75, help='Layer-wise lr decay')
-parser.add_argument('--wd', type=float, default=0.01, help='weight decay')
-parser.add_argument('--dropout', type=float, default=0.1, help='dropout')
-parser.add_argument('--attention_dropout', type=float, default=0.1, help='attention dropout')
-
-# Data pre/post processing
-parser.add_argument(
- '--max_seq_length', type=int, default=512,
- help='The maximum total input sequence length after SentencePiece tokenization. '
- 'Sequences longer than this will be truncated, and sequences shorter '
- 'than this will be padded. default is 512')
-
-parser.add_argument(
- '--doc_stride', type=int, default=128,
- help='When splitting up a long document into chunks, how much stride to '
- 'take between chunks. default is 128')
-
-parser.add_argument(
- '--max_query_length', type=int, default=64,
- help='The maximum number of tokens for the question. Questions longer than '
- 'this will be truncated to this length. default is 64')
-
-parser.add_argument(
- '--round_to', type=int, default=None,
- help='The length of padded sequences will be rounded up to be a multiple of this argument. '
- 'When round_to is set to 8, training throughput may increase for mixed precision '
- 'training on GPUs with tensor cores.')
-
-parser.add_argument('--start_top_n', type=int, default=5,
- help='Number of start-position candidates')
-parser.add_argument('--end_top_n', type=int, default=5,
- help='Number of end-position candidates corresponding '
- 'to a start position')
-parser.add_argument('--n_best_size', type=int, default=5, help='top N results written to file')
-parser.add_argument(
- '--max_answer_length', type=int, default=64,
- help='The maximum length of an answer that can be generated. This is needed '
- 'because the start and end predictions are not conditioned on one another.'
- ' default is 64')
-parser.add_argument('--num_workers', type=int, default=4,
- help='Number of workers used for data preprocessing')
-parser.add_argument(
- '--null_score_diff_threshold', type=float, default=0.0,
- help='If null_score - best_non_null is greater than the threshold predict null. '
- 'Typical values are between -1.0 and -5.0. default is 0.0. '
- 'Note that a best value can be automatically found by the evaluation script')
-
-args = parser.parse_args()
-
-# random seed
-np.random.seed(args.seed)
-random.seed(args.seed)
-mx.random.seed(args.seed)
-
-if not os.path.exists(args.output_dir):
- os.mkdir(args.output_dir)
-
-# set the logger
-log = logging.getLogger('gluonnlp')
-log.setLevel(logging.DEBUG)
-formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s',
- datefmt='%H:%M:%S')
-fh = logging.FileHandler(os.path.join(args.output_dir, 'finetune_squad.log'))
-fh.setLevel(logging.INFO)
-fh.setFormatter(formatter)
-console = logging.StreamHandler()
-console.setLevel(logging.INFO)
-console.setFormatter(formatter)
-log.addHandler(console)
-log.addHandler(fh)
-
-log.info(args)
-
-pretrained_xlnet_parameters = args.pretrained_xlnet_parameters
-if pretrained_xlnet_parameters and args.model_parameters:
- raise ValueError('Cannot provide both pre-trained XLNet parameters and '
- 'XLNetForQA model parameters.')
-
-ctx = [mx.cpu(0)] if not args.gpu else [mx.gpu(i) for i in range(args.gpu)]
-
-log_interval = args.log_interval * args.accumulate if args.accumulate else args.log_interval
-if args.accumulate:
- log.info('Using gradient accumulation. Effective batch size = %d',
- args.accumulate * args.batch_size)
-if args.max_seq_length <= args.max_query_length + 3:
- raise ValueError('The max_seq_length (%d) must be greater than max_query_length '
- '(%d) + 3' % (args.max_seq_length, args.max_query_length))
-
-get_pretrained = True
-
-get_model_params = {
- 'name': args.model,
- 'dataset_name': args.dataset,
- 'pretrained': get_pretrained,
- 'ctx': ctx,
- 'use_decoder': False,
- 'dropout': args.dropout,
- 'attention_dropout': args.attention_dropout
-}
-
-# model, vocabulary and tokenizer
-xlnet_base, vocab, tokenizer = model.get_model(**get_model_params)
-
-batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Stack('int32'), # example_id
- nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], dtype='int32',
- round_to=args.round_to), # input_ids
- nlp.data.batchify.Pad(axis=0, pad_val=3, dtype='int32', round_to=args.round_to), # segment_ids
- nlp.data.batchify.Stack('float32'), # valid_length
- nlp.data.batchify.Pad(axis=0, pad_val=1, round_to=args.round_to), # p_mask
- nlp.data.batchify.Stack('float32'), # start_position
- nlp.data.batchify.Stack('float32'), # end_position
- nlp.data.batchify.Stack('float32')) # is_impossible
-
-if pretrained_xlnet_parameters:
- # only load XLnetModel parameters
- nlp.utils.load_parameters(xlnet_base, pretrained_xlnet_parameters, ctx=ctx, ignore_extra=True,
- cast_dtype=True)
-
-units = xlnet_base._net._units
-net = XLNetForQA(xlnet_base=xlnet_base, start_top_n=args.start_top_n, end_top_n=args.end_top_n,
- units=units)
-
-net_eval = XLNetForQA(xlnet_base=xlnet_base, start_top_n=args.start_top_n,
- end_top_n=args.end_top_n, units=units, is_eval=True,
- params=net.collect_params())
-
-initializer = mx.init.Normal(0.02)
-
-if args.model_parameters:
- # load complete XLNetForQA parameters
- nlp.utils.load_parameters(net, args.model_parameters, ctx=ctx, cast_dtype=True)
-else:
- net.start_logits.initialize(init=initializer, ctx=ctx)
- net.end_logits.initialize(init=initializer, ctx=ctx)
- net.answer_class.initialize(init=initializer, ctx=ctx)
-
-net.hybridize(static_alloc=True)
-net_eval.hybridize(static_alloc=True)
-
-SquadXLNetFeautre = collections.namedtuple('SquadXLNetFeautre', [
- 'example_id', 'qas_id', 'valid_length', 'tokens', 'tok_start_to_orig_index',
- 'tok_end_to_orig_index', 'token_is_max_context', 'input_ids', 'p_mask', 'segment_ids',
- 'start_position', 'end_position', 'paragraph_text', 'paragraph_len', 'is_impossible'
-])
-
-
-def convert_examples_to_features(example, tokenizer=None, cls_token=None, sep_token=None,
- vocab=None, max_seq_length=384, doc_stride=128,
- max_query_length=64, is_training=True):
- """convert the examples to the XLNet features"""
- query_tokenized = tokenizer(example.question_text)[:max_query_length]
- #tokenize paragraph and get start/end position of the answer in tokenized paragraph
- paragraph_tokenized = tokenizer(example.paragraph_text)
-
- chartok_to_tok_index = [] # char to its corresponding token's index
- tok_start_to_chartok_index = [] # token index to its first character's index
- tok_end_to_chartok_index = [] # token index to its last character's index
- char_cnt = 0
- for i, token in enumerate(paragraph_tokenized):
- chartok_to_tok_index.extend([i] * len(token))
- tok_start_to_chartok_index.append(char_cnt)
- char_cnt += len(token)
- tok_end_to_chartok_index.append(char_cnt - 1)
-
- tok_cat_text = ''.join(paragraph_tokenized).replace(u'▁', ' ')
-
- # XLNet takes a more involved strategy to match the original text
- # with the tokenized tokens:
- # get the LCS matching between the original text and the token-concatenated text.
- n, m = len(example.paragraph_text), len(tok_cat_text)
- max_dist = abs(n - m) + 5
- for _ in range(2):
- f, g = lcs_match(max_dist, example.paragraph_text, tok_cat_text)
- if f[n - 1, m - 1] > 0.8 * n:
- break
- max_dist *= 2
-
- # Get the mapping from original text/tokenized text to tokenized text/original text
- orig_to_chartok_index = [None] * n
- chartok_to_orig_index = [None] * m
- i, j = n - 1, m - 1
- while i >= 0 and j >= 0:
- if (i, j) not in g:
- break
- if g[(i, j)] == 2:
- orig_to_chartok_index[i] = j
- chartok_to_orig_index[j] = i
- i, j = i - 1, j - 1
- elif g[(i, j)] == 1:
- j = j - 1
- else:
- i = i - 1
-
- # get start/end mapping
- tok_start_to_orig_index = []
- tok_end_to_orig_index = []
- for i in range(len(paragraph_tokenized)): # for each token in the tokenized paragraph
- start_chartok_pos = tok_start_to_chartok_index[i] # first character's index in the token-concatenated text
- end_chartok_pos = tok_end_to_chartok_index[i] # last character's index in the token-concatenated text
- start_orig_pos = convert_index(chartok_to_orig_index, start_chartok_pos, n, is_start=True)
- end_orig_pos = convert_index(chartok_to_orig_index, end_chartok_pos, m, is_start=False)
-
- tok_start_to_orig_index.append(start_orig_pos)
- tok_end_to_orig_index.append(end_orig_pos)
-
- tok_start_position, tok_end_position = -1, -1
-
- # get mapped start/end position
- if is_training and not example.is_impossible:
- start_chartok_pos = convert_index(orig_to_chartok_index, example.start_offset,
- is_start=True)
- tok_start_position = chartok_to_tok_index[start_chartok_pos]
-
- end_chartok_pos = convert_index(orig_to_chartok_index, example.end_offset, is_start=False)
- tok_end_position = chartok_to_tok_index[end_chartok_pos]
- assert tok_start_position <= tok_end_position
-
- # get doc spans using sliding window
- doc_spans, doc_spans_indices = get_doc_spans(paragraph_tokenized,
- max_seq_length - len(query_tokenized) - 3,
- doc_stride)
-
- # record whether the tokens in a docspan have max context
- token_is_max_context = [{
- p: check_is_max_context(doc_spans_indices, i, p + doc_spans_indices[i][0])
- for p in range(len(doc_span))
- } for (i, doc_span) in enumerate(doc_spans)]
-
- # get token -> origin text mapping
- cur_tok_start_to_orig_index = [[tok_start_to_orig_index[p + st] for p in range(len(doc_span))]
- for doc_span, (st, ed) in zip(doc_spans, doc_spans_indices)]
- cur_tok_end_to_orig_index = [[tok_end_to_orig_index[p + st] for p in range(len(doc_span))]
- for doc_span, (st, ed) in zip(doc_spans, doc_spans_indices)]
-
- # get sequence features: tokens, segment_ids, p_masks
- seq_features = [
- concat_sequences([doc_span, query_tokenized], [[sep_token]] * 2 + [[cls_token]],
- [[0] * len(doc_span), [1] * len(query_tokenized)], [[1], [1], [0]])
- for doc_span in doc_spans
- ]
-
- # get the start/end positions aligned to doc spans. If is_impossible or position out of span
- # set position to cls_index, i.e., last token in the sequence.
- if not example.is_impossible:
- positions = [
- align_position2doc_spans([tok_start_position, tok_end_position], doc_idx, offset=0,
- default_value=len(seq[0]) - 1)
- for (doc_idx, seq) in zip(doc_spans_indices, seq_features)
- ]
- else:
- positions = [(len(seq_feature[0]) - 1, len(seq_feature[0]) - 1)
- for seq_feature in seq_features]
-
- features = [
- SquadXLNetFeautre(example_id=example.example_id, qas_id=example.qas_id,
- tok_start_to_orig_index=t2st, tok_end_to_orig_index=t2ed,
- valid_length=len(tokens), tokens=tokens, token_is_max_context=is_max,
- input_ids=vocab[tokens], p_mask=p_mask, segment_ids=segment_ids,
- start_position=start, end_position=end,
- paragraph_text=example.paragraph_text, paragraph_len=len(tokens),
- is_impossible=(start == len(tokens) - 1))
- for (tokens, segment_ids, p_mask), (
- start,
- end), is_max, t2st, t2ed in zip(seq_features, positions, token_is_max_context,
- cur_tok_start_to_orig_index, cur_tok_end_to_orig_index)
- ]
- return features
-
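The three index lists built at the top of `convert_examples_to_features` only record, for every character of the token-concatenated paragraph, which token it belongs to, and for every token, its first and last character offset. A toy walk-through of that bookkeeping, with a made-up SentencePiece-style tokenization:

# toy illustration of the chartok/tok index bookkeeping above
paragraph_tokenized = ['▁My', '▁dog', '▁barks']   # hypothetical tokenization
chartok_to_tok_index = []
tok_start_to_chartok_index = []
tok_end_to_chartok_index = []
char_cnt = 0
for i, token in enumerate(paragraph_tokenized):
    chartok_to_tok_index.extend([i] * len(token))
    tok_start_to_chartok_index.append(char_cnt)
    char_cnt += len(token)
    tok_end_to_chartok_index.append(char_cnt - 1)

# chartok_to_tok_index       == [0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]
# tok_start_to_chartok_index == [0, 3, 7]
# tok_end_to_chartok_index   == [2, 6, 12]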
-
-def preprocess_dataset(tokenizer, dataset, vocab=None, max_seq_length=384, doc_stride=128,
- max_query_length=64, num_workers=16, load_from_pickle=False,
- feature_file=None, is_training=True):
- """Loads a dataset into features"""
- vocab = tokenizer.vocab if vocab is None else vocab
- trans = partial(convert_examples_to_features, tokenizer=tokenizer, cls_token=vocab.cls_token,
- sep_token=vocab.sep_token, vocab=vocab, max_seq_length=max_seq_length,
- doc_stride=doc_stride, max_query_length=max_query_length)
- pool = mp.Pool(num_workers)
- start = time.time()
- if not load_from_pickle:
- example_trans = partial(convert_squad_examples, is_training=is_training)
- # convert the raw dataset into raw features
- examples = pool.map(example_trans, dataset)
- raw_features = list(map(trans, examples)) #pool.map(trans, examples)
- if feature_file:
- with open(feature_file, 'wb') as file:
- pickle.dump(raw_features, file)
- else:
- assert feature_file, 'feature file should be provided.'
- with open(feature_file, 'rb') as file:
- raw_features = pickle.load(file)
-
- end = time.time()
- pool.close()
- log.info('Done! Transforming the dataset took %.2f seconds.', (end - start))
- return raw_features
-
-
-def convert_full_features_to_input_features(raw_features):
- """convert the full features into the input features"""
- data_features = mx.gluon.data.SimpleDataset(list(itertools.chain.from_iterable(raw_features)))
- data_features = data_features.transform(lambda *example: (
- example[0], # example_id
- example[7], # inputs_id
- example[9], # segment_ids
- example[2], # valid_length,
- example[8], # p_mask
- example[10], # start_position,
- example[11], # end_position
- example[14])) # is_impossible
- return data_features
-
-
-def split_array(arr, num_of_splits):
- """split an array into equal pieces"""
- # TODO Replace this function with gluon.utils.split_data() once targeting MXNet 1.7
- size = arr.shape[0]
- if size < num_of_splits:
- return [arr[i:i + 1] for i in range(size)]
- slice_len, rest = divmod(size, num_of_splits)
- div_points = [0] + [(slice_len * index + min(index, rest) + slice_len + (index < rest))
- for index in range(num_of_splits)]
- slices = [arr[div_points[i]:div_points[i + 1]] for i in range(num_of_splits)]
- return slices
-
-
-def split_and_load(arrs, _ctxs):
- """split and load arrays to a list of contexts"""
- # TODO Replace split_array() with gluon.utils.split_data() once targeting MXNet 1.7
- assert isinstance(arrs, (list, tuple))
- # split and load
- loaded_arrs = [[i.as_in_context(ctx) for i, ctx in zip(split_array(arr, len(_ctxs)), _ctxs)]
- for arr in arrs]
- return zip(*loaded_arrs)
-
-
-def _apply_gradient_decay():
- """apply layer-wise gradient decay.
-
- Note that the description in origin paper about layer-wise learning rate decay
- is inaccurate. According to their implementation, they are actually performing
- layer-wise gradient decay. Gradient decay and learning rate decay could be the
- same by using standard SGD, but different by using Adaptive optimizer(e.g., Adam).
- """
- parameter_not_included = ['seg_emb', 'query_key_bias', 'query_emb_bias', 'query_seg_bias']
- num_layers = len(xlnet_base._net.transformer_cells)
- for (i, layer_parameters) in enumerate(xlnet_base._net.transformer_cells):
- layer_params = layer_parameters.collect_params()
- for key, value in layer_params.items():
- skip = False
- for pn in parameter_not_included:
- if pn in key:
- skip = True
- if skip:
- continue
- if value.grad_req != 'null':
- for arr in value.list_grad():
- arr *= args.layerwise_decay**(num_layers - i - 1)
-
-
-def train():
- """Training function."""
- segment = 'train'
- log.info('Loading %s data...', segment)
- # Note that for XLNet, the authors always use squad2 dataset for training
- train_data = SQuAD(segment, version='2.0')
- if args.debug:
- sampled_data = [train_data[i] for i in range(100)]
- train_data = mx.gluon.data.SimpleDataset(sampled_data)
- log.info('Number of records in Train data: %s', len(train_data))
-
- train_data_features = preprocess_dataset(
- tokenizer, train_data, vocab=vocab, max_seq_length=args.max_seq_length,
- doc_stride=args.doc_stride, num_workers=args.num_workers,
- max_query_length=args.max_query_length, load_from_pickle=args.load_pickle,
- feature_file=args.train_dataset_file)
-
- train_data_input = convert_full_features_to_input_features(train_data_features)
- log.info('The number of examples after preprocessing: %s', len(train_data_input))
-
- train_dataloader = mx.gluon.data.DataLoader(train_data_input, batchify_fn=batchify_fn,
- batch_size=args.batch_size, num_workers=4,
- shuffle=True)
-
- optimizer_params = {'learning_rate': args.lr, 'wd': args.wd}
- try:
- trainer = mx.gluon.Trainer(net.collect_params(), args.optimizer, optimizer_params,
- update_on_kvstore=False)
- except ValueError as _:
- warnings.warn('AdamW optimizer is not found. Please consider upgrading to '
- 'mxnet>=1.5.0. The BERTAdam optimizer is used instead.')
- trainer = mx.gluon.Trainer(net.collect_params(), 'bertadam', optimizer_params,
- update_on_kvstore=False)
-
- num_train_examples = len(train_data_input)
- step_size = args.batch_size * args.accumulate if args.accumulate else args.batch_size
- num_train_steps = int(num_train_examples / step_size * args.epochs)
- epoch_number = args.epochs
- if args.training_steps:
- num_train_steps = args.training_steps
- epoch_number = 100000
-
- log.info('training steps=%d', num_train_steps)
- num_warmup_steps = int(num_train_steps * args.warmup_ratio)
- step_num = 0
-
- def set_new_lr(step_num, batch_id):
- """set new learning rate"""
- # set grad to zero for gradient accumulation
- if args.accumulate:
- if batch_id % args.accumulate == 0:
- net.collect_params().zero_grad()
- step_num += 1
- else:
- step_num += 1
- # learning rate schedule: linear warmup for the first num_warmup_steps steps,
- # then linear decay of the learning rate towards 0 at num_train_steps
- if step_num < num_warmup_steps:
- new_lr = args.lr * step_num / num_warmup_steps
- else:
- offset = (step_num - num_warmup_steps) * args.lr / \
- (num_train_steps - num_warmup_steps)
- new_lr = args.lr - offset
- trainer.set_learning_rate(new_lr)
- return step_num
-
- # Do not apply weight decay on LayerNorm and bias terms
- for _, v in net.collect_params('.*beta|.*gamma|.*bias').items():
- v.wd_mult = 0.0
- # Collect differentiable parameters
- params = [p for p in net.collect_params().values() if p.grad_req != 'null']
- # Set grad_req if gradient accumulation is required
- if args.accumulate:
- for p in params:
- p.grad_req = 'add'
-
- epoch_tic = time.time()
- total_num = 0
- log_num = 0
- finish_flag = False
- for epoch_id in range(epoch_number):
- step_loss = 0.0
- step_loss_span = 0
- step_loss_cls = 0
- tic = time.time()
- if finish_flag:
- break
- for batch_id, data in enumerate(train_dataloader):
- # set new lr
- step_num = set_new_lr(step_num, batch_id)
- data_list = list(split_and_load(data, ctx))
- # forward and backward
- batch_loss = []
- batch_loss_sep = []
- with mx.autograd.record():
- for splited_data in data_list:
- _, inputs, token_types, valid_length, p_mask, start_label, end_label, is_impossible = splited_data # pylint: disable=line-too-long
- valid_length = valid_length.astype('float32')
- log_num += len(inputs)
- total_num += len(inputs)
- out_sep, out = net(
- inputs,
- token_types,
- valid_length,
- [start_label, end_label],
- p_mask=p_mask, # pylint: disable=line-too-long
- is_impossible=is_impossible)
- ls = out.mean() / len(ctx)
- batch_loss_sep.append(out_sep)
- batch_loss.append(ls)
- if args.accumulate:
- ls = ls / args.accumulate
- ls.backward()
- # update
- if not args.accumulate or (batch_id + 1) % args.accumulate == 0:
- trainer.allreduce_grads()
- nlp.utils.clip_grad_global_norm(params, 1)
- _apply_gradient_decay()
- trainer.update(1, ignore_stale_grad=True)
-
- step_loss_sep_tmp = np.array(
- [[span_ls.mean().asscalar(),
- cls_ls.mean().asscalar()] for span_ls, cls_ls in batch_loss_sep])
- step_loss_sep_tmp = list(np.sum(step_loss_sep_tmp, axis=0))
- step_loss_span += step_loss_sep_tmp[0] / len(ctx)
- step_loss_cls += step_loss_sep_tmp[1] / len(ctx)
-
- step_loss += sum([ls.asscalar() for ls in batch_loss])
- if (batch_id + 1) % log_interval == 0:
- toc = time.time()
- log.info(
- 'Epoch: %d, Batch: %d/%d, Loss=%.4f, lr=%.7f '
- 'Time cost=%.1f Throughput=%.2f samples/s', epoch_id + 1, batch_id + 1,
- len(train_dataloader), step_loss / log_interval, trainer.learning_rate,
- toc - tic, log_num / (toc - tic))
- log.info('span_loss: %.4f, cls_loss: %.4f', step_loss_span / log_interval,
- step_loss_cls / log_interval)
-
- tic = time.time()
- step_loss = 0.0
- step_loss_span = 0
- step_loss_cls = 0
- log_num = 0
- if step_num >= num_train_steps:
- logging.info('Finish training step: %d', step_num)
- finish_flag = True
- break
- epoch_toc = time.time()
- log.info('Time cost=%.2f s, Throughput=%.2f samples/s', epoch_toc - epoch_tic,
- total_num / (epoch_toc - epoch_tic))
- version_prefix = 'squad2' if args.version_2 else 'squad1'
- ckpt_name = 'model_{}_{}_{}.params'.format(args.model, version_prefix, epoch_id + 1)
- params_saved = os.path.join(args.output_dir, ckpt_name)
- nlp.utils.save_parameters(net, params_saved)
- log.info('params saved in: %s', params_saved)
-
-
-RawResultExtended = collections.namedtuple(
- 'RawResultExtended',
- ['start_top_log_probs', 'start_top_index', 'end_top_log_probs', 'end_top_index', 'cls_logits'])
-
-
-def evaluate():
- """Evaluate the model on validation dataset.
- """
- log.info('Loading dev data...')
- if args.version_2:
- dev_data = SQuAD('dev', version='2.0')
- else:
- dev_data = SQuAD('dev', version='1.1')
- (_, _), (data_file_name, _) \
- = dev_data._data_file[dev_data._version][dev_data._segment]
- dev_data_path = os.path.join(dev_data._root, data_file_name)
-
- if args.debug:
- sampled_data = [dev_data[0], dev_data[1], dev_data[2]]
- dev_data = mx.gluon.data.SimpleDataset(sampled_data)
- log.info('Number of records in dev data: %d', len(dev_data))
-
- dev_data_features = preprocess_dataset(
- tokenizer, dev_data, vocab=vocab, max_seq_length=args.max_seq_length,
- doc_stride=args.doc_stride, num_workers=args.num_workers,
- max_query_length=args.max_query_length, load_from_pickle=args.load_pickle,
- feature_file=args.dev_dataset_file)
-
- dev_data_input = convert_full_features_to_input_features(dev_data_features)
- log.info('The number of examples after preprocessing: %d', len(dev_data_input))
-
- dev_dataloader = mx.gluon.data.DataLoader(dev_data_input, batchify_fn=batchify_fn,
- num_workers=4, batch_size=args.test_batch_size,
- shuffle=False, last_batch='keep')
-
- log.info('start prediction')
-
- all_results = collections.defaultdict(list)
-
- epoch_tic = time.time()
- total_num = 0
- for (batch_id, data) in enumerate(dev_dataloader):
- data_list = list(split_and_load(data, ctx))
- for splited_data in data_list:
- example_ids, inputs, token_types, valid_length, p_mask, _, _, _ = splited_data
- total_num += len(inputs)
- outputs = net_eval(inputs, token_types, valid_length, p_mask=p_mask)
- example_ids = example_ids.asnumpy().tolist()
- for c, example_id in enumerate(example_ids):
- result = RawResultExtended(start_top_log_probs=outputs[0][c].asnumpy().tolist(),
- start_top_index=outputs[1][c].asnumpy().tolist(),
- end_top_log_probs=outputs[2][c].asnumpy().tolist(),
- end_top_index=outputs[3][c].asnumpy().tolist(),
- cls_logits=outputs[4][c].asnumpy().tolist())
- all_results[example_id].append(result)
- if batch_id % args.log_interval == 0:
- log.info('Batch: %d/%d', batch_id + 1, len(dev_dataloader))
-
- epoch_toc = time.time()
- log.info('Time cost=%.2f s, Throughput=%.2f samples/s', epoch_toc - epoch_tic,
- total_num / (epoch_toc - epoch_tic))
-
- log.info('Get prediction results...')
-
- all_predictions = collections.OrderedDict()
- all_nbest_json = collections.OrderedDict()
- scores_diff_json = collections.OrderedDict()
- for features in dev_data_features:
- results = all_results[features[0].example_id]
- example_qas_id = features[0].qas_id
- score_diff, best_non_null_entry, nbest_json = predict_extended(
- features=features, results=results, n_best_size=args.n_best_size,
- max_answer_length=args.max_answer_length, start_n_top=args.start_top_n,
- end_n_top=args.end_top_n)
- scores_diff_json[example_qas_id] = score_diff
- all_predictions[example_qas_id] = best_non_null_entry
- all_nbest_json[example_qas_id] = nbest_json
-
- output_prediction_file = os.path.join(args.output_dir, 'predictions.json')
- output_nbest_file = os.path.join(args.output_dir, 'nbest_predictions.json')
- output_null_log_odds_file = os.path.join(args.output_dir, 'null_odds.json')
-
- with open(output_prediction_file, 'w') as writer:
- writer.write(json.dumps(all_predictions, indent=4) + '\n')
- with open(output_nbest_file, 'w') as writer:
- writer.write(json.dumps(all_nbest_json, indent=4) + '\n')
- with open(output_null_log_odds_file, 'w') as writer:
- writer.write(json.dumps(scores_diff_json, indent=4) + '\n')
-
- if os.path.exists(sys.path[0] + '/evaluate-v2.0.py'):
- arguments = [
- dev_data_path, output_prediction_file, '--na-prob-thresh',
- str(args.null_score_diff_threshold)
- ]
- if args.version_2:
- arguments += ['--na-prob-file', output_null_log_odds_file]
- subprocess.call([sys.executable, sys.path[0] + '/evaluate-v2.0.py'] + arguments)
- else:
- log.info('Please download evaluate-v2.0.py to get evaluation results for SQuAD. '
- 'Check index.rst for details.')
-
-
-if __name__ == '__main__':
- if not args.only_predict:
- train()
- evaluate()
- else:
- evaluate()
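For reference, the learning-rate handling buried in `set_new_lr` above is nothing more than linear warmup followed by linear decay. The sketch below restates that schedule as a standalone function so it can be inspected in isolation; the helper name `linear_warmup_lr` and the numbers in the demo are ours, not part of the deleted script.

def linear_warmup_lr(step_num, base_lr, num_warmup_steps, num_train_steps):
    """Learning rate at a given global step: linear warmup, then linear decay to 0."""
    if step_num < num_warmup_steps:
        return base_lr * step_num / num_warmup_steps
    offset = (step_num - num_warmup_steps) * base_lr / (num_train_steps - num_warmup_steps)
    return base_lr - offset

if __name__ == '__main__':
    # e.g. base_lr=3e-5 with 1000 warmup steps out of 8000 total (illustrative numbers only)
    for step in (0, 500, 1000, 4500, 8000):
        print(step, linear_warmup_lr(step, base_lr=3e-5,
                                     num_warmup_steps=1000, num_train_steps=8000))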
diff --git a/scripts/language_model/sampler.py b/scripts/language_model/sampler.py
deleted file mode 100644
index f841fba160..0000000000
--- a/scripts/language_model/sampler.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Log Uniform Candidate Sampler"""
-
-import math
-import numpy as np
-from mxnet import ndarray, gluon
-
-
-class LogUniformSampler(gluon.block.Block):
- """Draw random samples from an approximately log-uniform or Zipfian distribution.
-
- This operation randomly samples *num_sampled* candidates from the range of integers [0, range_max).
- The elements of sampled_candidates are drawn without replacement from the base distribution.
-
- The base distribution for this operator is an approximately log-uniform or Zipfian distribution:
-
- P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)
-
- This sampler is useful when the true classes approximately follow such a distribution.
-
- This is the case, for example, when the classes represent words in a lexicon sorted in
- decreasing order of frequency. If your classes are not ordered by decreasing frequency, do not use this sampler.
-
- Additionally, it also returns the number of times each of the
- true classes and the sampled classes is expected to occur.
-
- As the candidates are drawn without replacement, the expected count for the sampled candidates
- and true classes are approximated. If the candidates are drawn with `num_tries` draws, we assume
- (falsely) that the number of tries to get a batch of batch_size distinct values is always
- `num_tries`, and the probability that the value is in a batch is 1 - (1-p)**num_tries.
-
- Parameters
- ----------
- num_sampled: int
- The number of classes to randomly sample.
- range_max: int
- The number of possible classes.
- dtype: str or np.dtype
- The dtype for outputs
- """
- def __init__(self, range_max, num_sampled, dtype=None, **kwargs):
- super(LogUniformSampler, self).__init__(**kwargs)
- self._num_sampled = num_sampled
- self._log_range = math.log(range_max + 1)
- self._dtype = np.float32 if dtype is None else dtype
- self._range_max = range_max
-
- def _prob_helper(self, num_tries, prob):
- return (num_tries.astype('float64') * (-prob).log1p()).expm1() * -1
-
- def forward(self, true_classes): # pylint: disable=arguments-differ
- """Draw samples from log uniform distribution and returns sampled candidates,
- expected count for true classes and sampled classes.
-
- Parameters
- ----------
- true_classes: NDArray
- The true classes.
-
- Returns
- -------
- samples: NDArray
- The sampled candidate classes.
- expected_count_sample: NDArray
- The expected count for sampled candidates.
- expected_count_true: NDArray
- The expected count for true classes in the same shape as `true_classes`.
- """
- num_sampled = self._num_sampled
- ctx = true_classes.context
- num_tries = 0
- log_range = math.log(self._range_max + 1)
-
- # sample candidates
- f = ndarray._internal._sample_unique_zipfian
- sampled_classes, num_tries = f(self._range_max, shape=(1, num_sampled))
- sampled_classes = sampled_classes.reshape((-1,))
- sampled_classes = sampled_classes.as_in_context(ctx)
- num_tries = num_tries.as_in_context(ctx)
-
- # expected count for true classes
- true_cls = true_classes.as_in_context(ctx).astype('float64')
- prob_true = ((true_cls + 2.0) / (true_cls + 1.0)).log() / log_range
- count_true = self._prob_helper(num_tries, prob_true)
- # expected count for sampled classes
- sampled_classes = ndarray.array(sampled_classes, ctx=ctx, dtype='int64')
- sampled_cls_fp64 = sampled_classes.astype('float64')
- prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range
- count_sampled = self._prob_helper(num_tries, prob_sampled)
- # convert to dtype
- sampled_classes = sampled_classes.astype(self._dtype, copy=False)
- count_true = count_true.astype(self._dtype, copy=False)
- count_sampled = count_sampled.astype(self._dtype, copy=False)
- return sampled_classes, count_sampled, count_true
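The expected-count math in `LogUniformSampler` above is easy to check in isolation. The NumPy sketch below reproduces the Zipfian class probability and the 1 - (1 - p)**num_tries approximation from `_prob_helper`; it does not reproduce the internal `_sample_unique_zipfian` kernel, and the `np.random.choice` call is only a stand-in for illustration.

import numpy as np

def zipf_prob(classes, range_max):
    # P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)
    classes = np.asarray(classes, dtype=np.float64)
    return (np.log(classes + 2.0) - np.log(classes + 1.0)) / np.log(range_max + 1.0)

def expected_count(classes, range_max, num_tries):
    # Probability of appearing at least once in num_tries draws: 1 - (1 - p)**num_tries,
    # computed as -expm1(num_tries * log1p(-p)) for numerical stability, as in _prob_helper.
    p = zipf_prob(classes, range_max)
    return -np.expm1(num_tries * np.log1p(-p))

range_max, num_sampled = 1000, 64
probs = zipf_prob(np.arange(range_max), range_max)
# Stand-in for _sample_unique_zipfian: draw unique candidates under the same distribution.
sampled = np.random.choice(range_max, size=num_sampled, replace=False, p=probs / probs.sum())
print(expected_count(sampled, range_max, num_tries=num_sampled)[:5])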
diff --git a/scripts/language_model/transformer/__init__.py b/scripts/language_model/transformer/__init__.py
deleted file mode 100644
index f687b12e5b..0000000000
--- a/scripts/language_model/transformer/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utilities for Language Models based on the Transformer architecture."""
-
-from .attention_cell import *
-from .data import *
-from .embedding import *
-from .model import *
-from .softmax import *
-from .transformer import *
-
-__all__ = attention_cell.__all__ + embedding.__all__ + softmax.__all__ + \
- transformer.__all__ + model.__all__ + data.__all__
diff --git a/scripts/language_model/transformer/attention_cell.py b/scripts/language_model/transformer/attention_cell.py
deleted file mode 100644
index d82ece2960..0000000000
--- a/scripts/language_model/transformer/attention_cell.py
+++ /dev/null
@@ -1,394 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Attention cells."""
-
-__all__ = [
- 'PositionalEmbeddingMultiHeadAttentionCell',
- 'RelativeSegmentEmbeddingPositionalEmbeddingMultiHeadAttentionCell'
-]
-
-import math
-
-import mxnet as mx
-
-from gluonnlp.model.attention_cell import _masked_softmax
-
-
-def _rel_shift(F, x):
- """Perform relative shift operation following Dai et al. (2019) Appendix B.
-
- Unlike Dai et al.'s implementation, the relative shift is performed on the
- last two dimensions of the ndarray x. Further, we follow Yang et al. (2019)
- in not performing zero-padding but expecting the input array to be one
- element longer along the to be shifted dimension. For example, for
- TransformerXL, the pos_seq should be `arange(start=klen, stop=-qlen,
- step=-1)` instead of `arange(start=klen - 1, stop=-qlen, step=-1)`.
-
- Assumes len(x.shape) == 3 (could be generalized once F.swapaxes supports
- negative indices)
-
- """
- x_ = x
- # Reshape to x.shape[:-2] + [x.shape[-1] + 1, x.shape[-2]]
- x_ = F.reshape_like(x_, F.swapaxes(x_, 1, 2))
- # Remove padded elements
- x_ = F.slice_axis(x_, axis=-2, begin=1, end=None)
- # Reshape back to original shape
- x = F.reshape_like(x_, F.swapaxes(x_, 1, 2))
- return x
-
-
-class PositionalEmbeddingMultiHeadAttentionCell(mx.gluon.HybridBlock):
- """Multi-head Attention Cell with positional embeddings.
-
- Parameters
- ----------
- d_head
- Number of projected units for respectively query, key, value and
- positional embeddings per attention head.
- num_heads
- Number of parallel attention heads
- dropout
- Dropout probability applied to the attention weights.
- scaled
- Whether to scale the attention score by 1/sqrt(d_head).
- weight_initializer : str or `Initializer` or None, default None
- Initializer of the weights.
- bias_initializer : str or `Initializer`, default 'zeros'
- Initializer of the bias.
- """
-
- def __init__(self, d_head: int, num_heads: int, dropout: float, scaled: bool,
- weight_initializer=None, bias_initializer='zeros', dtype='float32', prefix=None,
- params=None):
- super().__init__(prefix=prefix, params=params)
- self._d_head = d_head
- self._num_heads = num_heads
- self._dropout = dropout
- self._scaled = scaled
- self._dtype = dtype
- units = ['query', 'key', 'value', 'emb']
- with self.name_scope():
- for name in units:
- setattr(
- self, 'proj_{}'.format(name),
- mx.gluon.nn.Dense(units=d_head * num_heads, use_bias=False, flatten=False,
- weight_initializer=weight_initializer,
- bias_initializer=bias_initializer, prefix='{}_'.format(name)))
- self.query_key_bias = self.params.get('query_key_bias', shape=(num_heads, d_head),
- init=bias_initializer)
- self.query_emb_bias = self.params.get('query_emb_bias', shape=(num_heads, d_head),
- init=bias_initializer)
- if dropout:
- self._dropout_layer = mx.gluon.nn.Dropout(dropout)
-
- def hybrid_forward(self, F, query, key, value, emb, mask, query_key_bias, query_emb_bias): # pylint: disable=arguments-differ
- """Compute the attention.
-
- Parameters
- ----------
- query : Symbol or NDArray
- Query vector. Shape (batch_size, query_length, query_dim)
- key : Symbol or NDArray
- Key of the memory. Shape (batch_size, memory_length, key_dim)
- value : Symbol or NDArray
- Value of the memory. Shape (batch_size, memory_length, value_dim)
- emb : Symbol or NDArray
- Positional embeddings. Shape (memory_length + 1, value_dim)
- mask : Symbol or NDArray
- Mask of the memory slots. Shape (batch_size, query_length, memory_length)
- Only contains 0 or 1 where 0 means that the memory slot will not be used.
- If set to None, no mask will be used.
-
- Returns
- -------
- context_vec : Symbol or NDArray
- Shape (batch_size, query_length, context_vec_dim)
- att_weights : Symbol or NDArray
- Attention weights of multiple heads.
- Shape (batch_size, num_heads, query_length, memory_length)
- """
- att_weights = self._compute_weight(F, query, key, emb, mask, query_key_bias=query_key_bias,
- query_emb_bias=query_emb_bias)
- context_vec = self._read_by_weight(F, att_weights, value)
- return context_vec, att_weights
-
- def _project(self, F, name, x):
- # Shape (batch_size, query_length, num_heads * d_head)
- x = getattr(self, 'proj_{}'.format(name))(x)
- # Shape (batch_size * num_heads, query_length, d_head)
- x = F.transpose(x.reshape(shape=(0, 0, self._num_heads, -1)),
- axes=(0, 2, 1, 3))\
- .reshape(shape=(-1, 0, 0), reverse=True)
- return x
-
- def _compute_weight(self, F, query, key, emb, mask, query_key_bias, query_emb_bias):
- # Project query, key and emb
- proj_query = self.proj_query(query).reshape(shape=(0, 0, self._num_heads, -1))
- proj_key = self.proj_key(key).reshape(shape=(0, 0, self._num_heads, -1))
- proj_emb = self.proj_emb(emb).reshape(shape=(-1, self._num_heads, self._d_head))
-
- # Add biases and transpose to (batch_size, num_heads, query_length,
- # d_head) or (num_heads, query_length, d_head)
- query_with_key_bias = F.transpose(
- F.broadcast_add(proj_query, F.reshape(query_key_bias, shape=(1, 1, 0, 0),
- reverse=True)), axes=(0, 2, 1, 3))
- query_with_emb_bias = F.transpose(
- F.broadcast_add(proj_query, F.reshape(query_emb_bias, shape=(1, 1, 0, 0),
- reverse=True)), axes=(0, 2, 1, 3))
- proj_key = F.transpose(proj_key, axes=(0, 2, 1, 3))
- proj_emb = F.transpose(proj_emb, axes=(1, 0, 2))
-
- # Broadcast emb along batch axis
- proj_emb = F.broadcast_like(F.expand_dims(proj_emb, axis=0), proj_key, lhs_axes=(0, ),
- rhs_axes=(0, ))
-
- # Merge batch and num_heads axes
- query_with_key_bias = query_with_key_bias.reshape(shape=(-1, 0, 0), reverse=True)
- proj_key = proj_key.reshape(shape=(-1, 0, 0), reverse=True)
- query_with_emb_bias = query_with_emb_bias.reshape(shape=(-1, 0, 0), reverse=True)
- proj_emb = proj_emb.reshape(shape=(-1, 0, 0), reverse=True)
-
- if mask is not None:
- # Insert and broadcast along num_heads axis. Merge num_heads and
- # batch_size axes: (batch_size * num_heads, query_length,
- # memory_length)
- mask = F.broadcast_axis(F.expand_dims(mask, axis=1), axis=1, size=self._num_heads)\
- .reshape(shape=(-1, 0, 0), reverse=True)
-
- att_score_AC = F.batch_dot(query_with_key_bias, proj_key, transpose_b=True)
- att_score_BD = F.batch_dot(query_with_emb_bias, proj_emb, transpose_b=True)
-
- # Relative shift
- shifted_att_score_BD = _rel_shift(F, att_score_BD)
- shifted_att_score_BD = F.slice_like(shifted_att_score_BD, shape_like=att_score_AC,
- axes=(2, ))
-
- att_score = att_score_AC + shifted_att_score_BD
- if self._scaled:
- att_score = att_score / math.sqrt(self._d_head)
-
- att_weights = _masked_softmax(F, att_score, mask, self._dtype)
- if self._dropout:
- att_weights = self._dropout_layer(att_weights)
-
- return att_weights.reshape(shape=(-1, self._num_heads, 0, 0), reverse=True)
-
- def _read_by_weight(self, F, att_weights, value):
- att_weights = att_weights.reshape(shape=(-1, 0, 0), reverse=True)
- proj_value = self._project(F, 'value', value)
- context_vec = F.batch_dot(att_weights, proj_value)
- context_vec = F.transpose(
- context_vec.reshape(shape=(-1, self._num_heads, 0, 0), reverse=True),
- axes=(0, 2, 1, 3)).reshape(shape=(0, 0, -1))
- return context_vec
-
-
-class RelativeSegmentEmbeddingPositionalEmbeddingMultiHeadAttentionCell(mx.gluon.HybridBlock):
- """Multi-head Attention Cell with positional embeddings.
-
- Parameters
- ----------
- d_head
- Number of projected units for respectively query, key, value and
- positional embeddings per attention head.
- num_heads
- Number of parallel attention heads
- dropout
- Dropout probability applied to the attention weights.
- scaled
- Whether to scale the attention score by 1/sqrt(d_head).
- weight_initializer : str or `Initializer` or None, default None
- Initializer of the weights.
- bias_initializer : str or `Initializer`, default 'zeros'
- Initializer of the bias.
- embedding_initializer
- Initializer of the segment embeddings.
- """
-
- def __init__(self, d_head: int, num_heads: int, dropout: float, scaled: bool,
- weight_initializer=None, embedding_initializer=None, bias_initializer='zeros',
- dtype='float32', prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- self._d_head = d_head
- self._num_heads = num_heads
- self._dropout = dropout
- self._scaled = scaled
- self._dtype = dtype
- units = ['query', 'key', 'value', 'emb']
- with self.name_scope():
- for name in units:
- setattr(
- self, 'proj_{}'.format(name),
- mx.gluon.nn.Dense(units=d_head * num_heads, use_bias=False, flatten=False,
- weight_initializer=weight_initializer,
- bias_initializer=bias_initializer, prefix='{}_'.format(name)))
- self.query_key_bias = self.params.get('query_key_bias', shape=(num_heads, d_head),
- init=bias_initializer)
- self.query_emb_bias = self.params.get('query_emb_bias', shape=(num_heads, d_head),
- init=bias_initializer)
- self.seg_emb = self.params.get('seg_emb', shape=(2, num_heads, d_head),
- init=embedding_initializer)
- self.query_seg_bias = self.params.get('query_seg_bias', shape=(num_heads, d_head),
- init=bias_initializer)
- if dropout:
- self._dropout_layer = mx.gluon.nn.Dropout(dropout)
-
- # pylint: disable=arguments-differ
- def hybrid_forward(self, F, query, key, value, emb, mask, segments, query_key_bias,
- query_emb_bias, seg_emb, query_seg_bias):
- """Compute the attention.
-
- Parameters
- ----------
- query : Symbol or NDArray
- Query vector. Shape (batch_size, query_length, query_dim)
- key : Symbol or NDArray
- Key of the memory. Shape (batch_size, memory_length, key_dim)
- value : Symbol or NDArray
- Value of the memory. Shape (batch_size, memory_length, value_dim)
- emb : Symbol or NDArray
- Positional embeddings. Shape (memory_length + 1, value_dim) or
- (memory_length + query_length + 1, value_dim)
- mask : Symbol or NDArray
- Mask of the memory slots. Shape (batch_size, query_length, memory_length)
- Only contains 0 or 1 where 0 means that the memory slot will not be used.
- If set to None, no mask will be used.
- segments : Symbol or NDArray
- One-hot vector indicating if a query-key pair is in the same
- segment or not. Shape [batch_size, query_length, key_length, 2].
- `1` indicates that the pair is not in the same segment.
-
- Returns
- -------
- context_vec : Symbol or NDArray
- Shape (batch_size, query_length, context_vec_dim)
- att_weights : Symbol or NDArray
- Attention weights of multiple heads.
- Shape (batch_size, num_heads, query_length, memory_length)
- """
- att_weights = self._compute_weight(F, query=query, key=key, emb=emb, segments=segments,
- seg_emb=seg_emb, mask=mask,
- query_key_bias=query_key_bias,
- query_emb_bias=query_emb_bias,
- query_seg_bias=query_seg_bias)
- context_vec = self._read_by_weight(F, att_weights, value)
- return context_vec, att_weights
-
- def _project(self, F, name, x):
- # Shape (batch_size, query_length, num_heads * d_head)
- x = getattr(self, 'proj_{}'.format(name))(x)
- # Shape (batch_size * num_heads, query_length, d_head)
- x = F.transpose(x.reshape(shape=(0, 0, self._num_heads, -1)),
- axes=(0, 2, 1, 3))\
- .reshape(shape=(-1, 0, 0), reverse=True)
- return x
-
- def _compute_weight(self, F, query, key, emb, segments, seg_emb, mask, query_key_bias,
- query_emb_bias, query_seg_bias):
- # Project query, key and emb
- proj_query = self.proj_query(query).reshape(shape=(0, 0, self._num_heads, -1))
- proj_key = self.proj_key(key).reshape(shape=(0, 0, self._num_heads, -1))
- proj_emb = self.proj_emb(emb).reshape(shape=(-1, self._num_heads, self._d_head))
-
- # Add biases and transpose to (batch_size, num_heads, query_length,
- # d_head) or (num_heads, query_length, d_head)
- query_with_key_bias = F.transpose(
- F.broadcast_add(proj_query, F.reshape(query_key_bias, shape=(1, 1, 0, 0),
- reverse=True)), axes=(0, 2, 1, 3))
- query_with_emb_bias = F.transpose(
- F.broadcast_add(proj_query, F.reshape(query_emb_bias, shape=(1, 1, 0, 0),
- reverse=True)), axes=(0, 2, 1, 3))
- query_with_seg_bias = F.transpose(
- F.broadcast_add(proj_query, F.reshape(query_seg_bias, shape=(1, 1, 0, 0),
- reverse=True)), axes=(0, 2, 1, 3))
- proj_key = F.transpose(proj_key, axes=(0, 2, 1, 3))
- proj_emb = F.transpose(proj_emb, axes=(1, 0, 2))
-
- # Broadcast emb along batch axis
- proj_emb = F.broadcast_like(F.expand_dims(proj_emb, axis=0), proj_key, lhs_axes=(0, ),
- rhs_axes=(0, ))
-
- # Merge batch and num_heads axes
- query_with_key_bias = query_with_key_bias.reshape(shape=(-1, 0, 0), reverse=True)
- proj_key = proj_key.reshape(shape=(-1, 0, 0), reverse=True)
- query_with_emb_bias = query_with_emb_bias.reshape(shape=(-1, 0, 0), reverse=True)
- proj_emb = proj_emb.reshape(shape=(-1, 0, 0), reverse=True)
- query_with_seg_bias = query_with_seg_bias.reshape(shape=(-1, 0, 0), reverse=True)
-
- if mask is not None:
- # Insert and broadcast along num_heads axis. Merge num_heads and
- # batch_size axes: (batch_size * num_heads, query_length,
- # memory_length)
- mask = F.broadcast_axis(F.expand_dims(mask, axis=1), axis=1, size=self._num_heads)\
- .reshape(shape=(-1, 0, 0), reverse=True)
-
- att_score_AC = F.batch_dot(query_with_key_bias, proj_key, transpose_b=True)
- att_score_BD = F.batch_dot(query_with_emb_bias, proj_emb, transpose_b=True)
-
- # Relative Segment Embeddings
- # einsum bnid,snd->bnis
- seg_emb = F.transpose(seg_emb, axes=(1, 2, 0)).expand_dims(0)
- seg_emb = F.broadcast_like(lhs=seg_emb, rhs=query, lhs_axes=[0], rhs_axes=[0])
- seg_emb = seg_emb.reshape(shape=(-1, 0, 0), reverse=True)
- # seg_emb of shape (batch_size * num_heads, d_head, 2)
-
- ef = F.batch_dot(query_with_seg_bias, seg_emb)
- ef = ef.reshape(shape=(-1, self._num_heads, 0, 2), reverse=True)
- # ef of shape (batch_size, num_heads, query_length, 2)
-
- # einsum bijs,bnis->bnij
- segments = segments.reshape(shape=(-1, 2), reverse=True)
- # segments of shape (batch_size * query_length * memory_length, 2)
- efs = []
- for n in range(self._num_heads):
- # shape (batch_size, 1, query_length, 2)
- ef_n = ef.slice_axis(axis=1, begin=n, end=n + 1)
- ef_n = ef_n.transpose((0, 2, 1, 3)) # shape (batch_size, query_length, 1, 2)
- ef_n = F.broadcast_like(lhs=ef_n, rhs=key, lhs_axes=[2], rhs_axes=[1])
- ef_n_merged = ef_n.reshape(shape=(-1, 2), reverse=True)
- # ef_n_merged of shape (batch_size * query_length * memory_length, 2)
-
- ef_n_result = F.batch_dot(segments.expand_dims(1), ef_n_merged.expand_dims(2))
- # ef_n_result of shape (batch_size * query_length * memory_length, 1, 1)
- ef_n_result = ef_n_result.reshape_like(ef_n, lhs_begin=0, lhs_end=3, rhs_begin=0,
- rhs_end=3).expand_dims(1)
- # ef_n_result of shape (batch_size, 1, query_length, memory_length)
- efs.append(ef_n_result)
-
- att_score_EF = F.concat(*efs, dim=1).reshape(shape=(-1, 0, 0), reverse=True)
- # shape (batch_size * num_heads, query_length, memory_length)
-
- # Relative shift
- shifted_att_score_BD = _rel_shift(F, att_score_BD)
- shifted_att_score_BD = F.slice_like(shifted_att_score_BD, shape_like=att_score_AC,
- axes=(2, ))
-
- att_score = att_score_AC + shifted_att_score_BD + att_score_EF
- if self._scaled:
- att_score = att_score / math.sqrt(self._d_head)
-
- att_weights = _masked_softmax(F, att_score, mask, self._dtype)
- if self._dropout:
- att_weights = self._dropout_layer(att_weights)
-
- return att_weights.reshape(shape=(-1, self._num_heads, 0, 0), reverse=True)
-
- def _read_by_weight(self, F, att_weights, value):
- att_weights = att_weights.reshape(shape=(-1, 0, 0), reverse=True)
- proj_value = self._project(F, 'value', value)
- context_vec = F.batch_dot(att_weights, proj_value)
- context_vec = F.transpose(
- context_vec.reshape(shape=(-1, self._num_heads, 0, 0), reverse=True),
- axes=(0, 2, 1, 3)).reshape(shape=(0, 0, -1))
- return context_vec
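The `_rel_shift` trick used by both attention cells above is easier to see in plain NumPy. The sketch below mirrors the reshape / slice / reshape sequence on an array of shape (batch, q_len, k_len + 1); as in the original, the caller is expected to trim the result back to the memory length afterwards (the original uses `slice_like`).

import numpy as np

def rel_shift(x):
    # x: (batch, q_len, k_len + 1) -> (batch, q_len, k_len)
    b, q, k1 = x.shape
    x = x.reshape(b, k1, q)          # reinterpret the last two axes (no transpose of the data)
    x = x[:, 1:, :]                  # drop the first "row" of the reinterpreted view
    return x.reshape(b, q, k1 - 1)   # reinterpret back into q_len rows of length k_len

# Per batch element, the first q_len entries of the flattened scores are dropped and the
# remainder re-chunked into q_len rows of length k_len, realigning the relative-position scores.
x = np.arange(1 * 3 * 5, dtype=np.float64).reshape(1, 3, 5)
print(x[0])
print(rel_shift(x)[0])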
diff --git a/scripts/language_model/transformer/data.py b/scripts/language_model/transformer/data.py
deleted file mode 100644
index b4b0ef3b7f..0000000000
--- a/scripts/language_model/transformer/data.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-encoded-data, too-many-lines
-"""Transformer API.
-
- It provides tools for common transformations on samples in text datasets, such as
-clipping, padding, and tokenization.
-"""
-import unicodedata
-from typing import List, Optional
-
-import gluonnlp as nlp
-
-__all__ = ['XLNetTokenizer']
-
-
-class XLNetTokenizer:
- """End-to-end tokenization for XLNet models.
-
- Parameters
- ----------
- sentencepiece_path
- Path to sentencepiece model, to be used for obtaining word pieces.
-
- .. note::
-
- For multi-processing, it is recommended to make an extra copy of the
- XLNetTokenizer instance before calling it for the first time.
- SentencePiece models cannot be pickled, which would be required for
- multi-processing, and the SentencePiece model is therefore only initialized
- during the first call.
-
- Examples
- --------
- >>> _, vocab = gluonnlp.model.bert_12_768_12(dataset_name='wiki_multilingual_uncased',
- ... pretrained=False, root='./model')
- -etc-
- >>> tokenizer = gluonnlp.data.BERTTokenizer(vocab=vocab)
- >>> tokenizer('gluonnlp: 使NLP变得简单。')
- ['gl', '##uo', '##nn', '##lp', ':', '使', 'nl', '##p', '变', '得', '简', '单', '。']
-
- """
- _spiece_prefix = '▁'
-
- def __init__(self, sentencepiece_path: str, lower: bool = False, remove_space: bool = True,
- keep_accents: bool = False):
- self._sentencepiece_path = sentencepiece_path
- self._lower = lower
- self._remove_space = remove_space
- self._keep_accents = keep_accents
- self._sentencepiece = None # type: Optional[nlp.data.SentencepieceTokenizer]
-
- def __call__(self, sample: str) -> List[str]:
- """Tokenize a sample.
-
- Parameters
- ----------
- sample
- The string to tokenize.
-
- Returns
- -------
- tokens
- List of tokens
- """
-
- if self._remove_space:
- outputs = ' '.join(sample.strip().split())
- else:
- outputs = sample
- outputs = outputs.replace('``', '"').replace('\'\'', '"')
-
- if not self._keep_accents:
- outputs = unicodedata.normalize('NFKD', outputs)
- outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
- if self._lower:
- outputs = outputs.lower()
-
- if self._sentencepiece is None:
- self._sentencepiece = nlp.data.SentencepieceTokenizer(self._sentencepiece_path)
-
- pieces = self._sentencepiece(outputs)
- new_pieces = [] # type: List[str]
- for piece in pieces:
- if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
- cur_pieces = self._sentencepiece(piece[:-1].replace(self._spiece_prefix, ''))
- if piece[0] != self._spiece_prefix and cur_pieces[0][0] == self._spiece_prefix:
- if len(cur_pieces[0]) == 1:
- cur_pieces = cur_pieces[1:]
- else:
- cur_pieces[0] = cur_pieces[0][1:]
- cur_pieces.append(piece[-1])
- new_pieces.extend(cur_pieces)
- else:
- new_pieces.append(piece)
-
- return new_pieces
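Everything `XLNetTokenizer.__call__` does before handing the string to SentencePiece is self-contained. The sketch below reproduces just that normalization (whitespace cleanup, quote normalization, optional accent stripping and lower-casing); the SentencePiece step itself needs the .spiece model file and is omitted here.

import unicodedata

def preprocess_text(sample, lower=False, remove_space=True, keep_accents=False):
    # Collapse whitespace and normalize LaTeX-style quotes, as in XLNetTokenizer.__call__.
    outputs = ' '.join(sample.strip().split()) if remove_space else sample
    outputs = outputs.replace('``', '"').replace("''", '"')
    if not keep_accents:
        # NFKD-decompose and drop combining marks (accent stripping).
        outputs = unicodedata.normalize('NFKD', outputs)
        outputs = ''.join(c for c in outputs if not unicodedata.combining(c))
    if lower:
        outputs = outputs.lower()
    return outputs

print(preprocess_text("  Ça  va ``bien''  ", lower=True))  # -> ca va "bien"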
diff --git a/scripts/language_model/transformer/embedding.py b/scripts/language_model/transformer/embedding.py
deleted file mode 100644
index c937e09457..0000000000
--- a/scripts/language_model/transformer/embedding.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Attention cells."""
-
-__all__ = ['AdaptiveEmbedding', 'ProjectedEmbedding']
-
-from typing import List
-
-import mxnet as mx
-
-
-class ProjectedEmbedding(mx.gluon.HybridBlock):
- """Projected Embedding"""
-
- def __init__(self, vocab_size: int, embed_size: int, units: int, project_same_dim: bool = True,
- embedding_initializer=None, projection_initializer=None, prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- self._vocab_size = vocab_size
- self._embed_size = embed_size
- self._units = units
- self._project_same_dim = project_same_dim
- self._emb_scale = units**0.5
-
- with self.name_scope():
- self.embedding_weight = self.params.get('embedding_weight',
- shape=(vocab_size, embed_size),
- init=embedding_initializer)
- if units != embed_size or project_same_dim:
- self.projection_weight = self.params.get('projection_weight',
- shape=(units, embed_size),
- init=projection_initializer)
-
- def hybrid_forward(self, F, inp, **params): # pylint: disable=arguments-differ
- emb = F.Embedding(data=inp, weight=params['embedding_weight'], input_dim=self._vocab_size,
- output_dim=self._embed_size)
- if self._units != self._embed_size or self._project_same_dim:
- emb = F.FullyConnected(data=emb, weight=params['projection_weight'], no_bias=True,
- flatten=False, num_hidden=self._units)
- return emb * self._emb_scale
-
-
-class AdaptiveEmbedding(mx.gluon.HybridBlock):
- """Adaptive Embedding
-
- Baevski, A., & Auli, M. (2019). Adaptive input representations for neural
- language modeling. In International Conference on Learning Representations.
-
- """
-
- # TODO: Transformer-XL has a sample_softmax argument here
-
- def __init__(self, vocab_size: int, embed_size: int, units: int, cutoffs: List[int],
- div_val: int = 1, project_same_dim: bool = True, embedding_initializer=None,
- projection_initializer=None, prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- # Sanity checks
- if cutoffs != sorted(cutoffs):
- raise ValueError('cutoffs must be a sorted list of cutoff values. '
- 'Got {}, but expected {}'.format(cutoffs, sorted(cutoffs)))
- if not cutoffs:
- raise ValueError('cutoffs must not be empty. Got {}'.format(cutoffs))
- if cutoffs[0] <= 0:
- raise ValueError('The first cutoff value ({}) must be greater than 0.'.format(cutoffs[0]))
- if cutoffs[-1] >= vocab_size:
- raise ValueError(
- 'The last cutoff value ({}) must be smaller than vocab_size ({}).'.format(
- cutoffs[-1], vocab_size))
-
- self._vocab_size = vocab_size
- self._embed_size = embed_size
- self._cutoffs = [0] + cutoffs + [vocab_size]
- self._div_val = div_val
- self._units = units
- self._project_same_dim = project_same_dim
- self._emb_scale = units**0.5
-
- with self.name_scope():
- if self._div_val == 1:
- name = 'embedding0_weight'
- setattr(
- self, name,
- self.params.get(name, shape=(vocab_size, embed_size),
- init=embedding_initializer))
-
- if units != embed_size or project_same_dim:
- name = 'projection0_weight'
- setattr(
- self, name,
- self.params.get(name, shape=(units, embed_size),
- init=projection_initializer))
- else:
- for i, (l_idx, r_idx) in enumerate(zip(self._cutoffs, self._cutoffs[1:])):
- name = 'embedding{}_weight'.format(i)
- setattr(
- self, name,
- self.params.get(name, shape=(r_idx - l_idx, embed_size // div_val**i),
- init=embedding_initializer))
-
- if units != embed_size // div_val**i or project_same_dim:
- name = 'projection{}_weight'.format(i)
- setattr(
- self, name,
- self.params.get(name, shape=(units, embed_size // div_val**i),
- init=projection_initializer))
-
- def hybrid_forward(self, F, inp, **params): # pylint: disable=arguments-differ
- if self._div_val == 1:
- emb = F.Embedding(data=inp, weight=params['embedding0_weight'],
- input_dim=self._vocab_size, output_dim=self._embed_size)
- if self._units != self._embed_size or self._project_same_dim:
- emb = F.FullyConnected(data=emb, weight=params['projection0_weight'], no_bias=True,
- flatten=False, num_hidden=self._units)
- else:
- inp_flat = inp.reshape((-1, ))
- zeros_like_inp_flat = F.zeros_like(inp_flat)
- ones_like_inp_flat = F.ones_like(inp_flat)
- emb_flat = None
- for i, (l_idx, r_idx) in enumerate(zip(self._cutoffs, self._cutoffs[1:])):
- cond_i = F.broadcast_logical_and(inp_flat >= l_idx, inp_flat < r_idx)
- inp_i = F.where(cond_i, inp_flat - l_idx, zeros_like_inp_flat)
- mask_i = F.expand_dims(F.where(cond_i, ones_like_inp_flat, zeros_like_inp_flat),
- axis=1)
-
- emb_i = F.Embedding(data=inp_i, weight=params['embedding{}_weight'.format(i)],
- input_dim=r_idx - l_idx,
- output_dim=self._embed_size // self._div_val**i)
- emb_i = F.broadcast_mul(emb_i, mask_i)
- if self._units != self._embed_size // self._div_val**i or self._project_same_dim:
- emb_i = F.FullyConnected(data=emb_i,
- weight=params['projection{}_weight'.format(i)],
- no_bias=True, flatten=False, num_hidden=self._units)
-
- if emb_flat is None: # i == 0
- emb_flat = emb_i
- else:
- emb_flat = emb_flat + emb_i
-
- emb = F.reshape_like(emb_flat, inp, lhs_begin=0, lhs_end=1)
-
- emb = emb * self._emb_scale
-
- return emb
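A NumPy sketch of what `AdaptiveEmbedding` computes may help: the vocabulary is partitioned at `cutoffs`, cluster i gets an embedding of width embed_size // div_val**i, and each cluster is projected to `units` so the per-cluster results can be summed. The weights below are random and the projection is applied unconditionally, so only the control flow (and the final `units**0.5` scaling) mirrors the block above.

import numpy as np

def adaptive_embed(inp, vocab_size, embed_size, units, cutoffs, div_val=2, seed=0):
    rng = np.random.default_rng(seed)
    bounds = [0] + list(cutoffs) + [vocab_size]
    out = np.zeros(inp.shape + (units,))
    for i, (lo, hi) in enumerate(zip(bounds, bounds[1:])):
        width = embed_size // div_val**i              # rarer clusters get narrower embeddings
        emb = rng.standard_normal((hi - lo, width))   # random stand-in weights
        proj = rng.standard_normal((width, units))
        mask = (inp >= lo) & (inp < hi)
        ids = np.where(mask, inp - lo, 0)             # clamp out-of-cluster ids to 0
        out += (emb[ids] @ proj) * mask[..., None]    # zero out rows not in this cluster
    return out * units**0.5                           # same scaling as the block above

tokens = np.array([[3, 25000, 199999], [7, 42, 123456]])
print(adaptive_embed(tokens, vocab_size=200000, embed_size=1024, units=512,
                     cutoffs=[20000, 40000]).shape)  # (2, 3, 512)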
diff --git a/scripts/language_model/transformer/model.py b/scripts/language_model/transformer/model.py
deleted file mode 100644
index de4d7dbbe6..0000000000
--- a/scripts/language_model/transformer/model.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import, arguments-differ
-"""Module for pre-defined NLP models."""
-
-import errno
-import os
-import time
-import zipfile
-from typing import Optional
-
-import mxnet as mx
-from mxnet.gluon.model_zoo import model_store
-from mxnet.gluon.utils import _get_repo_url, check_sha1, download
-
-import gluonnlp as nlp
-from gluonnlp.base import get_home_dir
-from gluonnlp.data.utils import _url_format
-from gluonnlp.model.utils import _load_pretrained_params, _load_vocab
-
-from .data import XLNetTokenizer
-from .transformer import TransformerXL, XLNet
-
-__all__ = ['get_model']
-
-model_store._model_sha1.update({
- name: checksum
- for checksum, name in [
- ('ca7a092186ec3f42ef25590a872450409faaa84f', 'xlnet_cased_l12_h768_a12_126gb'),
- ('ceae74798c1577bcf5ffb3c46b73b056a5ead786', 'xlnet_cased_l24_h1024_a16_126gb'),
- ]
-})
-
-
-def get_model(name, **kwargs):
- """Returns a pre-defined model by name."""
- models = {
- # TODO better naming scheme when moving this to main API?
- 'transformerxl': transformerxl,
- 'xlnet_cased_l12_h768_a12': xlnet_cased_l12_h768_a12,
- 'xlnet_cased_l24_h1024_a16': xlnet_cased_l24_h1024_a16
- }
- name = name.lower()
- if name not in models:
- raise ValueError('Model %s is not supported. Available options are\n\t%s' %
- (name, '\n\t'.join(sorted(models.keys()))))
- return models[name](**kwargs)
-
-
-def transformerxl(dataset_name: str, vocab: nlp.Vocab, **kwargs):
- """Generic pre-trained Transformer-XL model.
-
- The hyperparameters are chosen based on the specified dataset_name from the
- published hyperparameters of Dai et al.
-
-
- References:
- Dai, Z., Yang, Z., Yang, Y., Cohen, W. W., Carbonell, J., Le, Q. V., &
- Salakhutdinov, R. (2019). Transformer-XL: Attentive language models beyond
- a fixed-length context. arXiv preprint arXiv:1901.02860.
-
- Parameters
- ----------
- dataset_name
- Used to load hyperparameters for the dataset.
- vocab
- Vocabulary for the dataset.
-
- Returns
- -------
- TransformerXL, gluonnlp.Vocab
-
- """
-
- dataset_name_to_kwargs = dict(
- wt103={
- 'embed_cutoffs': [20000, 40000, 200000],
- 'embed_size': 1024,
- 'embed_div_val': 4,
- 'tie_input_output_embeddings': True,
- 'tie_input_output_projections': [False, True, True, True],
- 'num_layers': 18,
- 'hidden_size': 4096,
- 'units': 1024,
- 'num_heads': 16,
- 'dropout': 0,
- 'attention_dropout': 0
- }, lm1b={
- 'embed_cutoffs': [60000, 100000, 640000],
- 'embed_size': 1280,
- 'embed_div_val': 4,
- 'project_same_dim': False,
- 'tie_input_output_embeddings': True,
- 'num_layers': 24,
- 'hidden_size': 8192,
- 'units': 1280,
- 'num_heads': 16,
- 'dropout': 0,
- 'attention_dropout': 0
- }, enwik8={
- 'embed_size': 1024,
- 'tie_input_output_embeddings': True,
- 'num_layers': 24,
- 'hidden_size': 3072,
- 'units': 1024,
- 'num_heads': 8,
- 'dropout': 0,
- 'attention_dropout': 0
- }, text8={
- 'embed_size': 1024,
- 'tie_input_output_embeddings': True,
- 'num_layers': 24,
- 'hidden_size': 3072,
- 'units': 1024,
- 'num_heads': 8,
- 'dropout': 0,
- 'attention_dropout': 0
- })
-
- options = dataset_name_to_kwargs[dataset_name]
- options.update(**kwargs)
- model = TransformerXL(vocab_size=len(vocab), **options)
- return model, vocab
-
-
-def xlnet_cased_l12_h768_a12(dataset_name: Optional[str] = None, vocab: Optional[nlp.Vocab] = None,
- tokenizer: Optional[XLNetTokenizer] = None, pretrained: bool = True,
- ctx: mx.Context = mx.cpu(),
- root=os.path.join(get_home_dir(), 'models'),
- do_lower_case=False, **kwargs):
- """XLNet model.
-
- References:
- Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., & Le, Q. V.
- (2019). XLNet: Generalized Autoregressive Pretraining for Language
- Understanding. arXiv preprint arXiv:1906.08237.
-
-
- Parameters
- ----------
- dataset_name : str or None, default None
- If not None, the dataset name is used to load a vocabulary for the
- dataset. If the `pretrained` argument is set to True, the dataset name
- is further used to select the pretrained parameters to load.
- Options include 'books_enwiki_giga5_clueweb2012b_commoncrawl'.
- vocab : gluonnlp.vocab.Vocab or None, default None
- Vocabulary for the dataset. Must be provided if dataset_name is not
- specified. Ignored if dataset_name is specified.
- tokenizer : XLNetTokenizer or None, default None
- XLNetTokenizer for the dataset. Must be provided if dataset_name is not
- specified. Ignored if dataset_name is specified.
- pretrained : bool, default True
- Whether to load the pretrained weights for model.
- ctx : Context, default CPU
- The context in which to load the pretrained weights.
- root : str, default '$MXNET_HOME/models'
- Location for keeping the model parameters.
- MXNET_HOME defaults to '~/.mxnet'.
-
- Returns
- -------
- XLNet, gluonnlp.Vocab, XLNetTokenizer
- """
-
- kwargs.update(**{
- 'hidden_size': 3072,
- 'units': 768,
- 'activation': 'gelu',
- 'num_heads': 12,
- 'num_layers': 12,
- })
- if vocab is None or dataset_name is not None:
- vocab = _load_vocab('xlnet_' + dataset_name, vocab, root)
- net = XLNet(vocab_size=len(vocab), **kwargs)
- if pretrained:
- _load_pretrained_params(net=net, model_name='xlnet_cased_l12_h768_a12',
- dataset_name=dataset_name, root=root, ctx=ctx,
- ignore_extra=not kwargs.get('use_decoder', True))
- if tokenizer is None or dataset_name is not None:
- tokenizer = _get_xlnet_tokenizer(dataset_name, root, do_lower_case)
- return net, vocab, tokenizer
-
-
-def xlnet_cased_l24_h1024_a16(dataset_name: Optional[str] = None, vocab: Optional[nlp.Vocab] = None,
- tokenizer: Optional[XLNetTokenizer] = None, pretrained: bool = True,
- ctx: mx.Context = mx.cpu(),
- root=os.path.join(get_home_dir(), 'models'),
- do_lower_case=False, **kwargs):
- """XLNet model.
-
- References:
- Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., & Le, Q. V.
- (2019). XLNet: Generalized Autoregressive Pretraining for Language
- Understanding. arXiv preprint arXiv:1906.08237.
-
-
- Parameters
- ----------
- dataset_name : str or None, default None
- If not None, the dataset name is used to load a vocabulary for the
- dataset. If the `pretrained` argument is set to True, the dataset name
- is further used to select the pretrained parameters to load.
- Options include 'books_enwiki_giga5_clueweb2012b_commoncrawl'.
- vocab : gluonnlp.vocab.Vocab or None, default None
- Vocabulary for the dataset. Must be provided if dataset_name is not
- specified. Ignored if dataset_name is specified.
- tokenizer : XLNetTokenizer or None, default None
- XLNetTokenizer for the dataset. Must be provided if dataset_name is not
- specified. Ignored if dataset_name is specified.
- pretrained : bool, default True
- Whether to load the pretrained weights for model.
- ctx : Context, default CPU
- The context in which to load the pretrained weights.
- root : str, default '$MXNET_HOME/models'
- Location for keeping the model parameters.
- MXNET_HOME defaults to '~/.mxnet'.
-
- Returns
- -------
- XLNet, gluonnlp.Vocab, XLNetTokenizer
-
- """
- kwargs.update(**{
- 'hidden_size': 4096,
- 'units': 1024,
- 'activation': 'approx_gelu',
- 'num_heads': 16,
- 'num_layers': 24,
- })
- if vocab is None or dataset_name is not None:
- vocab = _load_vocab('xlnet_' + dataset_name, vocab, root)
- net = XLNet(vocab_size=len(vocab), **kwargs)
- if pretrained:
- _load_pretrained_params(net=net, model_name='xlnet_cased_l24_h1024_a16',
- dataset_name=dataset_name, root=root, ctx=ctx,
- ignore_extra=not kwargs.get('use_decoder', True))
- if tokenizer is None or dataset_name is not None:
- tokenizer = _get_xlnet_tokenizer(dataset_name, root, do_lower_case)
- return net, vocab, tokenizer
-
-
-def _get_xlnet_tokenizer(dataset_name, root, do_lower_case=False):
- assert dataset_name.lower() == '126gb'
- root = os.path.expanduser(root)
- file_path = os.path.join(root, 'xlnet_126gb-871f0b3c.spiece')
- sha1_hash = '871f0b3c13b92fc5aea8fba054a214c420e302fd'
- if os.path.exists(file_path):
- if not check_sha1(file_path, sha1_hash):
- print('Detected mismatch in the content of model tokenizer. Downloading again.')
- else:
- print('Tokenizer file is not found. Downloading.')
-
- if not os.path.exists(root):
- try:
- os.makedirs(root)
- except OSError as e:
- if e.errno == errno.EEXIST and os.path.isdir(root):
- pass
- else:
- raise e
-
- repo_url = _get_repo_url()
- prefix = str(time.time())
- zip_file_path = os.path.join(root, prefix + 'xlnet_126gb-871f0b3c.zip')
- if repo_url[-1] != '/':
- repo_url = repo_url + '/'
- download(_url_format.format(repo_url=repo_url, file_name='xlnet_126gb-871f0b3c'),
- path=zip_file_path, overwrite=True)
- with zipfile.ZipFile(zip_file_path) as zf:
- if not os.path.exists(file_path):
- zf.extractall(root)
- try:
- os.remove(zip_file_path)
- except OSError as e:
- # file has already been removed.
- if e.errno == 2:
- pass
- else:
- raise e
-
- if not check_sha1(file_path, sha1_hash):
- raise ValueError('Downloaded file has different hash. Please try again.')
-
- tokenizer = XLNetTokenizer(file_path, lower=do_lower_case)
- return tokenizer
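For context, the entry point removed above was used roughly as follows. This is illustrative only: the module is deleted by this change, the import assumes the working directory is scripts/language_model, and the forward call is omitted because the XLNet block's signature is defined elsewhere.

import mxnet as mx
from transformer import get_model  # the scripts/language_model/transformer package deleted above

# Downloads the vocabulary, pretrained parameters and SentencePiece tokenizer file on first use.
net, vocab, tokenizer = get_model('xlnet_cased_l12_h768_a12', dataset_name='126gb',
                                  pretrained=True, ctx=mx.cpu())
tokens = tokenizer('GluonNLP is great!')
token_ids = mx.nd.array([vocab[tokens]])
print(tokens, token_ids.shape)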
diff --git a/scripts/language_model/transformer/softmax.py b/scripts/language_model/transformer/softmax.py
deleted file mode 100644
index cbd86d01e5..0000000000
--- a/scripts/language_model/transformer/softmax.py
+++ /dev/null
@@ -1,360 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Attention cells."""
-
-__all__ = ['AdaptiveLogSoftmaxWithLoss', 'ProjectedLogSoftmaxWithLoss']
-
-from typing import List, Optional
-
-import mxnet as mx
-
-
-class ProjectedLogSoftmaxWithLoss(mx.gluon.HybridBlock):
- """ProjectedLogSoftmaxWithLoss"""
-
- def __init__(self, vocab_size: int, embed_size: int, units: int, use_bias: bool = True,
- project_same_dim: bool = True, projection_initializer=None,
- embedding_initializer=None, tie_embeddings: bool = False,
- tie_projections: bool = False, prefix: Optional[str] = None,
- params: Optional[mx.gluon.ParameterDict] = None):
- super().__init__(prefix=prefix, params=params)
- self._vocab_size = vocab_size
- self._embed_size = embed_size
- self._use_bias = use_bias
- self._units = units
- self._project_same_dim = project_same_dim
- self._embedding_initializer = embedding_initializer
- self._projection_initializer = projection_initializer
- self._tie_embeddings = tie_embeddings
- self._tie_projections = tie_projections
-
- self._projections_name = '{}projection_weight'
- self._embeddings_name = '{}embedding_weight'
- with self.name_scope():
- if units != embed_size or project_same_dim:
- name = self._get_param_name('projection')
- param = self.params.get(name, shape=(units, embed_size),
- init=self._projection_initializer)
- setattr(self, name, param)
-
- name = self._get_param_name('embedding')
- param = self.params.get(name, shape=(vocab_size, embed_size),
- init=self._embedding_initializer)
- setattr(self, name, param)
- if use_bias:
- name = 'outembedding_bias'
- param = self.params.get(name, shape=(self._vocab_size, ))
- setattr(self, name, param)
-
- def _get_param_name(self, name):
- if name == 'projection':
- return self._projections_name.format('' if self._tie_projections else 'out')
- elif name == 'embedding':
- return self._embeddings_name.format('' if self._tie_embeddings else 'out')
- else:
- raise ValueError('Invalid name')
-
- def hybrid_forward(self, F, hidden, target, **params): # pylint: disable=arguments-differ
- """Compute adaptive softmax.
-
- Parameters
- ----------
- hidden : Symbol or NDArray
- Inputs of shape [batch_size, sequence_length, units]
- target : Symbol or NDArray
- Targets of shape [batch_size, sequence_length]
-
- Returns
- -------
- out : Symbol or NDArray
- Negative log likelihood of targets with shape [batch_size,
- sequence_length]
- """
- if target is None: # TODO support None or add separate log_prob method
- raise NotImplementedError()
-
- # Work with flat data for simplicity
- target_flat = target.reshape((-1, ))
- hidden = F.reshape(hidden, shape=(-1, 0), reverse=True)
-
- # Helper arrays
- if F is mx.nd:
- range_bs_len = mx.nd.arange(target_flat.shape[0], dtype=target_flat.dtype,
- ctx=target_flat.context)
- else:
- # Shape inference fails when relying on F.stack(range_bs_len, ...)
- # below. Thus add zeros of intended shape here to simplify the
- # shape inference problem.
- range_bs_len = F.zeros_like(target_flat) + F.arange(start=0, stop=None,
- infer_range=True)
-
- if self._units != self._embed_size or self._project_same_dim:
- name = self._get_param_name('projection')
- hidden = F.FullyConnected(data=hidden, weight=F.transpose(params[name]), no_bias=True,
- flatten=False, num_hidden=self._embed_size)
-
- name = self._get_param_name('embedding')
- logits = F.FullyConnected(data=hidden, weight=params[name],
- bias=params['outembedding_bias'] if self._use_bias else None,
- no_bias=not self._use_bias, flatten=False,
- num_hidden=self._vocab_size)
- logprob = F.log_softmax(logits)
- target_ = F.stack(range_bs_len, target_flat)
- out = F.gather_nd(logprob, indices=target_)
-
- out = F.reshape_like(out, target)
-
- return -out
-
-
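The hybrid_forward above boils down to a log-softmax over the full vocabulary followed by gathering the log-probability of each target token; a minimal numpy sketch of that pattern (illustration only):

import numpy as np

def nll(logits, targets):
    # logits: (batch * length, vocab_size); targets: (batch * length,)
    z = logits - logits.max(axis=1, keepdims=True)               # numerical stability
    logprob = z - np.log(np.exp(z).sum(axis=1, keepdims=True))   # log_softmax
    return -logprob[np.arange(len(targets)), targets]            # gather target log-probs

logits = np.random.randn(4, 10)
targets = np.array([3, 1, 7, 0])
print(nll(logits, targets))  # shape (4,): one loss value per token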
-class AdaptiveLogSoftmaxWithLoss(mx.gluon.HybridBlock):
- """Efficient softmax approximation
-
- Grave, E., Joulin, A., Cissé, M., Jégou, H., & others, (2017). Efficient
- softmax approximation for GPUs. In Proceedings of the 34th International
- Conference on Machine Learning - Volume 70 (pp. 1302–1310).
-
- Parameters
- ----------
- vocab_size
- embed_size
- units
- Feature dimension of inputs. Must be specified, as shape inference
- would fail if the first batch does not contain target indices of every
- cluster.
- cutoffs
- Ordered list of cutoff values for the clusters.
- div_val
- Division value to obtain embed_size per cluster. For cluster i:
- embed_size / div_val**i.
- use_bias
- Use a bias for the output layer.
- projection_initializer
- Initializer for the projection layers.
- embedding_initializer
- Initializer for the output layers and cluster weights. Called
- embedding_initializer, as the parameters may be tied to the embedding
- parameters of AdaptiveEmbedding.
- tie_embeddings
- Share embedding parameters with an AdaptiveEmbedding Block? If True, the
- params argument must be provided and set to the ParameterDict of the
- AdaptiveEmbedding Block.
- tie_projections
- Share projection parameters with an AdaptiveEmbedding Block? If True, the
- params argument must be provided and set to the ParameterDict of the
- AdaptiveEmbedding Block. tie_projections should be a list of boolean
- values, specifying for each cluster whether its projection weights are
- to be shared or not.
-
- """
-
- def __init__(self, vocab_size: int, embed_size: int, units: int, cutoffs: List[int],
- div_val: int = 1, use_bias: bool = True, project_same_dim: bool = True,
- projection_initializer=None, embedding_initializer=None,
- tie_embeddings: bool = False, tie_projections: Optional[List[bool]] = None,
- prefix: Optional[str] = None, params: Optional[mx.gluon.ParameterDict] = None):
- super().__init__(prefix=prefix, params=params)
- self._vocab_size = vocab_size
- self._embed_size = embed_size
- self._cutoffs = [0] + cutoffs + [vocab_size]
- self._div_val = div_val
- self._use_bias = use_bias
- self._units = units
- self._project_same_dim = project_same_dim
- self._embedding_initializer = embedding_initializer
- self._projection_initializer = projection_initializer
- self._tie_embeddings = tie_embeddings
- self._tie_projections = tie_projections
-
- # Sanity checks
- if cutoffs != sorted(cutoffs):
- raise ValueError('cutoffs must be a sorted list of cutoff values. '
- 'Got {}, but expected {}'.format(cutoffs, sorted(cutoffs)))
- if not cutoffs:
- raise ValueError('cutoffs must not be empty. Got {}'.format(cutoffs))
- if cutoffs[0] <= 0:
- raise ValueError('The first cutoff value ({}) must be greater than 0.'.format(cutoffs[0]))
- if cutoffs[-1] >= vocab_size:
- raise ValueError(
- 'The last cutoff value ({}) must be smaller than vocab_size ({}).'.format(
- cutoffs[-1], vocab_size))
-
- if tie_embeddings:
- assert params is not None
- if tie_projections is not None:
- assert params is not None
- if div_val == 1:
- if self._units == self._embed_size:
- assert len(tie_projections) == 0
- elif len(tie_projections) != 1:
- raise ValueError(
- 'tie_projections should be None or a boolean for every cluster. '
- 'As div_val == 1 there is only a single cluster. But got ({}).'.format(
- tie_projections))
- if len(tie_projections) != len(cutoffs) + 1:
- raise ValueError(
- 'tie_projections should be None or a boolean for every cluster. '
- 'It must thus have len(cutoffs) + 1. But got ({}) for cutoffs ({}).'.format(
- tie_projections, cutoffs))
-
- self._projections_name = '{}projection{}_weight'
- self._embeddings_name = '{}embedding{}_weight'
- with self.name_scope():
- if self._div_val == 1:
- if self._units != self._embed_size or project_same_dim:
- name = self._get_param_name('projection', 0)
- param = self.params.get(name, shape=(self._units, self._embed_size),
- init=self._projection_initializer)
- setattr(self, name, param)
-
- name = self._get_param_name('embedding', 0)
- param = self.params.get(name, shape=(self._vocab_size, self._embed_size),
- init=self._embedding_initializer)
- setattr(self, name, param)
- if use_bias:
- name = 'outembedding0_bias'
- param = self.params.get(name, shape=(self._vocab_size, ))
- setattr(self, name, param)
- else:
- for i, (l_idx, r_idx) in enumerate(zip(self._cutoffs, self._cutoffs[1:])):
- if self._units != self._embed_size // self._div_val**i or project_same_dim:
- name = self._get_param_name('projection', i)
- param = self.params.get(
- name, shape=(self._units, self._embed_size // self._div_val**i),
- init=self._projection_initializer)
- setattr(self, name, param)
-
- name = self._get_param_name('embedding', i)
- param = self.params.get(
- name, shape=(r_idx - l_idx, self._embed_size // self._div_val**i),
- init=self._embedding_initializer)
- setattr(self, name, param)
- if use_bias:
- name = 'outembedding{}_bias'.format(i)
- param = self.params.get(name, shape=(r_idx - l_idx, ))
- setattr(self, name, param)
-
- if self._div_val != 1:
- self.cluster = mx.gluon.nn.Dense(len(cutoffs), flatten=False,
- in_units=embed_size,
- weight_initializer=embedding_initializer)
-
- def _get_param_name(self, name, i):
- if name == 'projection':
- tied = self._tie_projections is not None and self._tie_projections[i]
- return self._projections_name.format('' if tied else 'out', i)
- elif name == 'embedding':
- return self._embeddings_name.format('' if self._tie_embeddings else 'out', i)
- else:
- raise ValueError('Invalid name')
-
- def hybrid_forward(self, F, hidden, target, **params): # pylint: disable=arguments-differ
- """Compute adaptive softmax.
-
- Parameters
- ----------
- hidden : Symbol or NDArray
- Inputs of shape [batch_size, sequence_length, units]
- target : Symbol or NDArray
- Targets of shape [batch_size, sequence_length]
-
- Returns
- -------
- out : Symbol or NDArray
- Negative log likelihood of targets with shape [batch_size,
- sequence_length]
- """
- if target is None: # TODO support None or add separate log_prob method
- raise NotImplementedError()
-
- # Work with flat data for simplicity
- target_flat = target.reshape((-1, ))
- hidden = F.reshape(hidden, shape=(-1, 0), reverse=True)
-
- # Helper arrays
- if F is mx.nd:
- range_bs_len = mx.nd.arange(target_flat.shape[0], dtype=target_flat.dtype,
- ctx=target_flat.context)
- else:
- # Shape inference fails when relying on F.stack(range_bs_len, ...)
- # below. Thus add zeros of intended shape here to simplify the
- # shape inference problem.
- range_bs_len = F.zeros_like(target_flat) + F.arange(start=0, stop=None,
- infer_range=True)
-
- if self._div_val == 1:
- if self._units != self._embed_size or self._project_same_dim:
- name = self._get_param_name('projection', 0)
- hidden = F.FullyConnected(data=hidden, weight=F.transpose(params[name]),
- no_bias=True, flatten=False, num_hidden=self._embed_size)
-
- name = self._get_param_name('embedding', 0)
- logits = F.FullyConnected(data=hidden, weight=params[name],
- bias=params['outembedding0_bias'] if self._use_bias else None,
- no_bias=not self._use_bias, flatten=False,
- num_hidden=self._vocab_size)
- logprob = F.log_softmax(logits)
- target_ = F.stack(range_bs_len, target_flat)
- out = F.gather_nd(logprob, indices=target_)
- else:
- # Prepare output
- if F is mx.nd:
- assert target.dtype == hidden.dtype
- out = F.zeros_like(target_flat)
-
- for i, (l_idx, r_idx) in enumerate(zip(self._cutoffs, self._cutoffs[1:])):
- if self._units != self._embed_size // self._div_val**i or self._project_same_dim:
- name = self._get_param_name('projection', i)
- proj_i = F.FullyConnected(data=hidden, weight=F.transpose(params[name]),
- no_bias=True, flatten=False,
- num_hidden=self._embed_size // self._div_val**i)
- else:
- proj_i = hidden
- # Shape [batch_size * sequence_length, r_idx - l_idx]
- name = self._get_param_name('embedding', i)
- logits_i = F.FullyConnected(
- data=proj_i, weight=params[name],
- bias=params['outembedding{}_bias'.format(i)] if self._use_bias else None,
- no_bias=not self._use_bias, flatten=False, num_hidden=r_idx - l_idx)
- if i == 0: # Shortlist
- logits_cluster = self.cluster(proj_i)
- logits_shortlist_cluster = F.concat(logits_i, logits_cluster, dim=1)
- logprob_shortlist_cluster = F.log_softmax(logits_shortlist_cluster)
-
- logprob_i = F.slice_axis(logprob_shortlist_cluster, axis=1, begin=0,
- end=-(len(self._cutoffs) - 2))
- logprob_cluster = F.slice_axis(logprob_shortlist_cluster, axis=1,
- begin=-(len(self._cutoffs) - 2), end=None)
- else: # Tail cluster
- logprob_i = F.broadcast_add(
- F.log_softmax(logits_i),
- F.gather_nd(logprob_cluster,
- F.stack(range_bs_len,
- F.ones_like(range_bs_len) * (i - 1))).expand_dims(1))
-
- # Targets limited to current cluster
- cond_i = F.broadcast_logical_and(target_flat >= l_idx, target_flat < r_idx)
- target_i = F.where(cond_i, target_flat - l_idx, F.zeros_like(target_flat))
- target_i = F.stack(range_bs_len, target_i)
-
- # Copy for targets that fall into the current cluster to out
- out_i = F.gather_nd(logprob_i, indices=target_i)
- out = F.where(cond_i, out_i, out)
-
- out = F.reshape_like(out, target)
-
- return -out
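For the tail clusters, the block above factorizes log P(w | h) = log P(cluster(w) | h) + log P(w | cluster(w), h) and shrinks the embedding width of each successive cluster by div_val. A short sketch of the resulting cluster layout under a hypothetical configuration (values chosen for illustration only):

vocab_size, embed_size, div_val = 50000, 512, 4
cutoffs = [2000, 10000]

bounds = [0] + cutoffs + [vocab_size]
for i, (l_idx, r_idx) in enumerate(zip(bounds, bounds[1:])):
    print('cluster {}: tokens [{}, {}), embed_size {}'.format(
        i, l_idx, r_idx, embed_size // div_val**i))
# cluster 0: tokens [0, 2000), embed_size 512
# cluster 1: tokens [2000, 10000), embed_size 128
# cluster 2: tokens [10000, 50000), embed_size 32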
diff --git a/scripts/language_model/transformer/transformer.py b/scripts/language_model/transformer/transformer.py
deleted file mode 100644
index 0c02df35bc..0000000000
--- a/scripts/language_model/transformer/transformer.py
+++ /dev/null
@@ -1,755 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Attention cells."""
-
-__all__ = ['TransformerXLCell', 'TransformerXL', 'XLNet']
-
-import typing
-
-import numpy as np
-import mxnet as mx
-from mxnet.gluon import nn
-
-import gluonnlp as nlp
-
-from .attention_cell import PositionalEmbeddingMultiHeadAttentionCell, \
- RelativeSegmentEmbeddingPositionalEmbeddingMultiHeadAttentionCell
-from .embedding import AdaptiveEmbedding, ProjectedEmbedding
-from .softmax import AdaptiveLogSoftmaxWithLoss, ProjectedLogSoftmaxWithLoss
-
-
-class PositionalEmbedding(mx.gluon.HybridBlock):
- """Positional embedding.
-
- Parameters
- ----------
- embed_size : int
- Dimensionality of positional embeddings.
- """
-
- def __init__(self, embed_size, **kwargs):
- super().__init__(**kwargs)
-
- inv_freq = 1 / mx.nd.power(10000, mx.nd.arange(0.0, embed_size, 2.0) / embed_size)
- with self.name_scope():
- self.inv_freq = self.params.get_constant('inv_freq', inv_freq.reshape((1, -1)))
-
- def hybrid_forward(self, F, pos_seq, inv_freq): # pylint: disable=arguments-differ
- """Compute positional embeddings.
-
- Parameters
- ----------
- pos_seq : Symbol or NDArray
- Positions to compute embedding for. Shape (length, )
-
- Returns
- -------
- pos_emb: Symbol or NDArray
- Positional embeddings for positions specified in pos_seq. Shape
- (length, embed_size).
- """
- inp = F.dot(pos_seq.reshape((-1, 1)), inv_freq)
- pos_emb = F.concat(F.sin(inp), F.cos(inp), dim=-1)
- return pos_emb
-
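A numpy sketch of the sinusoidal embedding computed by this block, mirroring the inv_freq constant and the sin/cos concatenation above (illustration only):

import numpy as np

def positional_embedding(pos_seq, embed_size):
    inv_freq = 1.0 / np.power(10000, np.arange(0.0, embed_size, 2.0) / embed_size)
    inp = np.outer(pos_seq, inv_freq)                            # (length, embed_size // 2)
    return np.concatenate([np.sin(inp), np.cos(inp)], axis=-1)   # (length, embed_size)

print(positional_embedding(np.arange(5.0), 8).shape)  # (5, 8)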
-
-class TransformerXLCell(mx.gluon.HybridBlock):
- """Transformer-XL Cell.
-
- Parameters
- ----------
- attention_cell
- Attention cell to be used.
- units : int
- Number of units for the output
- hidden_size : int
- number of units in the hidden layer of position-wise feed-forward networks
- num_heads : int
- Number of heads in multi-head attention
- scaled : bool
- Whether to scale the softmax input by the sqrt of the input dimension
- in multi-head attention
- dropout : float
- attention_dropout : float
- layer_norm_eps : float, default 1e-5
- Epsilon parameter passed to mxnet.gluon.nn.LayerNorm
- use_residual : bool
- output_attention: bool
- Whether to output the attention weights
- weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default None
- Prefix for name of `Block`s
- (and name of weight if params is `None`).
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
- """
-
- def __init__(self, attention_cell: PositionalEmbeddingMultiHeadAttentionCell, units=128,
- hidden_size=512, num_heads=4, activation='relu', scaled=True, dropout=0.0,
- layer_norm_eps=1e-5, output_attention=False, use_residual=True,
- weight_initializer=None, bias_initializer='zeros', prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- self._units = units
- self._num_heads = num_heads
- self._activation = activation
- self._dropout = dropout
- self._use_residual = use_residual
- self._output_attention = output_attention
- self._scaled = scaled
- with self.name_scope():
- if dropout:
- self.dropout_layer = nn.Dropout(rate=dropout)
- assert units % num_heads == 0
- self.attention_cell = attention_cell
- self.proj = nn.Dense(units=units, flatten=False, use_bias=False,
- weight_initializer=weight_initializer,
- bias_initializer=bias_initializer, prefix='proj_')
- self.ffn = nlp.model.PositionwiseFFN(hidden_size=hidden_size, units=units,
- use_residual=use_residual, dropout=dropout,
- ffn1_dropout=True, activation=activation,
- weight_initializer=weight_initializer,
- bias_initializer=bias_initializer,
- layer_norm_eps=layer_norm_eps)
- self.layer_norm = nn.LayerNorm(in_channels=units, epsilon=layer_norm_eps)
-
- def hybrid_forward(self, F, inputs, pos_emb, mem_value, mask):
- # pylint: disable=arguments-differ
- """Transformer Decoder Attention Cell.
-
- Parameters
- ----------
- inputs : Symbol or NDArray
- Input sequence. Shape (batch_size, length, C_in)
- mem_value : Symbol or NDArray
- Memory value, i.e. cached states from the previous segment. Shape (batch_size, mem_length, C_in)
- pos_emb : Symbol or NDArray
- Positional embeddings. Shape (mem_length, C_in)
- mask : Symbol or NDArray or None
- Attention mask of shape (batch_size, length, length + mem_length)
-
- Returns
- -------
- decoder_cell_outputs: list
- Outputs of the decoder cell. Contains:
-
- - outputs of the transformer decoder cell. Shape (batch_size, length, C_out)
- - additional_outputs of all the transformer decoder cell
- """
- key_value = F.concat(mem_value, inputs, dim=1)
- outputs, attention_outputs = self.attention_cell(inputs, key_value, key_value, pos_emb,
- mask)
- outputs = self.proj(outputs)
- if self._dropout:
- outputs = self.dropout_layer(outputs)
- if self._use_residual:
- outputs = outputs + inputs
- outputs = self.layer_norm(outputs)
- outputs = self.ffn(outputs)
- additional_outputs = [attention_outputs] if self._output_attention else []
- return outputs, additional_outputs
-
-
-class _BaseTransformerXL(mx.gluon.HybridBlock):
- def __init__(self, vocab_size, embed_size, embed_cutoffs=None, embed_div_val=None, num_layers=2,
- units=128, hidden_size=2048, num_heads=4, scaled=True, dropout=0.0,
- attention_dropout=0.0, use_residual=True, clamp_len: typing.Optional[int] = None,
- project_same_dim: bool = True, tie_input_output_embeddings: bool = False,
- tie_input_output_projections: typing.Optional[typing.List[bool]] = None,
- output_attention=False, weight_initializer=None, bias_initializer='zeros',
- prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- assert units % num_heads == 0, 'In TransformerDecoder, the units should be divided ' \
- 'exactly by the number of heads. Received units={}, ' \
- 'num_heads={}'.format(units, num_heads)
-
- self._num_layers = num_layers
- self._units = units
- self._embed_size = embed_size
- self._hidden_size = hidden_size
- self._num_heads = num_heads
- self._dropout = dropout
- self._use_residual = use_residual
- self._clamp_len = clamp_len
- self._project_same_dim = project_same_dim
- self._tie_input_output_embeddings = tie_input_output_embeddings
- self._tie_input_output_projections = tie_input_output_projections
- if output_attention:
- # Will be implemented when splitting this Block to separate the
- # AdaptiveLogSoftmaxWithLoss used with targets
- raise NotImplementedError()
- self._output_attention = output_attention
- with self.name_scope():
- if embed_cutoffs is not None and embed_div_val != 1:
- self.embedding = AdaptiveEmbedding(vocab_size=vocab_size, embed_size=embed_size,
- units=units, cutoffs=embed_cutoffs,
- div_val=embed_div_val,
- project_same_dim=project_same_dim)
- self.crit = AdaptiveLogSoftmaxWithLoss(vocab_size=vocab_size, embed_size=embed_size,
- units=units, cutoffs=embed_cutoffs,
- div_val=embed_div_val,
- project_same_dim=project_same_dim,
- tie_embeddings=tie_input_output_embeddings,
- tie_projections=tie_input_output_projections,
- params=self.embedding.collect_params())
- else:
- self.embedding = ProjectedEmbedding(vocab_size=vocab_size, embed_size=embed_size,
- units=units, project_same_dim=project_same_dim)
- self.crit = ProjectedLogSoftmaxWithLoss(
- vocab_size=vocab_size, embed_size=embed_size, units=units,
- project_same_dim=project_same_dim, tie_embeddings=tie_input_output_embeddings,
- tie_projections=tie_input_output_projections[0]
- if tie_input_output_projections is not None else None,
- params=self.embedding.collect_params())
-
- self.pos_emb = PositionalEmbedding(embed_size)
- if dropout:
- self.dropout_layer = nn.Dropout(rate=dropout)
-
- self.transformer_cells = nn.HybridSequential()
- for i in range(num_layers):
- attention_cell = PositionalEmbeddingMultiHeadAttentionCell(
- d_head=units // num_heads, num_heads=num_heads, scaled=scaled,
- dropout=attention_dropout)
- self.transformer_cells.add(
- TransformerXLCell(attention_cell=attention_cell, units=units,
- hidden_size=hidden_size, num_heads=num_heads,
- weight_initializer=weight_initializer,
- bias_initializer=bias_initializer, dropout=dropout,
- scaled=scaled, use_residual=use_residual,
- output_attention=output_attention,
- prefix='transformer%d_' % i))
-
- def hybrid_forward(self, F, step_input, target, mask, pos_seq, mems): # pylint: disable=arguments-differ
- """
-
- Parameters
- ----------
- step_input : NDArray or Symbol
- Input of shape [batch_size, length]
- target : NDArray or Symbol
- Targets of shape [batch_size, length]
- mask : NDArray or Symbol
- Attention mask of shape [length + memory_length]
- pos_seq : NDArray or Symbol
- Array of [length + memory_length] created with arange(length +
- memory_length).
- mems : List of NDArray or Symbol, optional
- Optional memory from previous forward passes containing
- `num_layers` `NDArray`s or `Symbol`s each of shape [batch_size,
- memory_length, units].
-
- Returns
- -------
- softmax_output : NDArray or Symbol
- Negative log likelihood of targets with shape [batch_size, length]
- hids : List of NDArray or Symbol
- List containing `num_layers` `NDArray`s or `Symbol`s each of shape
- [batch_size, mem_len, units] representing the memory states at
- the entry of each layer (does not include last_hidden).
- last_hidden
-
- """
- core_out = self.embedding(step_input)
- if self._clamp_len is not None and self._clamp_len >= 0:
- pos_seq = F.clip(pos_seq, a_min=0, a_max=self._clamp_len)
- pos_emb = self.pos_emb(pos_seq)
-
- if self._dropout:
- core_out = self.dropout_layer(core_out)
- pos_emb = self.dropout_layer(pos_emb)
-
- hids = []
- for i, layer in enumerate(self.transformer_cells):
- hids.append(core_out)
- mems_i = None if mems is None else mems[i]
- # inputs, pos_emb, mem_value, mask
- core_out, _ = layer(core_out, pos_emb, mems_i, mask)
-
- if self._dropout:
- core_out = self.dropout_layer(core_out)
-
- softmax_output = self.crit(core_out, target)
-
- return softmax_output, hids, core_out
-
-
-class TransformerXL(mx.gluon.Block):
- """Structure of the Transformer-XL.
-
- Dai, Z., Yang, Z., Yang, Y., Cohen, W. W., Carbonell, J., Le, Q. V., &
- Salakhutdinov, R. (2019). Transformer-XL: Attentive language models beyond
- a fixed-length context. arXiv preprint arXiv:1901.02860.
-
- Parameters
- ----------
- attention_cell : None
- Argument reserved for later.
- vocab_size : int or None, default None
- The size of the vocabulary.
- num_layers : int
- units : int
- hidden_size : int
- number of units in the hidden layer of position-wise feed-forward networks
- num_heads : int
- Number of heads in multi-head attention
- scaled : bool
- Whether to scale the softmax input by the sqrt of the input dimension
- in multi-head attention
- dropout : float
- use_residual : bool
- output_attention: bool
- Whether to output the attention weights
- tie_input_output_embeddings : boolean, default False
- If True, tie embedding parameters for all clusters between
- AdaptiveEmbedding and AdaptiveLogSoftmaxWithLoss.
- tie_input_output_projections : List[boolean] or None, default None
- If not None, tie projection parameters for the specified clusters
- between AdaptiveEmbedding and AdaptiveLogSoftmaxWithLoss.
- `len(tie_input_output_projections)` must equal the number of clusters,
- i.e. `len(cutoffs) + 1`.
- weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default None
- Prefix for name of `Block`s
- (and name of weight if params is `None`).
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
-
- """
-
- def __init__(self, *args, **kwargs):
- prefix = kwargs.pop('prefix', None)
- params = kwargs.pop('params', None)
- super().__init__(prefix=prefix, params=params)
-
- with self.name_scope():
- self._net = _BaseTransformerXL(*args, **kwargs)
-
- def begin_mems(self, batch_size, mem_len, context):
- mems = [
- mx.nd.zeros((batch_size, mem_len, self._net._units), ctx=context)
- for _ in range(len(self._net.transformer_cells))
- ]
- return mems
-
- def forward(self, step_input, target, mems): # pylint: disable=arguments-differ
- """
-
- Parameters
- ----------
- step_input : NDArray or Symbol
- Input of shape [batch_size, length]
- target : NDArray or Symbol
- Input of shape [batch_size, length]
- mems : List of NDArray or Symbol, optional
- Optional memory from previous forward passes containing
- `num_layers` `NDArray`s or `Symbol`s each of shape [batch_size,
- mem_len, units].
-
- Returns
- -------
- softmax_output : NDArray or Symbol
- Negative log likelihood of targets with shape [batch_size, length]
- mems : List of NDArray or Symbol
- List containing `num_layers` `NDArray`s or `Symbol`s each of shape
- [batch_size, mem_len, units] representing the memory states at
- the entry of each layer.
-
- """
- # Uses same number of unmasked memory steps for every step
- batch_size, qlen = step_input.shape[:2]
- mlen = mems[0].shape[1] if mems is not None else 0
- klen = qlen + mlen
-
- all_ones = np.ones((qlen, klen), dtype=step_input.dtype)
- mask = np.triu(all_ones, 1 + mlen) + np.tril(all_ones, 0)
- mask_nd = (mx.nd.from_numpy(mask, zero_copy=True) == 0).as_in_context(
- step_input.context).expand_dims(0).broadcast_axes(axis=0, size=batch_size)
-
- pos_seq = mx.nd.arange(start=klen, stop=-qlen, step=-1, ctx=step_input.context)
-
- softmax_output, hids, last_hidden = self._net(step_input, target, mask_nd, pos_seq, mems)
-
- # Update memory
- if mems is not None:
- new_mems = [
- # pylint: disable=invalid-sequence-index
- mx.nd.concat(mem_i, hid_i, dim=1)[:, -mem_i.shape[1]:].detach()
- for mem_i, hid_i in zip(mems, hids)
- ]
- else:
- new_mems = None
-
- return softmax_output, new_mems, last_hidden
-
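A worked numpy example of the mask built in forward() above for qlen=3, mlen=2 (illustration only): every row of the resulting mask has exactly mlen unmasked keys ending at the query's own position, which is what the 'same number of unmasked memory steps for every step' comment refers to.

import numpy as np

qlen, mlen = 3, 2
klen = qlen + mlen
all_ones = np.ones((qlen, klen))
mask = np.triu(all_ones, 1 + mlen) + np.tril(all_ones, 0)
print((mask == 0).astype(int))  # 1 = attended, 0 = masked
# [[0 1 1 0 0]
#  [0 0 1 1 0]
#  [0 0 0 1 1]]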
-
-class XLNetCell(TransformerXLCell):
- """XLNet Cell.
-
- Parameters
- ----------
- attention_cell
- Attention cell to be used.
- units : int
- Number of units for the output
- hidden_size : int
- number of units in the hidden layer of position-wise feed-forward networks
- num_heads : int
- Number of heads in multi-head attention
- scaled : bool
- Whether to scale the softmax input by the sqrt of the input dimension
- in multi-head attention
- dropout : float
- attention_dropout : float
- use_residual : bool
- output_attention: bool
- Whether to output the attention weights
- weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default None
- Prefix for name of `Block`s
- (and name of weight if params is `None`).
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
- """
-
- def hybrid_forward(self, F, inputs, pos_emb, mem_value, mask, segments):
- # pylint: disable=arguments-differ
- """Transformer Decoder Attention Cell.
-
- Parameters
- ----------
- inputs : Symbol or NDArray
- Input sequence. Shape (batch_size, length, C_in)
- mem_value : Symbol or NDArray
- Memory value, i.e. cached states from the previous segment. Shape (batch_size,
- memory_length, C_in)
- pos_emb : Symbol or NDArray
- Positional embeddings. Shape (mem_length, C_in)
- mask : Symbol or NDArray
- Attention mask of shape (batch_size, length, length + mem_length)
- segments : Symbol or NDArray
- One-hot vector indicating if a query-key pair is in the same
- segment or not. Shape [batch_size, query_length, query_length +
- memory_length, 2]. `1` indicates that the pair is not in the same
- segment.
-
- Returns
- -------
- decoder_cell_outputs: list
- Outputs of the decoder cell. Contains:
-
- - outputs of the transformer decoder cell. Shape (batch_size, length, C_out)
- - additional_outputs of all the transformer decoder cell
- """
- key_value = inputs
- if mem_value is not None:
- key_value = F.concat(mem_value, inputs, dim=1)
- outputs, attention_outputs = self.attention_cell(inputs, key_value, key_value, pos_emb,
- mask, segments)
-
- outputs = self.proj(outputs)
- if self._dropout:
- outputs = self.dropout_layer(outputs)
- if self._use_residual:
- outputs = outputs + inputs
- outputs = self.layer_norm(outputs)
- outputs = self.ffn(outputs)
- additional_outputs = [attention_outputs] if self._output_attention else []
- return outputs, additional_outputs
-
-
-class _BaseXLNet(mx.gluon.HybridBlock):
- """
- Parameters
- ----------
- vocab_size : int
- The size of the vocabulary.
- num_layers : int
- units : int
- hidden_size : int
- number of units in the hidden layer of position-wise feed-forward networks
- num_heads : int
- Number of heads in multi-head attention
- activation
- Activation function used for the position-wise feed-forward networks
- two_stream
- If True, use Two-Stream Self-Attention. Typically set to True for
- pre-training and False during finetuning.
- scaled : bool
- Whether to scale the softmax input by the sqrt of the input dimension
- in multi-head attention
- dropout : float
- attention_dropout : float
- use_residual : bool
- clamp_len : int
- Clamp all relative distances larger than clamp_len
- use_decoder : bool, default True
- Whether to include the decoder for language model prediction.
- tie_decoder_weight : bool, default True
- Whether to tie the decoder weight with the input embeddings
- weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default None
- Prefix for name of `Block`s (and name of weight if params is `None`).
- params : ParameterDict or None
- Container for weight sharing between cells. Created if `None`.
-
- """
-
- def __init__(self, vocab_size, num_layers=2, units=128, hidden_size=2048, num_heads=4,
- activation='approx_gelu', two_stream: bool = False, scaled=True, dropout=0.0,
- attention_dropout=0.0, use_residual=True, clamp_len: typing.Optional[int] = None,
- use_decoder=True, tie_decoder_weight=True, weight_initializer=None,
- bias_initializer='zeros', prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- assert units % num_heads == 0, 'In TransformerDecoder, the units should be divided ' \
- 'exactly by the number of heads. Received units={}, ' \
- 'num_heads={}'.format(units, num_heads)
-
- self._num_layers = num_layers
- self._units = units
- self._hidden_size = hidden_size
- self._num_heads = num_heads
- self._two_stream = two_stream
- assert not two_stream, 'Not yet implemented.'
- self._dropout = dropout
- self._use_residual = use_residual
- self._clamp_len = clamp_len
- with self.name_scope():
- self.word_embed = nn.Embedding(vocab_size, units)
- self.mask_embed = self.params.get('mask_embed', shape=(1, 1, units))
- self.pos_embed = PositionalEmbedding(units)
- if dropout:
- self.dropout_layer = nn.Dropout(rate=dropout)
-
- self.transformer_cells = nn.HybridSequential()
- for i in range(num_layers):
- attention_cell = RelativeSegmentEmbeddingPositionalEmbeddingMultiHeadAttentionCell(
- d_head=units // num_heads, num_heads=num_heads, scaled=scaled,
- dropout=attention_dropout)
- self.transformer_cells.add(
- XLNetCell(attention_cell=attention_cell, units=units, hidden_size=hidden_size,
- num_heads=num_heads, activation=activation, layer_norm_eps=1e-12,
- weight_initializer=weight_initializer,
- bias_initializer=bias_initializer, dropout=dropout, scaled=scaled,
- use_residual=use_residual, prefix='transformer%d_' % i))
- if use_decoder:
- self.decoder = nn.Dense(
- vocab_size, flatten=False,
- params=self.word_embed.params if tie_decoder_weight else None)
-
- def hybrid_forward(self, F, step_input, segments, mask, pos_seq, mems, mask_embed):
- # pylint: disable=arguments-differ
- """
- Parameters
- ----------
- step_input : Symbol or NDArray
- Input of shape [batch_size, query_length]
- segments : Symbol or NDArray
- One-hot vector indicating if a query-key pair is in the same
- segment or not. Shape [batch_size, query_length, query_length +
- memory_length, 2]. `1` indicates that the pair is not in the same
- segment.
- mask : Symbol or NDArray
- Attention mask of shape (batch_size, length, length + mem_length)
- pos_seq : Symbol or NDArray
- Relative distances
- mems : List of NDArray or Symbol, optional
- Memory from previous forward passes containing
- `num_layers` `NDArray`s or `Symbol`s each of shape [batch_size,
- memory_length, units].
-
- Returns
- -------
- core_out : NDArray or Symbol
- For use_decoder=True, logits. Otherwise output of last layer.
- hids : List of NDArray or Symbol
- Stacking the output of each layer
- """
- if self._clamp_len:
- pos_seq = F.clip(pos_seq, a_min=0, a_max=self._clamp_len)
-
- # Use mask_embed in a no-op so the otherwise-unused parameter keeps HybridBlock happy
- core_out = F.broadcast_add(self.word_embed(step_input), 0 * mask_embed)
- pos_emb = self.pos_embed(pos_seq)
-
- if self._dropout:
- core_out = self.dropout_layer(core_out)
- pos_emb = self.dropout_layer(pos_emb)
-
- hids = []
- for i, layer in enumerate(self.transformer_cells):
- hids.append(core_out)
- mems_i = None if mems is None else mems[i]
- core_out, _ = layer(core_out, pos_emb, mems_i, mask, segments)
-
- if self._dropout:
- core_out = self.dropout_layer(core_out)
-
- if hasattr(self, 'decoder'):
- return self.decoder(core_out), hids
- return core_out, hids
-
- def begin_mems(self, batch_size, mem_len, context):
- mems = [
- mx.nd.zeros((batch_size, mem_len, self._units), ctx=context)
- for _ in range(len(self.transformer_cells))
- ]
- return mems
-
-
-class XLNet(mx.gluon.Block):
- """XLNet
-
- Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., & Le, Q. V.
- (2019). XLNet: Generalized Autoregressive Pretraining for Language
- Understanding. arXiv preprint arXiv:1906.08237.
-
- Parameters
- ----------
- attention_cell : None
- Argument reserved for later.
- vocab_size : int or None, default None
- The size of the vocabulary.
- num_layers : int
- units : int
- hidden_size : int
- number of units in the hidden layer of position-wise feed-forward networks
- num_heads : int
- Number of heads in multi-head attention
- activation
- Activation function used for the position-wise feed-forward networks
- two_stream
- If True, use Two-Stream Self-Attention. Typically set to True for
- pre-training and False during finetuning.
- scaled : bool
- Whether to scale the softmax input by the sqrt of the input dimension
- in multi-head attention
- dropout : float
- use_residual : bool
- use_decoder : bool, default True
- Whether to include the decoder for language model prediction.
- tie_decoder_weight : bool, default True
- Whether to tie the decoder weight with the input embeddings
- weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default None
- Prefix for name of `Block`s (and name of weight if params is `None`).
- params : ParameterDict or None
- Container for weight sharing between cells. Created if `None`.
-
- """
-
- def __init__(self, *args, **kwargs):
- prefix = kwargs.pop('prefix', None)
- params = kwargs.pop('params', None)
- super().__init__(prefix=prefix, params=params)
-
- with self.name_scope():
- self._net = _BaseXLNet(*args, **kwargs)
-
- def begin_mems(self, batch_size, mem_len, context):
- mems = [
- mx.nd.zeros((batch_size, mem_len, self._net._units), ctx=context)
- for _ in range(len(self._net.transformer_cells))
- ]
- return mems
-
- def forward(self, step_input, token_types, mems=None, mask=None): # pylint: disable=arguments-differ
- """
-
- Parameters
- ----------
- step_input : NDArray or Symbol
- Input of shape [batch_size, query_length]
- token_types : NDArray or Symbol
- Token types of the input tokens of shape [batch_size,
- query_length], indicating various portions of the inputs.
- mems : List of NDArray or Symbol, optional
- Optional memory from previous forward passes containing
- `num_layers` `NDArray`s or `Symbol`s each of shape [batch_size,
- memory_length, units].
- mask : Symbol or NDArray
- Attention mask of shape (batch_size, length, length + mem_length)
-
- Returns
- -------
- output : NDArray or Symbol
- For XLNet(..., use_decoder=True), logits. Otherwise output of last
- XLNetCell layer.
- mems : List of NDArray or Symbol
- List containing `num_layers` `NDArray`s or `Symbol`s each of shape
- [batch_size, mem_len, units] representing the memory states at
- the entry of each layer.
-
- """
- # Uses same number of unmasked memory steps for every step
- batch_size, qlen = step_input.shape[:2]
- mlen = mems[0].shape[1] if mems is not None else 0
- klen = qlen + mlen
- segments = None
- if token_types is not None:
- if mlen > 0:
- mem_pad = mx.nd.zeros([batch_size, mlen], dtype=token_types.dtype,
- ctx=token_types.context)
- mem_pad_token_types = mx.nd.concat(mem_pad, token_types, dim=1)
- else:
- mem_pad_token_types = token_types
- # `1` indicates not in the same segment; shape [batch_size, qlen, klen]
- segments = mx.nd.broadcast_not_equal(token_types.expand_dims(2),
- mem_pad_token_types.expand_dims(1))
- segments = mx.nd.one_hot(segments, 2, 1, 0)
-
-
- pos_seq = mx.nd.arange(start=klen, stop=-qlen, step=-1, ctx=step_input.context)
-
- if mask is None and self._net._active:
- # Hybridized _net does not support `None`-valued parameters
- mask = mx.nd.ones((batch_size, qlen, klen), ctx=step_input.context)
- output, hids = self._net(step_input, segments, mask, pos_seq, mems)
-
- # Update memory
- new_mems = None
- if mems is not None:
- new_mems = [
- # pylint: disable=invalid-sequence-index
- mx.nd.concat(mem_i, hid_i, dim=1)[:, -mem_i.shape[1]:].detach()
- for mem_i, hid_i in zip(mems, hids)
- ]
-
- return output, new_mems
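The relative segment encoding computed in XLNet.forward above can be reproduced with plain numpy; `1` marks query/key pairs whose token types differ, and the one-hot expansion adds the trailing dimension of size 2 expected by the attention cell (illustration only, without memory for simplicity):

import numpy as np

token_types = np.array([[0, 0, 1, 1]])   # (batch=1, qlen=4), mlen=0
diff_segment = (token_types[:, :, None] != token_types[:, None, :]).astype(int)
print(diff_segment[0])
# [[0 0 1 1]
#  [0 0 1 1]
#  [1 1 0 0]
#  [1 1 0 0]]
segments = np.eye(2)[diff_segment]       # one-hot, shape (1, 4, 4, 2)
print(segments.shape)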
diff --git a/scripts/language_model/transformer_xl.py b/scripts/language_model/transformer_xl.py
deleted file mode 100644
index 2592aadb85..0000000000
--- a/scripts/language_model/transformer_xl.py
+++ /dev/null
@@ -1,164 +0,0 @@
-"""Transformer-XL Language Model
-================================
-
-This example shows how to build a Transformer-XL language model with Gluon NLP
-Toolkit.
-
-@article{dai2019transformer,
- title = {Transformer-XL: Attentive language models beyond a fixed-length context},
- author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Cohen, William W
- and Carbonell, Jaime and Le, Quoc V and Salakhutdinov, Ruslan},
- journal = {arXiv preprint arXiv:1901.02860},
- year = {2019},
-}
-
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import itertools
-import math
-import sys
-import time
-
-import mxnet as mx
-import gluonnlp as nlp
-
-nlp.utils.check_version('0.8.0')
-
-def evaluate(data_iter):
- """Evaluate the model on the dataset."""
-
- total_L = mx.nd.zeros(shape=(1, ))
- ntotal = 0
-
- mems = model.begin_mems(args.eval_batch_size, args.mem_len, context=ctx)
- for i, (data, target) in enumerate(data_iter):
- data = data.T.as_in_context(ctx)
- target = target.T.as_in_context(ctx)
- L, mems, _ = model(data, target, mems) # Negative log likelihood of targets
- total_L += mx.nd.sum(L).as_in_context(mx.cpu())
- ntotal += L.size
- mx.nd.waitall() # Avoid OOM due to pushing data too fast
-
- if i % args.log_every == 0:
- current_loss = total_L.asscalar() / ntotal
- print('Iter {} evaluation loss {:.2f}, ppl {:.2f}, bpc {:.2f}'.format(
- i, current_loss, math.exp(current_loss), current_loss / math.log(2)))
-
- return total_L.asscalar() / ntotal
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='Transformer-XL Language Modeling.',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--dataset', type=str, required=True,
- choices=['wt103', 'text8', 'enwik8', 'lm1b'], help='Dataset name.')
- parser.add_argument('--split', type=str, default='test', choices=['valid', 'test'],
- help='Which split to evaluate')
- parser.add_argument('--parameter-file', type=str, default=None, required=True,
- help='File storing pre-trained parameters for the model.')
- parser.add_argument('--vocab-file', type=str, default=None, required=True,
- help='File storing nlp.Vocab corresponding to --parameter-file.')
-
- parser.add_argument('--mem-len', type=int, default=1600,
- help='length of the retained previous hidden states (memory length)')
- parser.add_argument('--bptt', type=int, default=128,
- help='The number of tokens per batch dimension per sample.')
- parser.add_argument('--clamp-len', type=int, default=1000,
- help='max positional embedding index')
-
- parser.add_argument('--log-every', type=int, default=10,
- help='Log every `--log-every` iterations.')
-
- # TODO: training not yet supported
- parser.add_argument('--eval-only', action='store_true', required=True,
- help='Only evaluate the trained model')
- parser.add_argument('--eval-batch-size', type=int, default=64,
- help='Batch size for evaluation.')
- parser.add_argument('--gpu', type=int, help='GPU id')
- args = parser.parse_args()
-
- start_time = time.time()
-
- # Model
- from transformer.model import get_model
- with open(args.vocab_file, 'r') as f:
- vocab = nlp.Vocab.from_json(f.read())
-
- ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
- model, vocab = get_model('transformerxl', vocab=vocab, dataset_name=args.dataset,
- clamp_len=args.clamp_len)
- model.initialize(ctx=ctx)
- model.load_parameters(args.parameter_file, ignore_extra=False)
- model.hybridize()
- print(model)
-
- # Data
- if args.dataset == 'wt103':
- val_dataset, test_dataset = [
- nlp.data.WikiText103(segment=segment, skip_empty=False, bos=vocab.bos_token,
- eos=vocab.eos_token) for segment in ['val', 'test']
- ]
- elif args.dataset == 'lm1b':
- # bos=vocab.eos_token is not a typo: tf uses [''] + symbols + ['']
- test_datasets = list(
- nlp.data.GBWStream(segment='test', skip_empty=True, bos=vocab.eos_token,
- eos=vocab.eos_token))
- assert len(test_datasets) == 1
- test_dataset = mx.gluon.data.SimpleDataset(
- list(itertools.chain.from_iterable(test_datasets[0])))
- val_dataset = None
- elif args.dataset == 'text8':
- dataset = nlp.data.Text8(max_sentence_length=None)
- chars = list(itertools.chain.from_iterable(list(w) + ['_'] for w in dataset[0]))
- num_test_chars = 5000000
- val_dataset = mx.gluon.data.SimpleDataset(chars[-2 * num_test_chars:-num_test_chars])
- test_dataset = mx.gluon.data.SimpleDataset(chars[-num_test_chars:])
- elif args.dataset == 'enwik8':
- val_dataset, test_dataset = [
- mx.gluon.data.SimpleDataset(
- list(itertools.chain.from_iterable(nlp.data.Enwik8(segment=segment))))
- for segment in ['val', 'test']
- ]
- else:
- print('Dataset unsupported by this script.')
- sys.exit(1)
-
- eval_batchify = nlp.data.batchify.CorpusBPTTBatchify(vocab, args.bptt, args.eval_batch_size,
- last_batch='discard')
-
- # Evaluate
- test_loss = None
- valid_loss = None
- if args.split in ('valid', 'all') and val_dataset is not None:
- val_data = eval_batchify(val_dataset)
- valid_loss = evaluate(val_data)
- if args.split in ('test', 'all') and test_dataset is not None:
- test_data = eval_batchify(test_dataset)
- test_loss = evaluate(test_data)
-
- if test_loss is not None:
- print('Best test loss {:.2f}, test ppl {:.2f}, test bpc {:.2f}'.format(
- test_loss, math.exp(test_loss), test_loss / math.log(2)))
- if valid_loss is not None:
- print('Best validation loss {:.2f}, val ppl {:.2f}, val bpc {:.2f}'.format(
- valid_loss, math.exp(valid_loss), valid_loss / math.log(2)))
-
- print('Total time cost {:.2f}s'.format(time.time() - start_time))
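The metrics printed by this script follow directly from the average negative log-likelihood in nats: perplexity is exp(loss) and bits-per-character is loss / ln(2). A quick numeric check with a hypothetical loss value:

import math

loss = 3.40  # hypothetical average negative log-likelihood (nats per token)
print('ppl {:.2f}, bpc {:.2f}'.format(math.exp(loss), loss / math.log(2)))
# ppl 29.96, bpc 4.91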
diff --git a/scripts/language_model/word_language_model.py b/scripts/language_model/word_language_model.py
deleted file mode 100644
index 12df344d79..0000000000
--- a/scripts/language_model/word_language_model.py
+++ /dev/null
@@ -1,474 +0,0 @@
-"""
-Word Language Model
-===================
-
-This example shows how to build a word-level language model on WikiText-2 with Gluon NLP Toolkit.
-By using the existing data pipeline tools and building blocks, the process is greatly simplified.
-
-We implement the AWD LSTM language model proposed in the following work.
-
-@article{merityRegOpt,
- title={{Regularizing and Optimizing LSTM Language Models}},
- author={Merity, Stephen and Keskar, Nitish Shirish and Socher, Richard},
- journal={ICLR},
- year={2018}
-}
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import time
-import math
-import os
-import sys
-import mxnet as mx
-from mxnet import gluon, autograd
-import gluonnlp as nlp
-
-curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.append(os.path.join(curr_path, '..', '..'))
-
-nlp.utils.check_version('0.7.0')
-
-parser = argparse.ArgumentParser(description=
- 'MXNet Autograd RNN/LSTM Language Model on Wikitext-2.')
-parser.add_argument('--model', type=str, default='lstm',
- help='type of recurrent net (rnn_tanh, rnn_relu, lstm, gru)')
-parser.add_argument('--emsize', type=int, default=400,
- help='size of word embeddings')
-parser.add_argument('--nhid', type=int, default=1150,
- help='number of hidden units per layer')
-parser.add_argument('--nlayers', type=int, default=3,
- help='number of layers')
-parser.add_argument('--lr', type=float, default=30,
- help='initial learning rate')
-parser.add_argument('--clip', type=float, default=0.25,
- help='gradient clipping')
-parser.add_argument('--epochs', type=int, default=750,
- help='upper epoch limit')
-parser.add_argument('--batch_size', type=int, default=80, metavar='N',
- help='batch size')
-parser.add_argument('--bptt', type=int, default=70,
- help='sequence length')
-parser.add_argument('--dropout', type=float, default=0.4,
- help='dropout applied to layers (0 = no dropout)')
-parser.add_argument('--dropout_h', type=float, default=0.2,
- help='dropout applied to hidden layer (0 = no dropout)')
-parser.add_argument('--dropout_i', type=float, default=0.65,
- help='dropout applied to input layer (0 = no dropout)')
-parser.add_argument('--dropout_e', type=float, default=0.1,
- help='dropout applied to embedding layer (0 = no dropout)')
-parser.add_argument('--weight_dropout', type=float, default=0.5,
- help='weight dropout applied to h2h weight matrix (0 = no weight dropout)')
-parser.add_argument('--tied', action='store_true',
- help='tie the word embedding and softmax weights')
-parser.add_argument('--log-interval', type=int, default=200, metavar='N',
- help='report interval')
-parser.add_argument('--save', type=str, default='model.params',
- help='path to save the final model')
-parser.add_argument('--eval_only', action='store_true',
- help='Whether to only evaluate the trained model')
-parser.add_argument('--gpu', type=str, help='single gpu id')
-parser.add_argument('--optimizer', type=str, default='sgd',
- help='optimizer to use (sgd, adam)')
-parser.add_argument('--wd', type=float, default=1.2e-6,
- help='weight decay applied to all weights')
-parser.add_argument('--alpha', type=float, default=2,
- help='alpha L2 regularization on RNN activation '
- '(alpha = 0 means no regularization)')
-parser.add_argument('--beta', type=float, default=1,
- help='beta slowness regularization applied on RNN activation '
- '(beta = 0 means no regularization)')
-parser.add_argument('--ntasgd', action='store_true',
- help='Whether to apply ntasgd')
-parser.add_argument('--test_mode', action='store_true',
- help='Whether to run through the script with few examples')
-parser.add_argument('--lr_update_interval', type=int, default=30,
- help='lr update interval')
-parser.add_argument('--lr_update_factor', type=float, default=0.1,
- help='lr update factor')
-args = parser.parse_args()
-
-###############################################################################
-# Load data
-###############################################################################
-
-context = [mx.cpu()] if not args.gpu else [mx.gpu(int(args.gpu))]
-
-assert args.batch_size % len(context) == 0, \
- 'Total batch size must be multiple of the number of devices'
-
-assert args.weight_dropout > 0 or (args.weight_dropout == 0 and args.alpha == 0), \
- 'The alpha L2 regularization cannot be used with standard RNN, please set alpha to 0'
-
-train_dataset, val_dataset, test_dataset = \
- [nlp.data.WikiText2(segment=segment,
- skip_empty=False, bos=None, eos='<eos>')
- for segment in ['train', 'val', 'test']]
-
-vocab = nlp.Vocab(counter=nlp.data.Counter(train_dataset), padding_token=None, bos_token=None)
-train_batchify = nlp.data.batchify.CorpusBatchify(vocab, args.batch_size)
-train_data = train_batchify(train_dataset)
-val_batch_size = 10
-val_batchify = nlp.data.batchify.CorpusBatchify(vocab, val_batch_size)
-val_data = val_batchify(val_dataset)
-test_batch_size = 1
-test_batchify = nlp.data.batchify.CorpusBatchify(vocab, test_batch_size)
-test_data = test_batchify(test_dataset)
-
-if args.test_mode:
- args.emsize = 200
- args.nhid = 200
- args.nlayers = 1
- args.epochs = 3
- train_data = train_data[0:100]
- val_data = val_data[0:100]
- test_data = test_data[0:100]
-
-print(args)
-
-###############################################################################
-# Build the model
-###############################################################################
-
-ntokens = len(vocab)
-
-if args.weight_dropout > 0:
- print('Use AWDRNN')
- model_eval = nlp.model.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers,
- args.tied, args.dropout, args.weight_dropout,
- args.dropout_h, args.dropout_i, args.dropout_e)
- model = nlp.model.train.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers,
- args.tied, args.dropout, args.weight_dropout,
- args.dropout_h, args.dropout_i, args.dropout_e)
-else:
- model_eval = nlp.model.StandardRNN(args.model, len(vocab), args.emsize,
- args.nhid, args.nlayers, args.dropout, args.tied)
- model = nlp.model.train.StandardRNN(args.model, len(vocab), args.emsize,
- args.nhid, args.nlayers, args.dropout, args.tied)
-
-model.initialize(mx.init.Xavier(), ctx=context)
-
-model.hybridize(static_alloc=True)
-
-print(model)
-
-
-if args.optimizer == 'sgd':
- trainer_params = {'learning_rate': args.lr,
- 'momentum': 0,
- 'wd': args.wd}
-elif args.optimizer == 'adam':
- trainer_params = {'learning_rate': args.lr,
- 'wd': args.wd,
- 'beta1': 0,
- 'beta2': 0.999,
- 'epsilon': 1e-9}
-
-trainer = gluon.Trainer(model.collect_params(), args.optimizer, trainer_params,
- update_on_kvstore=False)
-
-loss = gluon.loss.SoftmaxCrossEntropyLoss()
-
-
-class JointActivationRegularizationLoss(gluon.loss.Loss):
- r"""Computes Joint Regularization Loss with standard loss.
-
- The activation regularization refers to
- gluonnlp.loss.ActivationRegularizationLoss.
-
- The temporal activation regularization refers to
- gluonnlp.loss.TemporalActivationRegularizationLoss.
-
- Parameters
- ----------
- loss : gluon.loss.Loss
- The standard loss
- alpha: float
- The activation regularization parameter in gluonnlp.loss.ActivationRegularizationLoss
- beta: float
- The temporal activation regularization parameter in
- gluonnlp.loss.TemporalActivationRegularizationLoss
-
- Inputs:
- - **out**: NDArray
- output tensor with shape `(sequence_length, batch_size, input_size)`
- when `layout` is "TNC".
- - **target**: NDArray
- target tensor with shape `(sequence_length, batch_size)`
- when `layout` is "TNC".
- - **states**: the stack outputs from RNN,
- which consists of output from each time step (TNC).
- - **dropped_states**: the stack outputs from RNN with dropout,
- which consists of output from each time step (TNC).
-
- Outputs:
- - **loss**: loss tensor with shape (batch_size,). Dimensions other than
- batch_axis are averaged out.
- """
-
- def __init__(self, l, alpha, beta, weight=None, batch_axis=None, **kwargs):
- super(JointActivationRegularizationLoss, self).__init__(weight, batch_axis, **kwargs)
- self._loss = l
- self._alpha, self._beta = alpha, beta
- if alpha:
- self._ar_loss = nlp.loss.ActivationRegularizationLoss(alpha)
- if beta:
- self._tar_loss = nlp.loss.TemporalActivationRegularizationLoss(beta)
-
- def __repr__(self):
- s = 'JointActivationTemporalActivationRegularizationLoss'
- return s
-
- def hybrid_forward(self, F, out, target, states, dropped_states): # pylint: disable=arguments-differ
- # pylint: disable=unused-argument
- l = self._loss(out.reshape(-3, -1), target.reshape(-1,))
- if self._alpha:
- l = l + self._ar_loss(*dropped_states)
- if self._beta:
- l = l + self._tar_loss(*states)
- return l
-
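The two regularizers combined by this class follow the AWD-LSTM recipe: activation regularization (AR) penalizes large activations of the dropout-masked RNN outputs, while temporal activation regularization (TAR) penalizes large changes between consecutive hidden states. A rough numpy sketch of the two terms (illustration only; shapes are arbitrary and the exact reduction used by gluonnlp.loss may differ):

import numpy as np

alpha, beta = 2.0, 1.0
states = np.random.randn(7, 4, 16)                         # (T, N, C) RNN outputs
dropped = states * (np.random.rand(*states.shape) > 0.4)   # dropout-masked outputs

ar = alpha * np.mean(dropped ** 2)                         # activation regularization
tar = beta * np.mean((states[1:] - states[:-1]) ** 2)      # temporal activation regularization
print(ar, tar)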
-
-joint_loss = JointActivationRegularizationLoss(loss, args.alpha, args.beta)
-
-###############################################################################
-# Training code
-###############################################################################
-
-
-def detach(hidden):
- """Transfer hidden states into new states, to detach them from the history.
- Parameters
- ----------
- hidden : NDArray
- The hidden states
- Returns
- ----------
- hidden: NDArray
- The detached hidden states
- """
- if isinstance(hidden, (tuple, list)):
- hidden = [detach(h) for h in hidden]
- else:
- hidden = hidden.detach()
- return hidden
-
-
-def get_batch(data_source, i, seq_len=None):
- """Get mini-batches of the dataset.
-
- Parameters
- ----------
- data_source : NDArray
- The dataset the model is evaluated on.
- i : int
- The index of the batch, starting from 0.
- seq_len : int
- The length of each sample in the batch.
-
- Returns
- -------
- data: NDArray
- The context
- target: NDArray
- The words to predict
- """
- seq_len = min(seq_len if seq_len else args.bptt, len(data_source) - 1 - i)
- data = data_source[i:i+seq_len]
- target = data_source[i+1:i+1+seq_len]
- return data, target
-
-
-def evaluate(data_source, batch_size, params_file_name, ctx=None):
- """Evaluate the model on the dataset.
-
- Parameters
- ----------
- data_source : NDArray
- The dataset is evaluated on.
- batch_size : int
- The size of the mini-batch.
- params_file_name : str
- The parameter file to use to evaluate,
- e.g., val.params or args.save
- ctx : mx.cpu() or mx.gpu()
- The context of the computation.
-
- Returns
- -------
- loss: float
- The loss on the dataset
- """
-
- total_L = 0.0
- ntotal = 0
-
- model_eval.load_parameters(params_file_name, context)
-
- hidden = model_eval.begin_state(batch_size=batch_size, func=mx.nd.zeros, ctx=context[0])
- i = 0
- while i < len(data_source) - 1 - 1:
- data, target = get_batch(data_source, i, seq_len=args.bptt)
- data = data.as_in_context(ctx)
- target = target.as_in_context(ctx)
- output, hidden = model_eval(data, hidden)
- hidden = detach(hidden)
- L = loss(output.reshape(-3, -1),
- target.reshape(-1,))
- total_L += mx.nd.sum(L).asscalar()
- ntotal += L.size
- i += args.bptt
- return total_L / ntotal
-
-
-def train():
- """Training loop for awd language model.
-
- """
- ntasgd = False
- best_val = float('Inf')
- start_train_time = time.time()
- parameters = model.collect_params()
- param_dict_avg = None
- t = 0
- avg_trigger = 0
- n = 5
- valid_losses = []
- for epoch in range(args.epochs):
- total_L = 0.0
- start_epoch_time = time.time()
- start_log_interval_time = time.time()
- hiddens = [model.begin_state(args.batch_size//len(context),
- func=mx.nd.zeros, ctx=ctx) for ctx in context]
- batch_i, i = 0, 0
- while i < len(train_data) - 1 - 1:
- bptt = args.bptt if mx.nd.random.uniform().asscalar() < 0.95 else args.bptt / 2
- seq_len = max(5, int(mx.nd.random.normal(bptt, 5).asscalar()))
- lr_batch_start = trainer.learning_rate
- trainer.set_learning_rate(lr_batch_start*seq_len/args.bptt)
-
- data, target = get_batch(train_data, i, seq_len=seq_len)
- data_list = gluon.utils.split_and_load(data, context, batch_axis=1, even_split=True)
- target_list = gluon.utils.split_and_load(target, context, batch_axis=1, even_split=True)
- hiddens = detach(hiddens)
- Ls = []
- with autograd.record():
- for j, (X, y, h) in enumerate(zip(data_list, target_list, hiddens)):
- output, h, encoder_hs, dropped_encoder_hs = model(X, h)
- l = joint_loss(output, y, encoder_hs, dropped_encoder_hs)
- Ls.append(l / (len(context) * X.size))
- hiddens[j] = h
- for L in Ls:
- L.backward()
-
- grads = [p.grad(d.context) for p in parameters.values() for d in data_list]
- gluon.utils.clip_global_norm(grads, args.clip)
-
- if args.ntasgd and ntasgd:
- if param_dict_avg is None:
- param_dict_avg = {k.split(model._prefix)[1]: v.data(context[0]).copy()
- for k, v in parameters.items()}
-
- trainer.step(1)
-
- if args.ntasgd and ntasgd:
- gamma = 1.0 / max(1, epoch * (len(train_data) // args.bptt)
- + batch_i - avg_trigger + 2)
- for name, param_avg in param_dict_avg.items():
- param_avg[:] += gamma * (parameters['{}{}'.format(model._prefix, name)]
- .data(context[0]) - param_avg)
-
- total_L += sum([mx.nd.sum(L).asscalar() for L in Ls])
- trainer.set_learning_rate(lr_batch_start)
-
- if batch_i % args.log_interval == 0 and batch_i > 0:
- cur_L = total_L / args.log_interval
- print('[Epoch %d Batch %d/%d] current loss %.2f, ppl %.2f, '
- 'throughput %.2f samples/s, lr %.2f'
- % (epoch, batch_i, len(train_data) // args.bptt, cur_L, math.exp(cur_L),
- args.batch_size * args.log_interval
- / (time.time() - start_log_interval_time),
- lr_batch_start * seq_len / args.bptt))
- total_L = 0.0
- start_log_interval_time = time.time()
- i += seq_len
- batch_i += 1
-
- mx.nd.waitall()
-
- print('[Epoch %d] throughput %.2f samples/s' % (
- epoch, (args.batch_size * len(train_data)) / (time.time() - start_epoch_time)))
-
- if args.ntasgd and ntasgd:
- mx.nd.save('{}.val.params'.format(args.save), param_dict_avg)
- else:
- model.save_parameters('{}.val.params'.format(args.save))
- val_L = evaluate(val_data, val_batch_size, '{}.val.params'.format(args.save), context[0])
- print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f, lr %.2f' % (
- epoch, time.time() - start_epoch_time, val_L, math.exp(val_L),
- trainer.learning_rate))
-
- if args.ntasgd and avg_trigger == 0:
- if t > n and val_L > min(valid_losses[-n:]):
- if param_dict_avg is None:
- param_dict_avg = {k.split(model._prefix)[1]: v.data(context[0]).copy()
- for k, v in parameters.items()}
- else:
- for k, v in parameters.items():
- param_dict_avg[k.split(model._prefix)[1]] \
- = v.data(context[0]).copy()
- avg_trigger = epoch * (len(train_data) // args.bptt) + len(train_data) // args.bptt
- print('Switching to NTASGD and avg_trigger is : %d' % avg_trigger)
- ntasgd = True
- valid_losses.append(val_L)
- t += 1
-
- if val_L < best_val:
- update_lr_epoch = 0
- best_val = val_L
- if args.ntasgd and ntasgd:
- mx.nd.save(args.save, param_dict_avg)
- else:
- model.save_parameters(args.save)
- test_L = evaluate(test_data, test_batch_size, args.save, context[0])
- print('[Epoch %d] test loss %.2f, test ppl %.2f'
- % (epoch, test_L, math.exp(test_L)))
- else:
- update_lr_epoch += 1
- if update_lr_epoch % args.lr_update_interval == 0 and update_lr_epoch != 0:
- lr_scale = trainer.learning_rate * args.lr_update_factor
- print('Learning rate after interval update %f' % lr_scale)
- trainer.set_learning_rate(lr_scale)
- update_lr_epoch = 0
-
- print('Total training throughput %.2f samples/s'
- % ((args.batch_size * len(train_data) * args.epochs) / (time.time() - start_train_time)))
-
-
-if __name__ == '__main__':
- start_pipeline_time = time.time()
- if not args.eval_only:
- train()
- model.load_parameters(args.save, context)
- final_val_L = evaluate(val_data, val_batch_size, args.save, context[0])
- final_test_L = evaluate(test_data, test_batch_size, args.save, context[0])
- print('Best validation loss %.2f, val ppl %.2f' % (final_val_L, math.exp(final_val_L)))
- print('Best test loss %.2f, test ppl %.2f' % (final_test_L, math.exp(final_test_L)))
- print('Total time cost %.2fs' % (time.time()-start_pipeline_time))
diff --git a/scripts/language_model/xlnet_qa_evaluate.py b/scripts/language_model/xlnet_qa_evaluate.py
deleted file mode 100644
index 3421192d1a..0000000000
--- a/scripts/language_model/xlnet_qa_evaluate.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors, Allenai and DMLC.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""XLNet SQuAD evaluate."""
-
-from collections import namedtuple, OrderedDict
-
-from mxnet import nd
-
-_PrelimPrediction = namedtuple( # pylint: disable=invalid-name
- 'PrelimPrediction', [
- 'feature_id', 'start_index', 'end_index', 'start_log_prob',
- 'end_log_prob'
- ])
-
-_NbestPrediction = namedtuple( # pylint: disable=invalid-name
- 'NbestPrediction', ['text', 'start_log_prob', 'end_log_prob'])
-
-
-def predict_extended(features,
- results,
- n_best_size,
- max_answer_length=64,
- start_n_top=5,
- end_n_top=5):
- """Get prediction results for XLNet.
-
- Parameters
- ----------
- features : list of SQuADFeature
- List of squad features for the example.
- results : list of data.qa.PredResult
- List of model predictions for span start and span end.
- tokenizer: callable
- Tokenizer function.
- max_answer_length: int, default 64
- Maximum length of the answer tokens.
- null_score_diff_threshold: float, default 0.0
- If null_score - best_non_null is greater than the threshold predict null.
- n_best_size: int, default 10
- The total number of n-best predictions.
- version_2: bool, default False
- If true, the SQuAD examples contain some that do not have an answer.
-
- Returns
- -------
- prediction: str
- The final prediction.
- nbest : list of (str, float)
- n-best predictions with their probabilities.
- """
-
- prelim_predictions = []
- score_null = 1000000 # large and positive
- for features_id, (result, feature) in enumerate(zip(results, features)):
- cur_null_score = result.cls_logits[0]
- score_null = min(score_null, cur_null_score)
- for i in range(start_n_top):
- for j in range(end_n_top):
- start_log_prob = result.start_top_log_probs[i]
- start_index = int(result.start_top_index[i])
- j_index = j * end_n_top + i
- end_log_prob = result.end_top_log_probs[j_index]
- end_index = int(result.end_top_index[j_index])
- # We could hypothetically create invalid predictions, e.g., predict
- # that the start of the span is in the question. We throw out all
- # invalid predictions.
- if start_index >= feature.paragraph_len - 1:
- continue
- if end_index >= feature.paragraph_len - 1:
- continue
-
- if not feature.token_is_max_context.get(start_index, False):
- continue
- if end_index < start_index:
- continue
- length = end_index - start_index + 1
- if length > max_answer_length:
- continue
- prelim_predictions.append(
- _PrelimPrediction(feature_id=features_id,
- start_index=start_index,
- end_index=end_index,
- start_log_prob=start_log_prob,
- end_log_prob=end_log_prob))
-
- prelim_predictions = sorted(prelim_predictions,
- key=lambda x:
- (x.start_log_prob + x.end_log_prob),
- reverse=True)
-
- seen_predictions = {}
- nbest = []
- for pred in prelim_predictions:
- if len(nbest) >= n_best_size:
- break
- feature = features[pred.feature_id]
- tok_start_to_orig_index = feature.tok_start_to_orig_index
- tok_end_to_orig_index = feature.tok_end_to_orig_index
- start_orig_pos = tok_start_to_orig_index[pred.start_index]
- end_orig_pos = tok_end_to_orig_index[pred.end_index]
-
- paragraph_text = feature.paragraph_text
- final_text = paragraph_text[start_orig_pos:end_orig_pos + 1].strip()
- if final_text in seen_predictions:
- continue
- seen_predictions[final_text] = True
- nbest.append(
- _NbestPrediction(text=final_text,
- start_log_prob=pred.start_log_prob,
- end_log_prob=pred.end_log_prob))
-
- # In very rare edge cases we could have no valid predictions. So we
- # just create a nonce prediction in this case to avoid failure.
- if not nbest:
- nbest.append(
- _NbestPrediction(text='', start_log_prob=-1e6, end_log_prob=-1e6))
-
- assert len(nbest) >= 1
-
- total_scores = []
- best_non_null_entry = None
- for entry in nbest:
- total_scores.append(entry.start_log_prob + entry.end_log_prob)
- if not best_non_null_entry:
- best_non_null_entry = entry
- probs = nd.softmax(nd.array(total_scores)).asnumpy()
-
- nbest_json = []
-
- for (i, entry) in enumerate(nbest):
- output = OrderedDict()
- output['text'] = entry.text
- output['probability'] = float(probs[i])
- output['start_log_prob'] = float(entry.start_log_prob)
- output['end_log_prob'] = float(entry.end_log_prob)
- nbest_json.append(output)
-
- assert len(nbest_json) >= 1
- assert best_non_null_entry is not None
- score_diff = score_null
- return score_diff, best_non_null_entry.text, nbest_json
diff --git a/scripts/machine_translation/README.md b/scripts/machine_translation/README.md
new file mode 100644
index 0000000000..402e6272eb
--- /dev/null
+++ b/scripts/machine_translation/README.md
@@ -0,0 +1,172 @@
+# Machine Translation
+
+## Train a Transformer from scratch
+First, use the script described in [datasets/machine_translation](../datasets/machine_translation)
+to generate the dataset. Then, run `train_transformer.py` to train the model.
+
+In the following, we give the training commands for the WMT2014 EN-DE task with the yttm tokenizer.
+You may first run the following command in [datasets/machine_translation](../datasets/machine_translation).
+```bash
+bash wmt2014_ende.sh yttm
+```
+
+Then, you can run the experiments below.
+For the "transformer_base" configuration:
+
+```bash
+SUBWORD_ALGO=yttm
+SUBWORD_MODEL=${SUBWORD_ALGO}
+SRC=en
+TGT=de
+datapath=../datasets/machine_translation
+python3 train_transformer.py \
+ --train_src_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${SRC} \
+ --train_tgt_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${TGT} \
+ --dev_src_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \
+ --dev_tgt_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --save_dir transformer_base_wmt2014_en_de_${SUBWORD_ALGO} \
+ --cfg transformer_base \
+ --lr 0.002 \
+ --sampler BoundedBudgetSampler \
+ --max_num_tokens 2700 \
+ --max_update 15000 \
+ --save_interval_update 500 \
+ --warmup_steps 6000 \
+ --warmup_init_lr 0.0 \
+ --seed 123 \
+ --gpus 0,1,2,3
+```
+
+Or train with Horovod:
+```bash
+horovodrun -np 4 -H localhost:4 python3 train_transformer.py \
+ --comm_backend horovod \
+ --train_src_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${SRC} \
+ --train_tgt_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${TGT} \
+ --dev_src_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \
+ --dev_tgt_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --save_dir transformer_base_wmt2014_en_de_${SUBWORD_ALGO} \
+ --cfg transformer_base \
+ --lr 0.002 \
+ --sampler BoundedBudgetSampler \
+ --max_num_tokens 2700 \
+ --max_update 15000 \
+ --save_interval_update 500 \
+ --warmup_steps 6000 \
+ --warmup_init_lr 0.0 \
+ --seed 123 \
+ --gpus 0,1,2,3
+```
+
+Use the `gluon_average_checkpoint` CLI to average the last 10 checkpoints; a minimal sketch of what the averaging step does follows the command.
+
+```bash
+gluon_average_checkpoint --checkpoints transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/epoch*.params \
+ --begin 30 \
+ --end 39 \
+ --save-path transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/epoch_avg_30_39.params
+```
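+
+The `gluon_average_checkpoint` CLI essentially takes the element-wise mean of the selected
+parameter files. The following is only an illustrative sketch of that idea (it assumes plain
+MXNet `.params` files and is not the actual CLI implementation):
+
+```python
+import mxnet as mx
+
+def average_checkpoints(paths, save_path):
+    """Element-wise average of the parameters stored in several .params files (sketch)."""
+    avg = None
+    for path in paths:
+        params = mx.nd.load(path)  # dict: parameter name -> NDArray
+        if avg is None:
+            # copy (and upcast) the first checkpoint as the running sum
+            avg = {k: v.astype('float32') for k, v in params.items()}
+        else:
+            for k, v in params.items():
+                avg[k] += v.astype('float32')
+    # divide the running sum by the number of averaged checkpoints
+    avg = {k: v / len(paths) for k, v in avg.items()}
+    mx.nd.save(save_path, avg)
+
+# e.g. average_checkpoints(paths=[...last 10 checkpoint paths...],
+#                          save_path='epoch_avg_30_39.params')
+```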
+
+Use the following command to run inference/evaluation with the Transformer model:
+
+```bash
+python3 evaluate_transformer.py \
+ --param_path transformer_base_wmt2014_en_de_${SUBWORD_MODEL}/epoch_avg_30_39.params \
+ --src_lang en \
+ --tgt_lang de \
+ --cfg transformer_base_wmt2014_en_de_${SUBWORD_MODEL}/config.yml \
+ --src_tokenizer ${SUBWORD_MODEL} \
+ --tgt_tokenizer ${SUBWORD_MODEL} \
+ --src_subword_model_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.model \
+ --tgt_subword_model_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.model \
+ --src_vocab_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.vocab \
+ --tgt_vocab_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.vocab \
+ --src_corpus ../datasets/machine_translation/wmt2014_ende/test.raw.en \
+ --tgt_corpus ../datasets/machine_translation/wmt2014_ende/test.raw.de
+```
+
+
+
+For the "transformer_wmt_en_de_big" configuration:
+
+```bash
+SUBWORD_ALGO=yttm
+SUBWORD_MODEL=${SUBWORD_ALGO}
+SRC=en
+TGT=de
+datapath=../datasets/machine_translation
+python3 train_transformer.py \
+ --train_src_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${SRC} \
+ --train_tgt_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${TGT} \
+ --dev_src_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \
+ --dev_tgt_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --save_dir transformer_big_wmt2014_en_de_${SUBWORD_ALGO} \
+ --cfg transformer_wmt_en_de_big \
+ --lr 0.001 \
+ --sampler BoundedBudgetSampler \
+ --max_num_tokens 3584 \
+ --max_update 15000 \
+ --warmup_steps 4000 \
+ --warmup_init_lr 0.0 \
+ --seed 123 \
+ --gpus 0,1,2,3
+```
+
+Use the `gluon_average_checkpoint` CLI to average the last 10 checkpoints:
+
+```bash
+gluon_average_checkpoint --checkpoints transformer_big_wmt2014_en_de_${SUBWORD_ALGO}/update*.params \
+ --begin 21 \
+ --end 30 \
+ --save-path transformer_big_wmt2014_en_de_${SUBWORD_ALGO}/avg_21_30.params
+```
+
+
+Use the following command to run inference/evaluation with the Transformer model:
+
+```bash
+python3 evaluate_transformer.py \
+    --param_path transformer_big_wmt2014_en_de_${SUBWORD_MODEL}/avg_21_30.params \
+ --src_lang en \
+ --tgt_lang de \
+ --cfg transformer_big_wmt2014_en_de_${SUBWORD_MODEL}/config.yml \
+ --src_tokenizer ${SUBWORD_MODEL} \
+ --tgt_tokenizer ${SUBWORD_MODEL} \
+ --src_subword_model_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.model \
+ --tgt_subword_model_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.model \
+ --src_vocab_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.vocab \
+ --tgt_vocab_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.vocab \
+ --src_corpus ../datasets/machine_translation/wmt2014_ende/test.raw.en \
+ --tgt_corpus ../datasets/machine_translation/wmt2014_ende/test.raw.de
+```
+
+
+Test BLEU scores with 3 seeds (evaluated via sacreBLEU):
+
+- transformer_base
+
+(test BLEU / valid BLEU)
+
+| Subword Model | #Params | Seed = 123 | Seed = 1234 | Seed = 12345 | Mean±std |
+|---------------|------------|-------------|-------------|--------------|-------------|
+| yttm | | 26.50/26.29 | - | - | - |
+| hf_bpe | | - | - | - | - |
+| spm | | - | - | - | - |
+
+- transformer_wmt_en_de_big
+
+(test BLEU / valid BLEU)
+
+| Subword Model | #Params | Seed = 123 | Seed = 1234 | Seed = 12345 | Mean±std |
+|---------------|------------|-------------|-------------|--------------|-------------|
+| yttm | | 27.93/26.82 | - | - | - |
+| hf_bpe | | - | - | - | - |
+| spm | | - | - | - | - |
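+
+For reference, the sacreBLEU score can also be recomputed offline from the files that
+`evaluate_transformer.py` writes to `--save_dir` (`pred_sentences.txt` and `gt_sentences.txt`),
+using the same `sacrebleu` API call as the script. A minimal sketch (the `save_dir` value below
+is a placeholder for your own evaluation directory):
+
+```python
+import sacrebleu
+
+save_dir = 'transformer_base_wmt2014_en_de_yttm/epoch_avg_30_39_evaluation'  # placeholder path
+
+def read_lines(path):
+    with open(path, encoding='utf-8') as f:
+        return [line.rstrip('\n') for line in f]
+
+pred_lines = read_lines(save_dir + '/pred_sentences.txt')  # system outputs, one per line
+ref_lines = read_lines(save_dir + '/gt_sentences.txt')     # references, one per line
+bleu = sacrebleu.corpus_bleu(sys_stream=pred_lines, ref_streams=[ref_lines])
+print(bleu.score)
+```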
diff --git a/scripts/machine_translation/__init__.py b/scripts/machine_translation/__init__.py
index 4c7a3827b3..e69de29bb2 100644
--- a/scripts/machine_translation/__init__.py
+++ b/scripts/machine_translation/__init__.py
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""NMT example."""
-from . import _constants, bleu, dataset, \
- gnmt, translation, utils
diff --git a/scripts/machine_translation/_constants.py b/scripts/machine_translation/_constants.py
deleted file mode 100644
index a3d996d240..0000000000
--- a/scripts/machine_translation/_constants.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constants used in the NMT examples."""
-import os
-
-__all__ = ['CACHE_PATH']
-
-CACHE_PATH = os.path.realpath(os.path.join(os.path.realpath(__file__), '..', 'cached'))
diff --git a/scripts/machine_translation/bleu.py b/scripts/machine_translation/bleu.py
deleted file mode 100644
index 2a0c820ccd..0000000000
--- a/scripts/machine_translation/bleu.py
+++ /dev/null
@@ -1,352 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""BLEU."""
-import sys
-import re
-import math
-import unicodedata
-from collections import Counter
-import six
-LIST_TYPES = (list, tuple)
-
-__all__ = ['compute_bleu']
-
-
-def _ngrams(segment, n):
- """Extracts n-grams from an input segment.
-
- Parameters
- ----------
- segment: list
- Text segment from which n-grams will be extracted.
- n: int
- Order of n-gram.
-
- Returns
- -------
- ngram_counts: Counter
- Contain all the nth n-grams in segment with a count of how many times each n-gram occurred.
- """
- ngram_counts = Counter()
- for i in range(0, len(segment) - n + 1):
- ngram = tuple(segment[i:i + n])
- ngram_counts[ngram] += 1
- return ngram_counts
-
-
-def _split_compound_word(segment):
- """Put compounds in ATAT format.
- rich-text format" --> rich ##AT##-##AT## text format.
- """
- return re.sub(r'(\S)-(\S)', '\\1 ##AT##-##AT## \\2', ' '.join(segment)).split()
-
-
-def _bpe_to_words(sentence, delimiter='@@'):
- """Convert a sequence of bpe words into sentence."""
- words = []
- word = ''
- delimiter_len = len(delimiter)
- for subwords in sentence:
- if len(subwords) >= delimiter_len and subwords[-delimiter_len:] == delimiter:
- word += subwords[:-delimiter_len]
- else:
- word += subwords
- words.append(word)
- word = ''
- return words
-
-
-def _tokenize_mteval_13a(segment):
- r"""
- Tokenizes a string following the tokenizer in mteval-v13a.pl.
- See https://github.com/moses-smt/mosesdecoder/"
- "blob/master/scripts/generic/mteval-v14.pl#L917-L942
- Parameters
- ----------
- segment: str
- A string to be tokenized
-
- Returns
- -------
- The tokenized string
- """
-
- norm = segment.rstrip()
-
- norm = norm.replace('', '')
- norm = norm.replace('-\n', '')
- norm = norm.replace('\n', ' ')
- norm = norm.replace('"', '"')
- norm = norm.replace('&', '&')
- norm = norm.replace('<', '<')
- norm = norm.replace('>', '>')
-
- norm = u' {} '.format(norm)
- norm = re.sub(r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', ' \\1 ', norm)
- norm = re.sub(r'([^0-9])([\.,])', '\\1 \\2 ', norm)
- norm = re.sub(r'([\.,])([^0-9])', ' \\1 \\2', norm)
- norm = re.sub(r'([0-9])(-)', '\\1 \\2 ', norm)
- norm = re.sub(r'\s+', ' ', norm)
- norm = re.sub(r'^\s+', '', norm)
- norm = re.sub(r'\s+$', '', norm)
-
- return norm
-
-
-class UnicodeRegex:
- """Ad-hoc hack to recognize all punctuation and symbols.
- """
- def __init__(self):
- punctuation = self._property_chars('P')
- self.nondigit_punct_re = re.compile(r'([^\d])([' + punctuation + r'])')
- self.punct_nondigit_re = re.compile(r'([' + punctuation + r'])([^\d])')
- self.symbol_re = re.compile('([' + self._property_chars('S') + '])')
-
- def _property_chars(self, prefix):
- return ''.join(six.unichr(x) for x in range(sys.maxunicode)
- if unicodedata.category(six.unichr(x)).startswith(prefix))
-
-
-unicodeRegex = UnicodeRegex()
-
-
-def _tokenize_mteval_v14_intl(segment):
-    r"""Tokenize a string following the international tokenizer in mteval-v14a.pl.
- See https://github.com/moses-smt/mosesdecoder/"
- "blob/master/scripts/generic/mteval-v14.pl#L954-L983
-
- Parameters
- ----------
- segment: str
- A string to be tokenized
-
- Returns
- -------
- The tokenized string
- """
- segment = segment.rstrip()
- segment = unicodeRegex.nondigit_punct_re.sub(r'\1 \2 ', segment)
- segment = unicodeRegex.punct_nondigit_re.sub(r' \1 \2', segment)
- segment = unicodeRegex.symbol_re.sub(r' \1 ', segment)
- return segment.strip()
-
-
-TOKENIZERS = {
- '13a': _tokenize_mteval_13a,
- 'intl': _tokenize_mteval_v14_intl,
- None: lambda x: x,
-}
-
-
-def compute_bleu(reference_corpus_list, translation_corpus, tokenized=True,
- tokenizer='13a', max_n=4, smooth=False, lower_case=False,
- bpe=False, split_compound_word=False):
- r"""Compute bleu score of translation against references.
-
- Parameters
- ----------
- reference_corpus_list: list of list(list(str)) or list of list(str)
- list of list(list(str)): tokenized references
- list of list(str): plain text
- List of references for each translation.
- translation_corpus: list(list(str)) or list(str)
- list(list(str)): tokenized translation
- list(str): plain text
- Translations to score.
- tokenized: bool, default True
- Whether the inputs has been tokenized.
- tokenizer: str or None, default '13a'
- '13a': follow the tokenizer in mteval-v13a.pl
- 'intl': follow the international tokenizer in mteval-v14.pl
- None: identity mapping on the string.
- This option is ignored if tokenized is True
- max_n: int, default 4
- Maximum n-gram order to use when computing BLEU score.
- smooth: bool, default False
- Whether or not to compute smoothed bleu score.
- lower_case: bool, default False
- Whether or not to use lower case of tokens
- split_compound_word: bool, default False
- Whether or not to split compound words
- "rich-text format" --> rich ##AT##-##AT## text format.
- bpe: bool, default False
- Whether or not the inputs are in BPE format
-
- Returns
- -------
- 5-Tuple with the BLEU score, n-gram precisions, brevity penalty,
- reference length, and translation length
- """
- precision_numerators = [0 for _ in range(max_n)]
- precision_denominators = [0 for _ in range(max_n)]
- ref_length, trans_length = 0, 0
- for references in reference_corpus_list:
- assert len(references) == len(translation_corpus), \
- 'The number of translations and their references do not match'
- if tokenized:
- assert isinstance(reference_corpus_list[0][0], LIST_TYPES) and \
- isinstance(translation_corpus[0], LIST_TYPES), \
- 'references and translation should have format of list of list(list(str)) ' \
- 'and list(list(str)), respectively, when tokenized is True.'
- else:
- assert isinstance(reference_corpus_list[0][0], six.string_types) and \
- isinstance(translation_corpus[0], six.string_types), \
- 'references and translation should have format of list(list(str)) ' \
- 'and list(str), respectively, when tokenized is False.'
- for references, translation in zip(zip(*reference_corpus_list), translation_corpus):
- if not tokenized:
- references = [TOKENIZERS[tokenizer](reference).split() for reference in references]
- translation = TOKENIZERS[tokenizer](translation).split()
- if bpe:
- references = [_bpe_to_words(reference) for reference in references]
- translation = _bpe_to_words(translation)
- if split_compound_word:
- references = [_split_compound_word(reference) for reference in references]
- translation = _split_compound_word(translation)
- if lower_case:
- references = [[w.lower() for w in reference] for reference in references]
- translation = [w.lower() for w in translation]
- trans_len = len(translation)
- trans_length += trans_len
- ref_length += _closest_ref_length(references, trans_len)
- for n in range(max_n):
- matches, candidates = _compute_precision(references, translation, n + 1)
- precision_numerators[n] += matches
- precision_denominators[n] += candidates
-
- precision_fractions = [(precision_numerators[n], precision_denominators[n])
- for n in range(max_n)]
- smooth_const = 0
- if smooth:
- smooth_const = 1
- precisions = _smoothing(precision_fractions, smooth_const)
- if min(precisions) > 0:
- precision_log_average = sum(math.log(p) for p in precisions) / max_n
- precision_exp_log_average = math.exp(precision_log_average)
- else:
- precision_exp_log_average = 0
-
- bp = _brevity_penalty(ref_length, trans_length)
- bleu = precision_exp_log_average*bp
-
- return bleu, precisions, bp, ref_length, trans_length
-
-
-def _compute_precision(references, translation, n):
- """Compute ngram precision.
-
- Parameters
- ----------
- references: list(list(str))
- A list of references.
- translation: list(str)
- A translation.
- n: int
- Order of n-gram.
-
- Returns
- -------
- matches: int
- Number of matched nth order n-grams
- candidates
- Number of possible nth order n-grams
- """
- matches = 0
- candidates = 0
- ref_ngram_counts = Counter()
-
- for reference in references:
- ref_ngram_counts |= _ngrams(reference, n)
- trans_ngram_counts = _ngrams(translation, n)
- overlap_ngram_counts = trans_ngram_counts & ref_ngram_counts
- matches += sum(overlap_ngram_counts.values())
- possible_matches = len(translation) - n + 1
- if possible_matches > 0:
- candidates += possible_matches
-
- return matches, candidates
-
-
-def _brevity_penalty(ref_length, trans_length):
- """Calculate brevity penalty.
-
- Parameters
- ----------
- ref_length: int
-        Sum of the closest reference lengths for all translations in the corpus.
-    trans_length: int
-        Sum of the lengths of all translations in the corpus.
-
- Returns
- -------
- bleu's brevity penalty: float
- """
- if trans_length > ref_length:
- return 1
- # If translation is empty, brevity penalty = 0 should result in BLEU = 0.0
- elif trans_length == 0:
- return 0
- else:
- return math.exp(1 - float(ref_length) / trans_length)
-
-
-def _closest_ref_length(references, trans_length):
- """Find the reference that has the closest length to the translation.
-
- Parameters
- ----------
- references: list(list(str))
- A list of references.
- trans_length: int
- Length of the translation.
-
- Returns
- -------
- closest_ref_len: int
- Length of the reference that is closest to the translation.
- """
- ref_lengths = (len(reference) for reference in references)
- closest_ref_len = min(ref_lengths,
- key=lambda ref_length: (abs(ref_length - trans_length), ref_length))
-
- return closest_ref_len
-
-
-def _smoothing(precision_fractions, c=1):
- """Compute the smoothed precision for all the orders.
-
- Parameters
- ----------
- precision_fractions: list(tuple)
- Contain a list of (precision_numerator, precision_denominator) pairs
- c: int, default 1
- Smoothing constant to use
-
- Returns
- -------
- ratios: list of floats
- Contain the smoothed precision_fractions.
- """
- ratios = [0] * len(precision_fractions)
- for i, precision_fraction in enumerate(precision_fractions):
- if precision_fraction[1] > 0:
- ratios[i] = float(precision_fraction[0] + c) / (precision_fraction[1] + c)
- else:
- ratios[i] = 0.0
-
- return ratios
diff --git a/scripts/machine_translation/dataprocessor.py b/scripts/machine_translation/dataprocessor.py
deleted file mode 100644
index e60989e2e1..0000000000
--- a/scripts/machine_translation/dataprocessor.py
+++ /dev/null
@@ -1,284 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Data preprocessing for transformer."""
-
-import os
-import io
-import time
-import logging
-import numpy as np
-from mxnet import gluon
-import gluonnlp as nlp
-import gluonnlp.data.batchify as btf
-import _constants
-import dataset as _dataset
-
-
-def _cache_dataset(dataset, prefix):
- """Cache the processed npy dataset the dataset into a npz
-
- Parameters
- ----------
- dataset : SimpleDataset
- file_path : str
- """
- if not os.path.exists(_constants.CACHE_PATH):
- os.makedirs(_constants.CACHE_PATH)
- src_data = np.concatenate([e[0] for e in dataset])
- tgt_data = np.concatenate([e[1] for e in dataset])
- src_cumlen = np.cumsum([0]+[len(e[0]) for e in dataset])
- tgt_cumlen = np.cumsum([0]+[len(e[1]) for e in dataset])
- np.savez(os.path.join(_constants.CACHE_PATH, prefix + '.npz'),
- src_data=src_data, tgt_data=tgt_data,
- src_cumlen=src_cumlen, tgt_cumlen=tgt_cumlen)
-
-
-def _load_cached_dataset(prefix):
- cached_file_path = os.path.join(_constants.CACHE_PATH, prefix + '.npz')
- if os.path.exists(cached_file_path):
- print('Loading dataset...')
- npz_data = np.load(cached_file_path)
- src_data, tgt_data, src_cumlen, tgt_cumlen = \
- [npz_data[n] for n in ['src_data', 'tgt_data', 'src_cumlen', 'tgt_cumlen']]
- src_data = np.array([src_data[low:high] for low, high
- in zip(src_cumlen[:-1], src_cumlen[1:])])
- tgt_data = np.array([tgt_data[low:high] for low, high
- in zip(tgt_cumlen[:-1], tgt_cumlen[1:])])
- return gluon.data.ArrayDataset(np.array(src_data), np.array(tgt_data))
- else:
- return None
-
-
-class TrainValDataTransform:
- """Transform the machine translation dataset.
-
- Clip source and the target sentences to the maximum length. For the source sentence, append the
- EOS. For the target sentence, append BOS and EOS.
-
- Parameters
- ----------
- src_vocab : Vocab
- tgt_vocab : Vocab
- src_max_len : int
- tgt_max_len : int
- """
-
- def __init__(self, src_vocab, tgt_vocab, src_max_len=None, tgt_max_len=None):
- self._src_vocab = src_vocab
- self._tgt_vocab = tgt_vocab
- self._src_max_len = src_max_len
- self._tgt_max_len = tgt_max_len
-
- def __call__(self, src, tgt):
- # For src_max_len < 0, we do not clip the sequence
- if self._src_max_len >= 0:
- src_sentence = self._src_vocab[src.split()[:self._src_max_len]]
- else:
- src_sentence = self._src_vocab[src.split()]
- # For tgt_max_len < 0, we do not clip the sequence
- if self._tgt_max_len >= 0:
- tgt_sentence = self._tgt_vocab[tgt.split()[:self._tgt_max_len]]
- else:
- tgt_sentence = self._tgt_vocab[tgt.split()]
- src_sentence.append(self._src_vocab[self._src_vocab.eos_token])
- tgt_sentence.insert(0, self._tgt_vocab[self._tgt_vocab.bos_token])
- tgt_sentence.append(self._tgt_vocab[self._tgt_vocab.eos_token])
- src_npy = np.array(src_sentence, dtype=np.int32)
- tgt_npy = np.array(tgt_sentence, dtype=np.int32)
- return src_npy, tgt_npy
-
-
-def process_dataset(dataset, src_vocab, tgt_vocab, src_max_len=-1, tgt_max_len=-1):
- start = time.time()
- dataset_processed = dataset.transform(TrainValDataTransform(src_vocab, tgt_vocab,
- src_max_len,
- tgt_max_len), lazy=False)
- end = time.time()
- print('Processing Time spent: {}'.format(end - start))
- return dataset_processed
-
-
-def load_translation_data(dataset, bleu, args):
- """Load translation dataset
-
- Parameters
- ----------
- dataset : str
- args : argparse result
-
- Returns
- -------
-
- """
- src_lang, tgt_lang = args.src_lang, args.tgt_lang
- if dataset == 'IWSLT2015':
- common_prefix = 'IWSLT2015_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- args.src_max_len, args.tgt_max_len)
- data_train = nlp.data.IWSLT2015('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = nlp.data.IWSLT2015('val', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = nlp.data.IWSLT2015('test', src_lang=src_lang, tgt_lang=tgt_lang)
- elif dataset == 'WMT2016BPE':
- common_prefix = 'WMT2016BPE_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- args.src_max_len, args.tgt_max_len)
- data_train = nlp.data.WMT2016BPE('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = nlp.data.WMT2016BPE('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = nlp.data.WMT2016BPE('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang)
- elif dataset == 'WMT2014BPE':
- common_prefix = 'WMT2014BPE_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- args.src_max_len, args.tgt_max_len)
- data_train = nlp.data.WMT2014BPE('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = nlp.data.WMT2014BPE('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = nlp.data.WMT2014BPE('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang,
- full=args.full)
- elif dataset == 'TOY':
- common_prefix = 'TOY_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- args.src_max_len, args.tgt_max_len)
- data_train = _dataset.TOY('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = _dataset.TOY('val', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = _dataset.TOY('test', src_lang=src_lang, tgt_lang=tgt_lang)
- else:
- raise NotImplementedError
- src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab
- data_train_processed = _load_cached_dataset(common_prefix + '_train')
- if not data_train_processed:
- data_train_processed = process_dataset(data_train, src_vocab, tgt_vocab,
- args.src_max_len, args.tgt_max_len)
- _cache_dataset(data_train_processed, common_prefix + '_train')
- data_val_processed = _load_cached_dataset(common_prefix + '_val')
- if not data_val_processed:
- data_val_processed = process_dataset(data_val, src_vocab, tgt_vocab)
- _cache_dataset(data_val_processed, common_prefix + '_val')
- if dataset == 'WMT2014BPE':
- filename = common_prefix + '_' + str(args.full) + '_test'
- else:
- filename = common_prefix + '_test'
- data_test_processed = _load_cached_dataset(filename)
- if not data_test_processed:
- data_test_processed = process_dataset(data_test, src_vocab, tgt_vocab)
- _cache_dataset(data_test_processed, filename)
- if bleu == 'tweaked':
- fetch_tgt_sentence = lambda src, tgt: tgt.split()
- val_tgt_sentences = list(data_val.transform(fetch_tgt_sentence))
- test_tgt_sentences = list(data_test.transform(fetch_tgt_sentence))
- elif bleu in ('13a', 'intl'):
- fetch_tgt_sentence = lambda src, tgt: tgt
- if dataset == 'WMT2016BPE':
- val_text = nlp.data.WMT2016('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang)
- test_text = nlp.data.WMT2016('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang)
- elif dataset == 'WMT2014BPE':
- val_text = nlp.data.WMT2014('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang)
- test_text = nlp.data.WMT2014('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang,
- full=args.full)
- elif dataset in ('IWSLT2015', 'TOY'):
- val_text = data_val
- test_text = data_test
- else:
- raise NotImplementedError
- val_tgt_sentences = list(val_text.transform(fetch_tgt_sentence))
- test_tgt_sentences = list(test_text.transform(fetch_tgt_sentence))
- else:
- raise NotImplementedError
- return data_train_processed, data_val_processed, data_test_processed, \
- val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab
-
-
-def get_data_lengths(dataset):
- get_lengths = lambda *args: (args[2], args[3])
- return list(dataset.transform(get_lengths))
-
-def get_dataloader(data_set, args, dataset_type,
- use_average_length=False, num_shards=0, num_workers=8):
- """Create data loaders for training/validation/test."""
- assert dataset_type in ['train', 'val', 'test']
-
- if args.bucket_scheme == 'constant':
- bucket_scheme = nlp.data.ConstWidthBucket()
- elif args.bucket_scheme == 'linear':
- bucket_scheme = nlp.data.LinearWidthBucket()
- elif args.bucket_scheme == 'exp':
- bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
- else:
- raise NotImplementedError
-
- data_lengths = get_data_lengths(data_set)
-
- if dataset_type == 'train':
- train_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
- btf.Stack(dtype='float32'), btf.Stack(dtype='float32'))
-
- else:
- data_lengths = list(map(lambda x: x[-1], data_lengths))
- test_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
- btf.Stack(dtype='float32'), btf.Stack(dtype='float32'),
- btf.Stack())
-
- batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
- batch_size=(args.batch_size \
- if dataset_type == 'train' \
- else args.test_batch_size),
- num_buckets=args.num_buckets,
- ratio=args.bucket_ratio,
- shuffle=(dataset_type == 'train'),
- use_average_length=use_average_length,
- num_shards=num_shards,
- bucket_scheme=bucket_scheme)
-
- if dataset_type == 'train':
- logging.info('Train Batch Sampler:\n%s', batch_sampler.stats())
- data_loader = nlp.data.ShardedDataLoader(data_set,
- batch_sampler=batch_sampler,
- batchify_fn=train_batchify_fn,
- num_workers=num_workers)
- else:
- if dataset_type == 'val':
- logging.info('Valid Batch Sampler:\n%s', batch_sampler.stats())
- else:
- logging.info('Test Batch Sampler:\n%s', batch_sampler.stats())
-
- data_loader = gluon.data.DataLoader(data_set,
- batch_sampler=batch_sampler,
- batchify_fn=test_batchify_fn,
- num_workers=num_workers)
-
- return data_loader
-
-def make_dataloader(data_train, data_val, data_test, args,
- use_average_length=False, num_shards=0, num_workers=8):
- """Create data loaders for training/validation/test."""
- train_data_loader = get_dataloader(data_train, args, dataset_type='train',
- use_average_length=use_average_length,
- num_shards=num_shards,
- num_workers=num_workers)
-
- val_data_loader = get_dataloader(data_val, args, dataset_type='val',
- use_average_length=use_average_length,
- num_workers=num_workers)
-
- test_data_loader = get_dataloader(data_test, args, dataset_type='test',
- use_average_length=use_average_length,
- num_workers=num_workers)
-
- return train_data_loader, val_data_loader, test_data_loader
-
-
-def write_sentences(sentences, file_path):
- with io.open(file_path, 'w', encoding='utf-8') as of:
- for sent in sentences:
- if isinstance(sent, (list, tuple)):
- of.write(' '.join(sent) + '\n')
- else:
- of.write(sent + '\n')
diff --git a/scripts/machine_translation/dataset.py b/scripts/machine_translation/dataset.py
deleted file mode 100644
index 5392e80508..0000000000
--- a/scripts/machine_translation/dataset.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-"""Translation datasets."""
-
-
-__all__ = ['TOY']
-
-import os
-from gluonnlp.base import get_home_dir
-from gluonnlp.data.translation import _TranslationDataset, _get_pair_key
-
-
-class TOY(_TranslationDataset):
- """A Small Translation Dataset for Testing Scripts.
-
- Parameters
- ----------
- segment : str or list of str, default 'train'
- Dataset segment. Options are 'train', 'val', 'test' or their combinations.
- src_lang : str, default 'en'
- The source language. Option for source and target languages are 'en' <-> 'de'
- tgt_lang : str, default 'de'
- The target language. Option for source and target languages are 'en' <-> 'de'
- root : str, default '$MXNET_HOME/datasets/translation_test'
- Path to temp folder for storing data.
- MXNET_HOME defaults to '~/.mxnet'.
- """
- def __init__(self, segment='train', src_lang='en', tgt_lang='de',
- root=os.path.join(get_home_dir(), 'datasets', 'translation_test')):
- self._supported_segments = ['train', 'val', 'test']
- self._archive_file = {_get_pair_key('en', 'de'):
- ('translation_test.zip',
- '14f6c8e31ac6ec84ce469b4c196d60b4c86a179d')}
- self._data_file = {_get_pair_key('en', 'de'):
- {'train_en': ('train.en',
- 'aa7f22b91eb93390fd342a57a81f51f53ed29542'),
- 'train_de': ('train.de',
- 'f914217ce23ddd8cac07e761a75685c043d4f6d3'),
- 'val_en': ('train.en',
- 'aa7f22b91eb93390fd342a57a81f51f53ed29542'),
- 'val_de': ('train.de',
- 'f914217ce23ddd8cac07e761a75685c043d4f6d3'),
- 'test_en': ('train.en',
- 'aa7f22b91eb93390fd342a57a81f51f53ed29542'),
- 'test_de': ('train.de',
- 'f914217ce23ddd8cac07e761a75685c043d4f6d3'),
- 'vocab_en': ('vocab.en.json',
- 'c7c6af4603ea70f0a4af2460a622333fbd014050'),
- 'vocab_de' : ('vocab.de.json',
- '5b6f1be36a3e3cb9946b86e5d0fc73d164fda99f')}}
- super(TOY, self).__init__('translation_test', segment=segment, src_lang=src_lang,
- tgt_lang=tgt_lang, root=root)
diff --git a/scripts/machine_translation/evaluate_transformer.py b/scripts/machine_translation/evaluate_transformer.py
new file mode 100644
index 0000000000..2dddfdc06b
--- /dev/null
+++ b/scripts/machine_translation/evaluate_transformer.py
@@ -0,0 +1,291 @@
+import numpy as np
+import random
+import os
+import mxnet as mx
+from mxnet import gluon
+import argparse
+import logging
+import time
+from gluonnlp.utils.misc import logging_config
+from gluonnlp.models.transformer import TransformerModel,\
+ TransformerNMTInference
+from gluonnlp.data.batchify import Tuple, Pad, Stack
+from gluonnlp.data.filtering import MosesNormalizer
+from gluonnlp.data import tokenizers
+from gluonnlp.sequence_sampler import BeamSearchSampler, BeamSearchScorer
+import sacrebleu
+from tqdm import tqdm
+mx.npx.set_np()
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Transformer for Neural Machine Translation. Load a checkpoint and inference.')
+ parser.add_argument('--seed', type=int, default=100, help='The random seed.')
+ parser.add_argument('--src_lang', type=str, default='en', help='Source language')
+ parser.add_argument('--tgt_lang', type=str, default='de', help='Target language')
+ parser.add_argument('--src_corpus', type=str, required=True,
+ help='The source corpus for evaluation.')
+ parser.add_argument('--tgt_corpus', type=str, default=None,
+ help='The target corpus for evaluation.')
+ parser.add_argument('--src_tokenizer', choices=['spm',
+ 'subword_nmt',
+ 'yttm',
+ 'hf_bytebpe',
+ 'hf_wordpiece',
+ 'hf_bpe'],
+ required=True, type=str,
+ help='The source tokenizer. Only supports online encoding at present.')
+ parser.add_argument('--tgt_tokenizer', choices=['spm',
+ 'subword_nmt',
+ 'yttm',
+ 'hf_bytebpe',
+ 'hf_wordpiece',
+ 'hf_bpe'],
+ required=True, type=str,
+ help='The target tokenizer. Only supports online encoding at present.')
+ parser.add_argument('--src_subword_model_path', type=str,
+ help='Path to the source subword model.')
+ parser.add_argument('--src_vocab_path', type=str,
+ help='Path to the source subword vocab.')
+ parser.add_argument('--tgt_subword_model_path', type=str,
+ help='Path to the target subword model.')
+ parser.add_argument('--tgt_vocab_path', type=str,
+ help='Path to the target subword vocab.')
+ parser.add_argument('--src_max_len', type=int, default=None,
+ help='Maximum length of the source sentence.')
+ parser.add_argument('--tgt_max_len', type=int, default=None,
+ help='Maximum length of the target sentence.')
+ parser.add_argument('--cfg', type=str, help='Config file of the Transformer model.')
+ parser.add_argument('--beam-size', type=int, default=4, help='Number of beams')
+ parser.add_argument('--lp_alpha', type=float, default=0.6,
+ help='The alpha value in the length penalty')
+ parser.add_argument('--lp_k', type=int, default=5, help='The K value in the length penalty')
+ parser.add_argument('--max_length_a', type=int, default=1,
+ help='The a in the a * x + b formula of beam search')
+ parser.add_argument('--max_length_b', type=int, default=50,
+ help='The b in the a * x + b formula of beam search')
+ parser.add_argument('--param_path', type=str, help='The path to the model parameters.')
+ parser.add_argument('--gpus', type=str, default='0',
+                        help='List of GPUs to run on, e.g. 0 or 0,2,5. Empty means using the CPU '
+                             '(using a single GPU is suggested).')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='The path to save the log files and predictions.')
+ parser.add_argument('--stochastic', action='store_true',
+ help='Whether to use the stochastic beam search')
+ parser.add_argument('--inference', action='store_true',
+                        help='Whether to run inference on your own data. '
+                             'When running inference, tgt_corpus is not needed and will be set to None.')
+ parser.add_argument('--fp16', action='store_true',
+ help='Whether to use dtype float16')
+ args = parser.parse_args()
+ if args.save_dir is None:
+ args.save_dir = os.path.splitext(args.param_path)[0] + '_evaluation'
+    assert args.inference or args.tgt_corpus, '--tgt_corpus is required when --inference is not set'
+ if args.inference:
+ args.tgt_corpus = None
+ logging_config(args.save_dir, console=True)
+ logging.info(args)
+ return args
+
+
+def process_corpus(corpus_path, sentence_normalizer, bpe_tokenizer,
+ base_tokenizer=None, add_bos=True,
+ add_eos=True):
+ processed_token_ids = []
+ raw_lines = []
+ with open(corpus_path, 'r', encoding='utf-8') as f:
+ for line in f:
+ line = line.strip()
+ raw_lines.append(line)
+ line = sentence_normalizer(line)
+ if base_tokenizer is not None:
+ line = ' '.join(base_tokenizer.encode(line))
+ bpe_token_ids = bpe_tokenizer.encode(line, output_type=int)
+ if add_bos:
+ bpe_token_ids = [bpe_tokenizer.vocab.bos_id] + bpe_token_ids
+ if add_eos:
+ bpe_token_ids.append(bpe_tokenizer.vocab.eos_id)
+ processed_token_ids.append(bpe_token_ids)
+ return processed_token_ids, raw_lines
+
+
+def create_tokenizer(tokenizer_type, model_path, vocab_path):
+ if tokenizer_type == 'spm':
+ return tokenizers.create(tokenizer_type, model_path=model_path, vocab=vocab_path)
+ elif tokenizer_type == 'subword_nmt':
+ return tokenizers.create(tokenizer_type, codec_path=model_path, vocab_path=vocab_path)
+ elif tokenizer_type == 'yttm':
+ return tokenizers.create(tokenizer_type, model_path=model_path)
+ elif tokenizer_type == 'hf_bytebpe':
+ return tokenizers.create(tokenizer_type, merges_file=model_path, vocab_file=vocab_path)
+ elif tokenizer_type == 'hf_wordpiece':
+ return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
+ elif tokenizer_type == 'hf_bpe':
+ return tokenizers.create(tokenizer_type, merges_file=model_path, vocab_file=vocab_path)
+ else:
+ raise NotImplementedError
+
+
+def evaluate(args):
+ ctx_l = [mx.cpu()] if args.gpus is None or args.gpus == '' else [mx.gpu(int(x)) for x in
+ args.gpus.split(',')]
+ src_normalizer = MosesNormalizer(args.src_lang)
+ tgt_normalizer = MosesNormalizer(args.tgt_lang)
+ base_src_tokenizer = tokenizers.create('moses', args.src_lang)
+ base_tgt_tokenizer = tokenizers.create('moses', args.tgt_lang)
+
+ src_tokenizer = create_tokenizer(args.src_tokenizer,
+ args.src_subword_model_path,
+ args.src_vocab_path)
+ tgt_tokenizer = create_tokenizer(args.tgt_tokenizer,
+ args.tgt_subword_model_path,
+ args.tgt_vocab_path)
+ src_vocab = src_tokenizer.vocab
+ tgt_vocab = tgt_tokenizer.vocab
+ if args.cfg.endswith('.yml'):
+ cfg = TransformerModel.get_cfg().clone_merge(args.cfg)
+ else:
+ cfg = TransformerModel.get_cfg(args.cfg)
+ cfg.defrost()
+ cfg.MODEL.src_vocab_size = len(src_vocab)
+ cfg.MODEL.tgt_vocab_size = len(tgt_vocab)
+ if args.fp16:
+ cfg.MODEL.dtype = 'float16'
+ cfg.freeze()
+ model = TransformerModel.from_cfg(cfg)
+ model.hybridize()
+ model.load_parameters(args.param_path, ctx=ctx_l)
+ inference_model = TransformerNMTInference(model=model)
+ inference_model.hybridize()
+ # Construct the BeamSearchSampler
+ if args.stochastic:
+ scorer = BeamSearchScorer(alpha=0.0,
+ K=0.0,
+ temperature=1.0,
+ from_logits=False)
+ else:
+ scorer = BeamSearchScorer(alpha=args.lp_alpha,
+ K=args.lp_k,
+ from_logits=False)
+ beam_search_sampler = BeamSearchSampler(beam_size=args.beam_size,
+ decoder=inference_model,
+ vocab_size=len(tgt_vocab),
+ eos_id=tgt_vocab.eos_id,
+ scorer=scorer,
+ stochastic=args.stochastic,
+ max_length_a=args.max_length_a,
+ max_length_b=args.max_length_b)
+
+ logging.info(beam_search_sampler)
+ all_src_token_ids, all_src_lines = process_corpus(
+ args.src_corpus,
+ sentence_normalizer=src_normalizer,
+ base_tokenizer=base_src_tokenizer,
+ bpe_tokenizer=src_tokenizer,
+ add_bos=False,
+ add_eos=True
+ )
+ if args.tgt_corpus is not None:
+ all_tgt_token_ids, all_tgt_lines = process_corpus(
+ args.tgt_corpus,
+ sentence_normalizer=tgt_normalizer,
+ base_tokenizer=base_tgt_tokenizer,
+ bpe_tokenizer=tgt_tokenizer,
+ add_bos=True,
+ add_eos=True
+ )
+    else:  # inference only: populate placeholder target tokens so batching still works
+ all_tgt_token_ids = all_tgt_lines = [[] for i in range(len(all_src_token_ids))]
+ test_dataloader = gluon.data.DataLoader(
+ list(zip(all_src_token_ids,
+ [len(ele) for ele in all_src_token_ids],
+ all_tgt_token_ids,
+ [len(ele) for ele in all_tgt_token_ids])),
+ batch_size=32,
+ batchify_fn=Tuple(Pad(), Stack(), Pad(), Stack()),
+ shuffle=False)
+
+ ctx = ctx_l[0]
+ pred_sentences = []
+ start_eval_time = time.time()
+ # evaluate
+ if not args.inference:
+ avg_nll_loss = 0
+ ntokens = 0
+ for i, (src_token_ids, src_valid_length, tgt_token_ids, tgt_valid_length)\
+ in enumerate(test_dataloader):
+ src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
+ src_valid_length = mx.np.array(src_valid_length, ctx=ctx, dtype=np.int32)
+ tgt_token_ids = mx.np.array(tgt_token_ids, ctx=ctx, dtype=np.int32)
+ tgt_valid_length = mx.np.array(tgt_valid_length, ctx=ctx, dtype=np.int32)
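+            # Teacher forcing: feed the gold target prefix (all tokens but the last)
+            # and score the next-token log-probabilities; the masked sum accumulates
+            # the corpus NLL, later normalized by token count to report perplexity.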
+ tgt_pred = model(src_token_ids, src_valid_length, tgt_token_ids[:, :-1],
+ tgt_valid_length - 1)
+ pred_logits = mx.npx.log_softmax(tgt_pred, axis=-1)
+ nll = - mx.npx.pick(pred_logits, tgt_token_ids[:, 1:])
+ avg_nll_loss += mx.npx.sequence_mask(nll,
+ sequence_length=tgt_valid_length - 1,
+ use_sequence_length=True, axis=1).sum().asnumpy()
+ ntokens += int((tgt_valid_length - 1).sum().asnumpy())
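+            # Beam-search decode from <bos>; the top hypothesis is stripped of its
+            # bos/eos tokens, BPE-decoded and detokenized back into plain text.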
+ init_input = mx.np.array([tgt_vocab.bos_id for _ in range(src_token_ids.shape[0])], ctx=ctx)
+ states = inference_model.init_states(src_token_ids, src_valid_length)
+ samples, scores, valid_length = beam_search_sampler(init_input, states, src_valid_length)
+ for j in range(samples.shape[0]):
+ pred_tok_ids = samples[j, 0, :valid_length[j, 0].asnumpy()].asnumpy().tolist()
+ bpe_decode_line = tgt_tokenizer.decode(pred_tok_ids[1:-1])
+ pred_sentence = base_tgt_tokenizer.decode(bpe_decode_line.split(' '))
+ pred_sentences.append(pred_sentence)
+ print(pred_sentence)
+ print('Processed {}/{}'.format(len(pred_sentences), len(all_tgt_lines)))
+ end_eval_time = time.time()
+ avg_nll_loss = avg_nll_loss / ntokens
+
+ with open(os.path.join(args.save_dir, 'gt_sentences.txt'), 'w', encoding='utf-8') as of:
+ of.write('\n'.join(all_tgt_lines))
+ of.write('\n')
+ with open(os.path.join(args.save_dir, 'pred_sentences.txt'), 'w', encoding='utf-8') as of:
+ of.write('\n'.join(pred_sentences))
+ of.write('\n')
+
+ sacrebleu_out = sacrebleu.corpus_bleu(sys_stream=pred_sentences, ref_streams=[all_tgt_lines])
+    logging.info('Time Spent: {}, #Sent={}, SacreBLEU={} '
+ '({:2.1f} {:2.1f} {:2.1f} {:2.1f}) '
+ '(BP={:.3f}, ratio={:.3f}, syslen={}, reflen={}), '
+ 'Avg NLL={}, Perplexity={}'
+ .format(end_eval_time - start_eval_time, len(all_tgt_lines),
+ sacrebleu_out.score,
+ *sacrebleu_out.precisions,
+ sacrebleu_out.bp, sacrebleu_out.sys_len / sacrebleu_out.ref_len,
+ sacrebleu_out.sys_len, sacrebleu_out.ref_len,
+ avg_nll_loss, np.exp(avg_nll_loss)))
+ # inference only
+ else:
+ with open(os.path.join(args.save_dir, 'pred_sentences.txt'), 'w', encoding='utf-8') as of:
+ processed_sentences = 0
+ for src_token_ids, src_valid_length, _, _ in tqdm(test_dataloader):
+ src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
+ src_valid_length = mx.np.array(src_valid_length, ctx=ctx, dtype=np.int32)
+                init_input = mx.np.array(
+                    [tgt_vocab.bos_id for _ in range(src_token_ids.shape[0])], ctx=ctx)
+ states = inference_model.init_states(src_token_ids, src_valid_length)
+                samples, scores, valid_length = beam_search_sampler(
+                    init_input, states, src_valid_length)
+ for j in range(samples.shape[0]):
+ pred_tok_ids = samples[j, 0, :valid_length[j, 0].asnumpy()].asnumpy().tolist()
+ bpe_decode_line = tgt_tokenizer.decode(pred_tok_ids[1:-1])
+ pred_sentence = base_tgt_tokenizer.decode(bpe_decode_line.split(' '))
+ pred_sentences.append(pred_sentence)
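+                # Write this batch's translations immediately and clear the buffer so memory
+                # stays bounded on large corpora.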
+ of.write('\n'.join(pred_sentences))
+ of.write('\n')
+ processed_sentences += len(pred_sentences)
+ pred_sentences = []
+ end_eval_time = time.time()
+ logging.info('Time Spent: {}, Inferred sentences: {}'
+ .format(end_eval_time - start_eval_time, processed_sentences))
+
+
+if __name__ == '__main__':
+ os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
+    os.environ['MXNET_USE_FUSION'] = '0'  # Manually disable pointwise fusion
+ args = parse_args()
+ np.random.seed(args.seed)
+ mx.random.seed(args.seed)
+ random.seed(args.seed)
+ evaluate(args)
diff --git a/scripts/machine_translation/gnmt.py b/scripts/machine_translation/gnmt.py
deleted file mode 100644
index c31cb1d66f..0000000000
--- a/scripts/machine_translation/gnmt.py
+++ /dev/null
@@ -1,512 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Encoder and decoder usded in sequence-to-sequence learning."""
-__all__ = ['GNMTEncoder', 'GNMTDecoder', 'GNMTOneStepDecoder', 'get_gnmt_encoder_decoder']
-
-import mxnet as mx
-from mxnet.base import _as_list
-from mxnet.gluon import nn, rnn
-from mxnet.gluon.block import HybridBlock
-from gluonnlp.model.seq2seq_encoder_decoder import Seq2SeqEncoder, Seq2SeqDecoder, \
- Seq2SeqOneStepDecoder, _nested_sequence_last
-from gluonnlp.model.utils import _get_cell_type
-from gluonnlp.model.attention_cell import _get_attention_cell
-
-
-class GNMTEncoder(Seq2SeqEncoder):
- r"""Structure of the RNN Encoder similar to that used in
- "[Arxiv2016] Google's Neural Machine Translation System:
- Bridgeing the Gap between Human and Machine Translation"
-
- The encoder first stacks several bidirectional RNN layers and then stacks multiple
- uni-directional RNN layers with residual connections.
-
- Parameters
- ----------
- cell_type : str or function
- Can be "lstm", "gru" or constructor functions that can be directly called,
- like rnn.LSTMCell
- num_layers : int
- Total number of layers
- num_bi_layers : int
- Total number of bidirectional layers
- hidden_size : int
- Number of hidden units
- dropout : float
- The dropout rate
- use_residual : bool
- Whether to use residual connection. Residual connection will be added in the
- uni-directional RNN layers
- i2h_weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- h2h_weight_initializer : str or Initializer
- Initializer for the recurrent weights matrix, used for the linear
- transformation of the recurrent state.
- i2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- h2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default 'rnn_'
- Prefix for name of `Block`s
- (and name of weight if params is `None`).
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
- """
- def __init__(self, cell_type='lstm', num_layers=2, num_bi_layers=1, hidden_size=128,
- dropout=0.0, use_residual=True,
- i2h_weight_initializer=None, h2h_weight_initializer=None,
- i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
- prefix=None, params=None):
- super(GNMTEncoder, self).__init__(prefix=prefix, params=params)
- self._cell_type = _get_cell_type(cell_type)
- assert num_bi_layers <= num_layers,\
- 'Number of bidirectional layers must be smaller than the total number of layers, ' \
- 'num_bi_layers={}, num_layers={}'.format(num_bi_layers, num_layers)
- self._num_bi_layers = num_bi_layers
- self._num_layers = num_layers
- self._hidden_size = hidden_size
- self._dropout = dropout
- self._use_residual = use_residual
- with self.name_scope():
- self.dropout_layer = nn.Dropout(dropout)
- self.rnn_cells = nn.HybridSequential()
- for i in range(num_layers):
- if i < num_bi_layers:
- self.rnn_cells.add(rnn.BidirectionalCell(
- l_cell=self._cell_type(hidden_size=self._hidden_size,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer,
- i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer,
- prefix='rnn%d_l_' % i),
- r_cell=self._cell_type(hidden_size=self._hidden_size,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer,
- i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer,
- prefix='rnn%d_r_' % i)))
- else:
- self.rnn_cells.add(
- self._cell_type(hidden_size=self._hidden_size,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer,
- i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer,
- prefix='rnn%d_' % i))
-
- def __call__(self, inputs, states=None, valid_length=None):
- """Encoder the inputs given the states and valid sequence length.
-
- Parameters
- ----------
- inputs : NDArray
- Input sequence. Shape (batch_size, length, C_in)
- states : list of NDArrays or None
- Initial states. The list of initial states
- valid_length : NDArray or None
- Valid lengths of each sequence. This is usually used when part of sequence has
- been padded. Shape (batch_size,)
-
- Returns
- -------
- encoder_outputs: list
- Outputs of the encoder. Contains:
-
- - outputs of the last RNN layer
- - new_states of all the RNN layers
- """
- return super(GNMTEncoder, self).__call__(inputs, states, valid_length)
-
- def forward(self, inputs, states=None, valid_length=None): #pylint: disable=arguments-differ, missing-docstring
- # TODO(sxjscience) Accelerate the forward using HybridBlock
- _, length, _ = inputs.shape
- new_states = []
- outputs = inputs
- for i, cell in enumerate(self.rnn_cells):
- begin_state = None if states is None else states[i]
- outputs, layer_states = cell.unroll(
- length=length, inputs=inputs, begin_state=begin_state, merge_outputs=True,
- valid_length=valid_length, layout='NTC')
- if i < self._num_bi_layers:
- # For bidirectional RNN, we use the states of the backward RNN
- new_states.append(layer_states[len(self.rnn_cells[i].state_info()) // 2:])
- else:
- new_states.append(layer_states)
- # Apply Dropout
- outputs = self.dropout_layer(outputs)
- if self._use_residual:
- if i > self._num_bi_layers:
- outputs = outputs + inputs
- inputs = outputs
- if valid_length is not None:
- outputs = mx.nd.SequenceMask(outputs, sequence_length=valid_length,
- use_sequence_length=True, axis=1)
- return [outputs, new_states], []
-
-
-class _BaseGNMTDecoder(HybridBlock):
- def __init__(self, cell_type='lstm', attention_cell='scaled_luong',
- num_layers=2, hidden_size=128,
- dropout=0.0, use_residual=True, output_attention=False,
- i2h_weight_initializer=None, h2h_weight_initializer=None,
- i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
- prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- self._cell_type = _get_cell_type(cell_type)
- self._num_layers = num_layers
- self._hidden_size = hidden_size
- self._dropout = dropout
- self._use_residual = use_residual
- self._output_attention = output_attention
- with self.name_scope():
- self.attention_cell = _get_attention_cell(attention_cell, units=hidden_size)
- self.dropout_layer = nn.Dropout(dropout)
- self.rnn_cells = nn.HybridSequential()
- for i in range(num_layers):
- self.rnn_cells.add(
- self._cell_type(hidden_size=self._hidden_size,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer,
- i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer,
- prefix='rnn%d_' % i))
-
- def init_state_from_encoder(self, encoder_outputs, encoder_valid_length=None):
- """Initialize the state from the encoder outputs.
-
- Parameters
- ----------
- encoder_outputs : list
- encoder_valid_length : NDArray or None
-
- Returns
- -------
- decoder_states : list
- The decoder states, includes:
-
- - rnn_states : NDArray
- - attention_vec : NDArray
- - mem_value : NDArray
- - mem_masks : NDArray, optional
- """
- mem_value, rnn_states = encoder_outputs
- batch_size, _, mem_size = mem_value.shape
- attention_vec = mx.nd.zeros(shape=(batch_size, mem_size), ctx=mem_value.context)
- decoder_states = [rnn_states, attention_vec, mem_value]
- mem_length = mem_value.shape[1]
- if encoder_valid_length is not None:
- mem_masks = mx.nd.broadcast_lesser(
- mx.nd.arange(mem_length, ctx=encoder_valid_length.context).reshape((1, -1)),
- encoder_valid_length.reshape((-1, 1)))
- decoder_states.append(mem_masks)
- return decoder_states
-
- def forward(self, step_input, states): # pylint: disable=arguments-differ
- """One-step-ahead decoding of the GNMT decoder.
-
- Parameters
- ----------
- step_input : NDArray or Symbol
- states : list of NDArray or Symbol
-
- Returns
- -------
- step_output : NDArray or Symbol
- The output of the decoder. Shape is (batch_size, C_out)
- new_states: list
- Includes
-
- - rnn_states : list of NDArray or Symbol
- - attention_vec : NDArray or Symbol, Shape (batch_size, C_memory)
- - mem_value : NDArray
- - mem_masks : NDArray, optional
-
- step_additional_outputs : list
- Either be an empty list or contains the attention weights in this step.
- The attention weights will have shape (batch_size, 1, mem_length) or
- (batch_size, num_heads, 1, mem_length)
- """
- step_output, new_states, step_additional_outputs = super().forward(step_input, states)
- # In hybrid_forward, only the rnn_states and attention_vec are calculated.
- # We directly append the mem_value and mem_masks in the forward() function.
- # We apply this trick because the memory value/mask can be directly appended to the next
- # timestamp and there is no need to create additional NDArrays. If we use HybridBlock,
- # new NDArrays will be created even for identity mapping.
- # See https://github.com/apache/incubator-mxnet/issues/10167
- new_states += states[2:]
- return step_output, new_states, step_additional_outputs
-
- def hybrid_forward(self, F, step_input, states): #pylint: disable=arguments-differ
- """
-
- Parameters
- ----------
- step_input : NDArray or Symbol
- states : list of NDArray or Symbol
-
- Returns
- -------
- step_output : NDArray or Symbol
- The output of the decoder. Shape is (batch_size, C_out)
- new_states: list
- Includes
-
- - rnn_states : list of NDArray or Symbol
- - attention_vec : NDArray or Symbol, Shape (batch_size, C_memory)
-
- step_additional_outputs : list
- Either be an empty list or contains the attention weights in this step.
- The attention weights will have shape (batch_size, 1, mem_length) or
- (batch_size, num_heads, 1, mem_length)
-
- """
- has_mem_mask = (len(states) == 4)
- if has_mem_mask:
- rnn_states, attention_output, mem_value, mem_masks = states
- mem_masks = F.expand_dims(mem_masks, axis=1)
- else:
- rnn_states, attention_output, mem_value = states
- mem_masks = None
- new_rnn_states = []
- # Process the first layer
- rnn_out, layer_state =\
- self.rnn_cells[0](F.concat(step_input, attention_output, dim=-1), rnn_states[0])
- new_rnn_states.append(layer_state)
- attention_vec, attention_weights =\
- self.attention_cell(F.expand_dims(rnn_out, axis=1), # Shape(B, 1, C)
- mem_value,
- mem_value,
- mem_masks)
- attention_vec = F.reshape(attention_vec, shape=(0, -1))
- # Process the 2nd layer - the last layer
- for i in range(1, len(self.rnn_cells)):
- curr_input = rnn_out
- rnn_cell = self.rnn_cells[i]
- # Concatenate the attention vector calculated by the bottom layer and the output of the
- # previous layer
- rnn_out, layer_state = rnn_cell(F.concat(curr_input, attention_vec, dim=-1),
- rnn_states[i])
- rnn_out = self.dropout_layer(rnn_out)
- if self._use_residual:
- rnn_out = rnn_out + curr_input
- # Append new RNN state
- new_rnn_states.append(layer_state)
- new_states = [new_rnn_states, attention_vec]
- step_additional_outputs = []
- if self._output_attention:
- step_additional_outputs.append(attention_weights)
- return rnn_out, new_states, step_additional_outputs
-
-
-class GNMTOneStepDecoder(_BaseGNMTDecoder, Seq2SeqOneStepDecoder):
- """RNN Encoder similar to that used in the Google Neural Machine Translation paper.
-
- One-step ahead decoder used during inference.
-
- We use gnmt_v2 strategy in tensorflow/nmt
-
- Parameters
- ----------
- cell_type : str or type
- Can be "lstm", "gru" or constructor functions that can be directly called,
- like rnn.LSTMCell
- attention_cell : AttentionCell or str
- Arguments of the attention cell.
- Can be 'scaled_luong', 'normed_mlp', 'dot'
- num_layers : int
- Total number of layers
- hidden_size : int
- Number of hidden units
- dropout : float
- The dropout rate
- use_residual : bool
- Whether to use residual connection. Residual connection will be added in the
- uni-directional RNN layers
- output_attention: bool
- Whether to output the attention weights
- i2h_weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- h2h_weight_initializer : str or Initializer
- Initializer for the recurrent weights matrix, used for the linear
- transformation of the recurrent state.
- i2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- h2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default 'rnn_'
- Prefix for name of `Block`s
- (and name of weight if params is `None`).
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
- """
-
-
-class GNMTDecoder(_BaseGNMTDecoder, Seq2SeqDecoder):
- """RNN Encoder similar to that used in the Google Neural Machine Translation paper.
-
- Multi-step decoder used during training with teacher forcing.
-
- We use gnmt_v2 strategy in tensorflow/nmt
-
- Parameters
- ----------
- cell_type : str or type
- Can be "lstm", "gru" or constructor functions that can be directly called,
- like rnn.LSTMCell
- attention_cell : AttentionCell or str
- Arguments of the attention cell.
- Can be 'scaled_luong', 'normed_mlp', 'dot'
- num_layers : int
- Total number of layers
- hidden_size : int
- Number of hidden units
- dropout : float
- The dropout rate
- use_residual : bool
- Whether to use residual connection. Residual connection will be added in the
- uni-directional RNN layers
- output_attention: bool
- Whether to output the attention weights
- i2h_weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- h2h_weight_initializer : str or Initializer
- Initializer for the recurrent weights matrix, used for the linear
- transformation of the recurrent state.
- i2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- h2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default 'rnn_'
- Prefix for name of `Block`s
- (and name of weight if params is `None`).
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
- """
-
- def forward(self, inputs, states, valid_length=None): # pylint: disable=arguments-differ
- """Decode the decoder inputs. This function is only used for training.
-
- Parameters
- ----------
- inputs : NDArray, Shape (batch_size, length, C_in)
- states : list of NDArrays or None
- Initial states. The list of initial decoder states
- valid_length : NDArray or None
- Valid lengths of each sequence. This is usually used when part of sequence has
- been padded. Shape (batch_size,)
-
- Returns
- -------
- output : NDArray, Shape (batch_size, length, C_out)
- states : list
- The decoder states, includes:
-
- - rnn_states : NDArray
- - attention_vec : NDArray
- - mem_value : NDArray
- - mem_masks : NDArray, optional
- additional_outputs : list
- Either be an empty list or contains the attention weights in this step.
- The attention weights will have shape (batch_size, length, mem_length) or
- (batch_size, num_heads, length, mem_length)
- """
- length = inputs.shape[1]
- output = []
- additional_outputs = []
- inputs = _as_list(mx.nd.split(inputs, num_outputs=length, axis=1, squeeze_axis=True))
- rnn_states_l = []
- attention_output_l = []
- fixed_states = states[2:]
- for i in range(length):
- ele_output, states, ele_additional_outputs = super().forward(inputs[i], states)
- rnn_states_l.append(states[0])
- attention_output_l.append(states[1])
- output.append(ele_output)
- additional_outputs.extend(ele_additional_outputs)
- output = mx.nd.stack(*output, axis=1)
- if valid_length is not None:
- states = [_nested_sequence_last(rnn_states_l, valid_length),
- _nested_sequence_last(attention_output_l, valid_length)] + fixed_states
- output = mx.nd.SequenceMask(output,
- sequence_length=valid_length,
- use_sequence_length=True,
- axis=1)
- if self._output_attention:
- additional_outputs = [mx.nd.concat(*additional_outputs, dim=-2)]
- return output, states, additional_outputs
-
-
-def get_gnmt_encoder_decoder(cell_type='lstm', attention_cell='scaled_luong', num_layers=2,
- num_bi_layers=1, hidden_size=128, dropout=0.0, use_residual=False,
- i2h_weight_initializer=None, h2h_weight_initializer=None,
- i2h_bias_initializer=mx.init.LSTMBias(forget_bias=1.0),
- h2h_bias_initializer='zeros',
- prefix='gnmt_', params=None):
- """Build a pair of GNMT encoder/decoder
-
- Parameters
- ----------
- cell_type : str or type
- attention_cell : str or AttentionCell
- num_layers : int
- num_bi_layers : int
- hidden_size : int
- dropout : float
- use_residual : bool
- i2h_weight_initializer : mx.init.Initializer or None
- h2h_weight_initializer : mx.init.Initializer or None
- i2h_bias_initializer : mx.init.Initializer or None
- h2h_bias_initializer : mx.init.Initializer or None
- prefix : str, default 'gnmt_'
- Prefix for name of `Block`s.
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
-
- Returns
- -------
- encoder : GNMTEncoder
- decoder : GNMTDecoder
- """
- encoder = GNMTEncoder(cell_type=cell_type, num_layers=num_layers, num_bi_layers=num_bi_layers,
- hidden_size=hidden_size, dropout=dropout, use_residual=use_residual,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer,
- i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer, prefix=prefix + 'enc_',
- params=params)
- decoder = GNMTDecoder(cell_type=cell_type, attention_cell=attention_cell, num_layers=num_layers,
- hidden_size=hidden_size, dropout=dropout, use_residual=use_residual,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer,
- i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer, prefix=prefix + 'dec_',
- params=params)
- one_step_ahead_decoder = GNMTOneStepDecoder(
- cell_type=cell_type, attention_cell=attention_cell, num_layers=num_layers,
- hidden_size=hidden_size, dropout=dropout, use_residual=use_residual,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer, i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer, prefix=prefix + 'dec_',
- params=decoder.collect_params())
- return encoder, decoder, one_step_ahead_decoder
diff --git a/scripts/machine_translation/hyperparameters.py b/scripts/machine_translation/hyperparameters.py
deleted file mode 100644
index f9aaf9a7fb..0000000000
--- a/scripts/machine_translation/hyperparameters.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Hyperparameters for transformer, for past reference only."""
-
-# parameters for dataset
-src_lang = 'en'
-tgt_lang = 'de'
-src_max_len = -1
-tgt_max_len = -1
-
-# parameters for model
-num_units = 512
-hidden_size = 2048
-dropout = 0.1
-epsilon = 0.1
-num_layers = 6
-num_heads = 8
-scaled = True
-
-# parameters for training
-optimizer = 'adam'
-epochs = 3
-batch_size = 2700
-test_batch_size = 256
-num_accumulated = 1
-lr = 2
-warmup_steps = 1
-save_dir = 'transformer_en_de_u512'
-average_start = 1
-num_buckets = 20
-log_interval = 10
-bleu = '13a'
-
-#parameters for testing
-beam_size = 4
-lp_alpha = 0.6
-lp_k = 5
diff --git a/scripts/machine_translation/index.rst b/scripts/machine_translation/index.rst
deleted file mode 100644
index a228ee24ed..0000000000
--- a/scripts/machine_translation/index.rst
+++ /dev/null
@@ -1,71 +0,0 @@
-Machine Translation
--------------------
-
-:download:`Download scripts `
-
-Google Neural Machine Translation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Use the following command to train the GNMT model on the IWSLT2015 dataset.
-
-.. code-block:: console
-
- $ MXNET_GPU_MEM_POOL_TYPE=Round python train_gnmt.py --src_lang en --tgt_lang vi --batch_size 128 \
- --optimizer adam --lr 0.001 --lr_update_factor 0.5 --beam_size 10 --bucket_scheme exp \
- --num_hidden 512 --save_dir gnmt_en_vi_l2_h512_beam10 --epochs 12 --gpu 0
-
-It gets test BLEU score equals to 26.20.
-
-Transformers
-~~~~~~~~~~~~
-
-Use the following commands to train the Transformer model on the WMT14 dataset for English to German translation.
-
-.. code-block:: console
-
- $ MXNET_GPU_MEM_POOL_TYPE=Round python train_transformer.py --dataset WMT2014BPE \
- --src_lang en --tgt_lang de --batch_size 2700 \
- --optimizer adam --num_accumulated 16 --lr 2.0 --warmup_steps 4000 \
- --save_dir transformer_en_de_u512 --epochs 30 --gpus 0,1,2,3,4,5,6,7 --scaled \
- --average_start 5 --num_buckets 20 --bucket_scheme exp --bleu 13a --log_interval 10
-
-It gets official mteval-v13a BLEU score equals to 27.09 on newstest2014 (http://statmt.org/wmt14/test-filtered.tgz).
-This result is obtained by using averaged SGD in last 5 epochs. If we use international tokenization (i.e., ``--bleu intl``),
-we can obtain bleu score equals to 27.89. If we use ``--bleu tweaked``, we obtain test BLEU score equals to 28.96.
-This result is obtained on tweaked reference, where the tokenized reference text is put in ATAT format for historical reason
-and following preprocessing pipeline is done:
-
-.. code-block:: console
-
- mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l de
- mosesdecoder/scripts/tokenizer/remove-non-printing-char.perl
- mosesdecoder/scripts/tokenizer/tokenizer.perl -q -no-escape -protected mosesdecoder/scripts/tokenizer/basic-protected-patterns -l de.
-
-If we turn on ``--full``, the testing is performed on newstest2014 (http://statmt.org/wmt14/test-full.tgz). Then, we can
-obtain BLEU=27.05 with ``--bleu 13a``, BLEU=27.81 with ``--bleu intl``, and BLEU=28.80 with ``--bleu tweaked``
-
-The pre-trained model can be downloaded from http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/transformer_en_de_512_WMT2014-e25287c5.zip.
-
-For the users from China, it might be faster with this link instead: https://apache-mxnet.s3.cn-north-1.amazonaws.com.cn/gluon/models/transformer_en_de_512_WMT2014-e25287c5.zip.
-
-
-Use the following commands to inference the Transformer model on the WMT14 test dataset for English to German translation.
-
-.. code-block:: console
-
- $ python inference_transformer.py --dataset WMT2014BPE
- --src_lang en \
- --tgt_lang de \
- --batch_size 2700 \
- --scaled \
- --num_buckets 20 \
- --bucket_scheme exp \
- --bleu 13a \
- --log_interval 10 \
- --gpu 0 \
- --model_parameter PATH/TO/valid_best.params
-
-Before inference, you should do a complete training at least one time to get the pre-trained model, or you can get the pre-trained model from http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/transformer_en_de_512_WMT2014-e25287c5.zip.
-
-For the users from China, it might be faster with this link instead: https://apache-mxnet.s3.cn-north-1.amazonaws.com.cn/gluon/models/transformer_en_de_512_WMT2014-e25287c5.zip.
-
diff --git a/scripts/machine_translation/inference_transformer.py b/scripts/machine_translation/inference_transformer.py
deleted file mode 100644
index 178270a6d6..0000000000
--- a/scripts/machine_translation/inference_transformer.py
+++ /dev/null
@@ -1,300 +0,0 @@
-"""
-Transformer
-=================================
-
-This example shows how to implement the Transformer model with Gluon NLP Toolkit.
-
-@inproceedings{vaswani2017attention,
- title={Attention is all you need},
- author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones,
- Llion and Gomez, Aidan N and Kaiser, Lukasz and Polosukhin, Illia},
- booktitle={Advances in Neural Information Processing Systems},
- pages={6000--6010},
- year={2017}
-}
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-
-import argparse
-import time
-import random
-import os
-import zipfile
-import logging
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-from mxnet.gluon.utils import download, check_sha1
-import gluonnlp as nlp
-
-from gluonnlp.loss import MaskedSoftmaxCELoss
-from gluonnlp.model.translation import NMTModel
-from gluonnlp.model.transformer import get_transformer_encoder_decoder
-from translation import BeamSearchTranslator
-from utils import logging_config
-from bleu import _bpe_to_words, compute_bleu
-import dataprocessor
-
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-
-nlp.utils.check_version('0.7.0')
-
-parser = argparse.ArgumentParser(description='Neural Machine Translation Example.'
- 'We use this script only for transformer inference.')
-parser.add_argument('--dataset', type=str, default='WMT2014BPE', help='Dataset to use.')
-parser.add_argument('--src_lang', type=str, default='en', help='Source language')
-parser.add_argument('--tgt_lang', type=str, default='de', help='Target language')
-parser.add_argument('--num_units', type=int, default=512, help='Dimension of the embedding '
- 'vectors and states.')
-parser.add_argument('--hidden_size', type=int, default=2048,
- help='Dimension of the hidden state in position-wise feed-forward networks.')
-parser.add_argument('--dropout', type=float, default=0.1,
- help='dropout applied to layers (0 = no dropout)')
-parser.add_argument('--num_layers', type=int, default=6,
- help='number of layers in the encoder and decoder')
-parser.add_argument('--num_heads', type=int, default=8,
- help='number of heads in multi-head attention')
-parser.add_argument('--scaled', action='store_true', help='Turn on to use scale in attention')
-parser.add_argument('--batch_size', type=int, default=1024,
- help='Batch size. Number of tokens in a minibatch')
-parser.add_argument('--beam_size', type=int, default=4, help='Beam size')
-parser.add_argument('--lp_alpha', type=float, default=0.6,
- help='Alpha used in calculating the length penalty')
-parser.add_argument('--lp_k', type=int, default=5, help='K used in calculating the length penalty')
-parser.add_argument('--test_batch_size', type=int, default=256, help='Test batch size')
-parser.add_argument('--num_buckets', type=int, default=10, help='Bucket number')
-parser.add_argument('--bucket_scheme', type=str, default='constant',
- help='Strategy for generating bucket keys. It supports: '
- '"constant": all the buckets have the same width; '
- '"linear": the width of bucket increases linearly; '
- '"exp": the width of bucket increases exponentially')
-parser.add_argument('--bucket_ratio', type=float, default=0.0, help='Ratio for increasing the '
- 'throughput of the bucketing')
-parser.add_argument('--src_max_len', type=int, default=-1, help='Maximum length of the source '
- 'sentence, -1 means no clipping')
-parser.add_argument('--tgt_max_len', type=int, default=-1, help='Maximum length of the target '
- 'sentence, -1 means no clipping')
-parser.add_argument('--full', action='store_true',
- help='In default, we use the test dataset in'
- ' http://statmt.org/wmt14/test-filtered.tgz.'
- ' When the option full is turned on, we use the test dataset in'
- ' http://statmt.org/wmt14/test-full.tgz')
-parser.add_argument('--bleu', type=str, default='tweaked',
- help='Schemes for computing bleu score. It can be: '
- '"tweaked": it uses similar steps in get_ende_bleu.sh in tensor2tensor '
- 'repository, where compound words are put in ATAT format; '
- '"13a": This uses official WMT tokenization and produces the same results'
- ' as official script (mteval-v13a.pl) used by WMT; '
- '"intl": This use international tokenization in mteval-v14a.pl')
-parser.add_argument('--log_interval', type=int, default=100, metavar='N',
- help='report interval')
-parser.add_argument('--save_dir', type=str, default='transformer_out',
- help='directory path to save the final model and training log')
-parser.add_argument('--gpu', type=int,
- help='gpu id, e.g. 0 or 1. Unspecified means using cpu.')
-parser.add_argument('--model_parameter', type=str, default=' ', required=True,
- help='model parameter for inference, must be provided.')
-
-args = parser.parse_args()
-logging_config(args.save_dir)
-logging.info(args)
-
-# data process
-data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab \
- = dataprocessor.load_translation_data(dataset=args.dataset, bleu=args.bleu, args=args)
-
-dataprocessor.write_sentences(test_tgt_sentences, os.path.join(args.save_dir, 'test_gt.txt'))
-
-data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False)
-data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_val)])
-data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_test)])
-
-data_train_lengths, data_val_lengths, data_test_lengths = [dataprocessor.get_data_lengths(x)
- for x in
- [data_train, data_val, data_test]]
-
-detokenizer = nlp.data.SacreMosesDetokenizer()
-
-# model prepare
-ctx = [mx.cpu()] if args.gpu is None else [mx.gpu(args.gpu)]
-
-if args.src_max_len <= 0 or args.tgt_max_len <= 0:
- max_len = np.max(
- [np.max(data_train_lengths, axis=0), np.max(data_val_lengths, axis=0),
- np.max(data_test_lengths, axis=0)],
- axis=0)
-
-if args.src_max_len > 0:
- src_max_len = args.src_max_len
-else:
- src_max_len = max_len[0]
-if args.tgt_max_len > 0:
- tgt_max_len = args.tgt_max_len
-else:
- tgt_max_len = max_len[1]
-
-encoder, decoder, one_step_ahead_decoder = get_transformer_encoder_decoder(
- units=args.num_units, hidden_size=args.hidden_size, dropout=args.dropout,
- num_layers=args.num_layers, num_heads=args.num_heads, max_src_length=max(src_max_len, 500),
- max_tgt_length=max(tgt_max_len, 500), scaled=args.scaled)
-model = NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder, decoder=decoder,
- one_step_ahead_decoder=one_step_ahead_decoder, share_embed=args.dataset != 'TOY',
- embed_size=args.num_units, tie_weights=args.dataset != 'TOY',
- embed_initializer=None, prefix='transformer_')
-
-param_name = args.model_parameter
-if (not os.path.exists(param_name)):
- archive_param_url = 'http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/{}'
- archive_file_hash = ('transformer_en_de_512_WMT2014-e25287c5.zip',
- '5193b469e0e2dfdda3c834f9212420758a0d1d71')
- param_file_hash = ('transformer_en_de_512_WMT2014-e25287c5.params',
- 'e25287c5a924b7025e08d626f02626d5fa3af2d1')
- archive_file, archive_hash = archive_file_hash
- param_file, param_hash = param_file_hash
- logging.warning('The provided param file {} does not exist, start to download it from {}...'
- .format(param_name, archive_param_url.format(archive_file)))
-
- root_dir = os.path.dirname(__file__)
- archive_file_path = '{}/{}'.format(root_dir, archive_file)
- param_name = '{}/{}'.format(root_dir, param_file)
- if (not os.path.exists(param_name) or not check_sha1(param_name, param_hash)):
- download(archive_param_url.format(archive_file),
- path=archive_file_path,
- sha1_hash=archive_hash)
- with zipfile.ZipFile(archive_file_path) as zf:
- zf.extractall(root_dir)
-
-model.load_parameters(param_name, ctx)
-
-static_alloc = True
-model.hybridize(static_alloc=static_alloc)
-logging.info(model)
-
-# translator prepare
-translator = BeamSearchTranslator(model=model, beam_size=args.beam_size,
- scorer=nlp.model.BeamSearchScorer(alpha=args.lp_alpha,
- K=args.lp_k),
- max_length=200)
-logging.info('Use beam_size={}, alpha={}, K={}'.format(args.beam_size, args.lp_alpha, args.lp_k))
-
-test_loss_function = MaskedSoftmaxCELoss()
-test_loss_function.hybridize(static_alloc=static_alloc)
-
-def inference():
- """inference function."""
- logging.info('Inference on test_dataset!')
-
- # data prepare
- test_data_loader = dataprocessor.get_dataloader(data_test, args,
- dataset_type='test',
- use_average_length=True)
-
- if args.bleu == 'tweaked':
- bpe = bool(args.dataset != 'IWSLT2015' and args.dataset != 'TOY')
- split_compound_word = bpe
- tokenized = True
- elif args.bleu == '13a' or args.bleu == 'intl':
- bpe = False
- split_compound_word = False
- tokenized = False
- else:
- raise NotImplementedError
-
- translation_out = []
- all_inst_ids = []
- total_wc = 0
- total_time = 0
- batch_total_blue = 0
-
- for batch_id, (src_seq, tgt_seq, src_test_length, tgt_test_length, inst_ids) \
- in enumerate(test_data_loader):
-
- total_wc += src_test_length.sum().asscalar() + tgt_test_length.sum().asscalar()
-
- src_seq = src_seq.as_in_context(ctx[0])
- tgt_seq = tgt_seq.as_in_context(ctx[0])
- src_test_length = src_test_length.as_in_context(ctx[0])
- tgt_test_length = tgt_test_length.as_in_context(ctx[0])
- all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
-
- start = time.time()
- # Translate to get a bleu score
- samples, _, sample_test_length = \
- translator.translate(src_seq=src_seq, src_valid_length=src_test_length)
- total_time += (time.time() - start)
-
- # generator the translator result for each batch
- max_score_sample = samples[:, 0, :].asnumpy()
- sample_test_length = sample_test_length[:, 0].asnumpy()
- translation_tmp = []
- translation_tmp_sentences = []
- for i in range(max_score_sample.shape[0]):
- translation_tmp.append([tgt_vocab.idx_to_token[ele] for ele in \
- max_score_sample[i][1:(sample_test_length[i] - 1)]])
-
- # detokenizer each translator result
- for _, sentence in enumerate(translation_tmp):
- if args.bleu == 'tweaked':
- translation_tmp_sentences.append(sentence)
- translation_out.append(sentence)
- elif args.bleu == '13a' or args.bleu == 'intl':
- translation_tmp_sentences.append(detokenizer(_bpe_to_words(sentence)))
- translation_out.append(detokenizer(_bpe_to_words(sentence)))
- else:
- raise NotImplementedError
-
- # generate tgt_sentence for bleu calculation of each batch
- tgt_sen_tmp = [test_tgt_sentences[index] for \
- _, index in enumerate(inst_ids.asnumpy().astype(np.int32).tolist())]
- batch_test_bleu_score, _, _, _, _ = compute_bleu([tgt_sen_tmp], translation_tmp_sentences,
- tokenized=tokenized, tokenizer=args.bleu,
- split_compound_word=split_compound_word,
- bpe=bpe)
- batch_total_blue += batch_test_bleu_score
-
- # log for every ten batchs
- if batch_id % 10 == 0 and batch_id != 0:
- batch_ave_bleu = batch_total_blue / 10
- batch_total_blue = 0
- logging.info('batch id={:d}, batch_bleu={:.4f}'
- .format(batch_id, batch_ave_bleu * 100))
-
- # reorg translation sentences by inst_ids
- real_translation_out = [None for _ in range(len(all_inst_ids))]
- for ind, sentence in zip(all_inst_ids, translation_out):
- real_translation_out[ind] = sentence
-
- # get bleu score, n-gram precisions, brevity penalty, reference length, and translation length
- test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], real_translation_out,
- tokenized=tokenized, tokenizer=args.bleu,
- split_compound_word=split_compound_word,
- bpe=bpe)
-
- logging.info('Inference at test dataset. \
- inference bleu={:.4f}, throughput={:.4f}K wps'
- .format(test_bleu_score * 100, total_wc / total_time / 1000))
-
-
-if __name__ == '__main__':
- inference()
diff --git a/scripts/machine_translation/train_gnmt.py b/scripts/machine_translation/train_gnmt.py
deleted file mode 100644
index da1c61f2d9..0000000000
--- a/scripts/machine_translation/train_gnmt.py
+++ /dev/null
@@ -1,285 +0,0 @@
-"""
-Google Neural Machine Translation
-=================================
-
-This example shows how to implement the GNMT model with Gluon NLP Toolkit.
-
-@article{wu2016google,
- title={Google's neural machine translation system:
- Bridging the gap between human and machine translation},
- author={Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and Le, Quoc V and
- Norouzi, Mohammad and Macherey, Wolfgang and Krikun, Maxim and Cao, Yuan and Gao, Qin and
- Macherey, Klaus and others},
- journal={arXiv preprint arXiv:1609.08144},
- year={2016}
-}
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-
-import argparse
-import time
-import random
-import os
-import logging
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-
-from gluonnlp.model.translation import NMTModel
-from gluonnlp.loss import MaskedSoftmaxCELoss
-from gnmt import get_gnmt_encoder_decoder
-from translation import BeamSearchTranslator
-from utils import logging_config
-from bleu import compute_bleu
-import dataprocessor
-
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-
-nlp.utils.check_version('0.9.0')
-
-parser = argparse.ArgumentParser(description='Neural Machine Translation Example.'
- 'We train the Google NMT model')
-parser.add_argument('--dataset', type=str, default='IWSLT2015', help='Dataset to use.')
-parser.add_argument('--src_lang', type=str, default='en', help='Source language')
-parser.add_argument('--tgt_lang', type=str, default='vi', help='Target language')
-parser.add_argument('--epochs', type=int, default=40, help='upper epoch limit')
-parser.add_argument('--num_hidden', type=int, default=128, help='Dimension of the embedding '
- 'vectors and states.')
-parser.add_argument('--dropout', type=float, default=0.2,
- help='dropout applied to layers (0 = no dropout)')
-parser.add_argument('--num_layers', type=int, default=2, help='number of layers in the encoder'
- ' and decoder')
-parser.add_argument('--num_bi_layers', type=int, default=1,
- help='number of bidirectional layers in the encoder and decoder')
-parser.add_argument('--batch_size', type=int, default=128, help='Batch size')
-parser.add_argument('--beam_size', type=int, default=4, help='Beam size')
-parser.add_argument('--lp_alpha', type=float, default=1.0,
- help='Alpha used in calculating the length penalty')
-parser.add_argument('--lp_k', type=int, default=5, help='K used in calculating the length penalty')
-parser.add_argument('--test_batch_size', type=int, default=32, help='Test batch size')
-parser.add_argument('--num_buckets', type=int, default=5, help='Bucket number')
-parser.add_argument('--bucket_scheme', type=str, default='constant',
- help='Strategy for generating bucket keys. It supports: '
- '"constant": all the buckets have the same width; '
- '"linear": the width of bucket increases linearly; '
- '"exp": the width of bucket increases exponentially')
-parser.add_argument('--bucket_ratio', type=float, default=0.0, help='Ratio for increasing the '
- 'throughput of the bucketing')
-parser.add_argument('--src_max_len', type=int, default=50, help='Maximum length of the source '
- 'sentence')
-parser.add_argument('--tgt_max_len', type=int, default=50, help='Maximum length of the target '
- 'sentence')
-parser.add_argument('--optimizer', type=str, default='adam', help='optimization algorithm')
-parser.add_argument('--lr', type=float, default=1E-3, help='Initial learning rate')
-parser.add_argument('--lr_update_factor', type=float, default=0.5,
- help='Learning rate decay factor')
-parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
-parser.add_argument('--log_interval', type=int, default=100, metavar='N',
- help='report interval')
-parser.add_argument('--save_dir', type=str, default='out_dir',
- help='directory path to save the final model and training log')
-parser.add_argument('--gpu', type=int, default=None,
- help='id of the gpu to use. Set it to empty means to use cpu.')
-args = parser.parse_args()
-print(args)
-logging_config(args.save_dir)
-
-
-data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab\
- = dataprocessor.load_translation_data(dataset=args.dataset, bleu='tweaked', args=args)
-
-dataprocessor.write_sentences(val_tgt_sentences, os.path.join(args.save_dir, 'val_gt.txt'))
-dataprocessor.write_sentences(test_tgt_sentences, os.path.join(args.save_dir, 'test_gt.txt'))
-
-data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False)
-data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_val)])
-data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_test)])
-if args.gpu is None:
- ctx = mx.cpu()
- print('Use CPU')
-else:
- ctx = mx.gpu(args.gpu)
-
-encoder, decoder, one_step_ahead_decoder = get_gnmt_encoder_decoder(
- hidden_size=args.num_hidden, dropout=args.dropout, num_layers=args.num_layers,
- num_bi_layers=args.num_bi_layers)
-model = NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder, decoder=decoder,
- one_step_ahead_decoder=one_step_ahead_decoder, embed_size=args.num_hidden,
- prefix='gnmt_')
-model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
-static_alloc = True
-model.hybridize(static_alloc=static_alloc)
-logging.info(model)
-
-translator = BeamSearchTranslator(model=model, beam_size=args.beam_size,
- scorer=nlp.model.BeamSearchScorer(alpha=args.lp_alpha,
- K=args.lp_k),
- max_length=args.tgt_max_len + 100)
-logging.info('Use beam_size={}, alpha={}, K={}'.format(args.beam_size, args.lp_alpha, args.lp_k))
-
-
-loss_function = MaskedSoftmaxCELoss()
-loss_function.hybridize(static_alloc=static_alloc)
-
-
-def evaluate(data_loader):
- """Evaluate given the data loader
-
- Parameters
- ----------
- data_loader : DataLoader
-
- Returns
- -------
- avg_loss : float
- Average loss
- real_translation_out : list of list of str
- The translation output
- """
- translation_out = []
- all_inst_ids = []
- avg_loss_denom = 0
- avg_loss = 0.0
- for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \
- in enumerate(data_loader):
- src_seq = src_seq.as_in_context(ctx)
- tgt_seq = tgt_seq.as_in_context(ctx)
- src_valid_length = src_valid_length.as_in_context(ctx)
- tgt_valid_length = tgt_valid_length.as_in_context(ctx)
- # Calculating Loss
- out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
- loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).sum().asscalar()
- all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
- avg_loss += loss * (tgt_seq.shape[1] - 1)
- avg_loss_denom += (tgt_valid_length - 1).sum().asscalar()
- # Translate
- samples, _, sample_valid_length = translator.translate(
- src_seq=src_seq, src_valid_length=src_valid_length)
- max_score_sample = samples[:, 0, :].asnumpy()
- sample_valid_length = sample_valid_length[:, 0].asnumpy()
- for i in range(max_score_sample.shape[0]):
- translation_out.append(
- [tgt_vocab.idx_to_token[ele] for ele in
- max_score_sample[i][1:(sample_valid_length[i] - 1)]])
- avg_loss = avg_loss / avg_loss_denom
- real_translation_out = [None for _ in range(len(all_inst_ids))]
- for ind, sentence in zip(all_inst_ids, translation_out):
- real_translation_out[ind] = sentence
- return avg_loss, real_translation_out
-
-
-def train():
- """Training function."""
- trainer = gluon.Trainer(model.collect_params(), args.optimizer, {'learning_rate': args.lr})
-
- train_data_loader, val_data_loader, test_data_loader \
- = dataprocessor.make_dataloader(data_train, data_val, data_test, args)
-
- best_valid_bleu = 0.0
- for epoch_id in range(args.epochs):
- log_loss = 0
- log_denom = 0
- log_avg_gnorm = 0
- log_wc = 0
- log_start_time = time.time()
- for batch_id, (src_seq, tgt_seq, src_valid_length, tgt_valid_length)\
- in enumerate(train_data_loader):
- # logging.info(src_seq.context) Context suddenly becomes GPU.
- src_seq = src_seq.as_in_context(ctx)
- tgt_seq = tgt_seq.as_in_context(ctx)
- src_valid_length = src_valid_length.as_in_context(ctx)
- tgt_valid_length = tgt_valid_length.as_in_context(ctx)
- with mx.autograd.record():
- out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
- loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean()
- loss = loss * (tgt_seq.shape[1] - 1)
- log_loss += loss * tgt_seq.shape[0]
- log_denom += (tgt_valid_length - 1).sum()
- loss = loss / (tgt_valid_length - 1).mean()
- loss.backward()
- grads = [p.grad(ctx) for p in model.collect_params().values()]
- gnorm = gluon.utils.clip_global_norm(grads, args.clip)
- trainer.step(1)
- src_wc = src_valid_length.sum().asscalar()
- tgt_wc = (tgt_valid_length - 1).sum().asscalar()
- log_loss = log_loss.asscalar()
- log_denom = log_denom.asscalar()
- log_avg_gnorm += gnorm
- log_wc += src_wc + tgt_wc
- if (batch_id + 1) % args.log_interval == 0:
- wps = log_wc / (time.time() - log_start_time)
- logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, gnorm={:.4f}, '
- 'throughput={:.2f}K wps, wc={:.2f}K'
- .format(epoch_id, batch_id + 1, len(train_data_loader),
- log_loss / log_denom,
- np.exp(log_loss / log_denom),
- log_avg_gnorm / args.log_interval,
- wps / 1000, log_wc / 1000))
- log_start_time = time.time()
- log_loss = 0
- log_denom = 0
- log_avg_gnorm = 0
- log_wc = 0
- valid_loss, valid_translation_out = evaluate(val_data_loader)
- valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences], valid_translation_out)
- logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
- .format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
- test_loss, test_translation_out = evaluate(test_data_loader)
- test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], test_translation_out)
- logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
- .format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100))
- dataprocessor.write_sentences(valid_translation_out,
- os.path.join(args.save_dir,
- 'epoch{:d}_valid_out.txt').format(epoch_id))
- dataprocessor.write_sentences(test_translation_out,
- os.path.join(args.save_dir,
- 'epoch{:d}_test_out.txt').format(epoch_id))
- if valid_bleu_score > best_valid_bleu:
- best_valid_bleu = valid_bleu_score
- save_path = os.path.join(args.save_dir, 'valid_best.params')
- logging.info('Save best parameters to {}'.format(save_path))
- model.save_parameters(save_path)
- if epoch_id + 1 >= (args.epochs * 2) // 3:
- new_lr = trainer.learning_rate * args.lr_update_factor
- logging.info('Learning rate change to {}'.format(new_lr))
- trainer.set_learning_rate(new_lr)
- if os.path.exists(os.path.join(args.save_dir, 'valid_best.params')):
- model.load_parameters(os.path.join(args.save_dir, 'valid_best.params'))
- valid_loss, valid_translation_out = evaluate(val_data_loader)
- valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences], valid_translation_out)
- logging.info('Best model valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
- .format(valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
- test_loss, test_translation_out = evaluate(test_data_loader)
- test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], test_translation_out)
- logging.info('Best model test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
- .format(test_loss, np.exp(test_loss), test_bleu_score * 100))
- dataprocessor.write_sentences(valid_translation_out,
- os.path.join(args.save_dir, 'best_valid_out.txt'))
- dataprocessor.write_sentences(test_translation_out,
- os.path.join(args.save_dir, 'best_test_out.txt'))
-
-
-if __name__ == '__main__':
- train()
diff --git a/scripts/machine_translation/train_transformer.py b/scripts/machine_translation/train_transformer.py
index baa8249c04..655c2771b5 100644
--- a/scripts/machine_translation/train_transformer.py
+++ b/scripts/machine_translation/train_transformer.py
@@ -2,7 +2,7 @@
Transformer
=================================
-This example shows how to implement the Transformer model with Gluon NLP Toolkit.
+This example shows how to implement the Transformer model with GluonNLP Toolkit.
@inproceedings{vaswani2017attention,
title={Attention is all you need},
@@ -33,380 +33,501 @@
# pylint:disable=redefined-outer-name,logging-format-interpolation
import argparse
+import time
+import random
+import os
import logging
+import itertools
import math
-import os
-import random
-import time
-
import numpy as np
import mxnet as mx
from mxnet import gluon
-
-import gluonnlp as nlp
-from gluonnlp.loss import LabelSmoothing, MaskedSoftmaxCELoss
-from gluonnlp.model.transformer import ParallelTransformer, get_transformer_encoder_decoder
-from gluonnlp.model.translation import NMTModel
-from gluonnlp.utils.parallel import Parallel
-import dataprocessor
-from bleu import _bpe_to_words, compute_bleu
-from translation import BeamSearchTranslator
-from utils import logging_config
-
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-
-nlp.utils.check_version('0.9.0')
-
-parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- description='Neural Machine Translation Example with the Transformer Model.')
-parser.add_argument('--dataset', type=str.upper, default='WMT2016BPE', help='Dataset to use.',
- choices=['IWSLT2015', 'WMT2016BPE', 'WMT2014BPE', 'TOY'])
-parser.add_argument('--src_lang', type=str, default='en', help='Source language')
-parser.add_argument('--tgt_lang', type=str, default='de', help='Target language')
-parser.add_argument('--epochs', type=int, default=10, help='upper epoch limit')
-parser.add_argument('--num_units', type=int, default=512, help='Dimension of the embedding '
- 'vectors and states.')
-parser.add_argument('--hidden_size', type=int, default=2048,
- help='Dimension of the hidden state in position-wise feed-forward networks.')
-parser.add_argument('--dropout', type=float, default=0.1,
- help='dropout applied to layers (0 = no dropout)')
-parser.add_argument('--epsilon', type=float, default=0.1,
- help='epsilon parameter for label smoothing')
-parser.add_argument('--num_layers', type=int, default=6,
- help='number of layers in the encoder and decoder')
-parser.add_argument('--num_heads', type=int, default=8,
- help='number of heads in multi-head attention')
-parser.add_argument('--scaled', action='store_true', help='Turn on to use scale in attention')
-parser.add_argument('--batch_size', type=int, default=1024,
- help='Batch size. Number of tokens per gpu in a minibatch')
-parser.add_argument('--beam_size', type=int, default=4, help='Beam size')
-parser.add_argument('--lp_alpha', type=float, default=0.6,
- help='Alpha used in calculating the length penalty')
-parser.add_argument('--lp_k', type=int, default=5, help='K used in calculating the length penalty')
-parser.add_argument('--test_batch_size', type=int, default=256, help='Test batch size')
-parser.add_argument('--num_buckets', type=int, default=10, help='Bucket number')
-parser.add_argument('--bucket_scheme', type=str, default='constant',
- help='Strategy for generating bucket keys. It supports: '
- '"constant": all the buckets have the same width; '
- '"linear": the width of bucket increases linearly; '
- '"exp": the width of bucket increases exponentially')
-parser.add_argument('--bucket_ratio', type=float, default=0.0, help='Ratio for increasing the '
- 'throughput of the bucketing')
-parser.add_argument('--src_max_len', type=int, default=-1, help='Maximum length of the source '
- 'sentence, -1 means no clipping')
-parser.add_argument('--tgt_max_len', type=int, default=-1, help='Maximum length of the target '
- 'sentence, -1 means no clipping')
-parser.add_argument('--optimizer', type=str, default='adam', help='optimization algorithm')
-parser.add_argument('--lr', type=float, default=1.0, help='Initial learning rate')
-parser.add_argument('--warmup_steps', type=float, default=4000,
- help='number of warmup steps used in NOAM\'s stepsize schedule')
-parser.add_argument('--num_accumulated', type=int, default=1,
- help='Number of steps to accumulate the gradients. '
- 'This is useful to mimic large batch training with limited gpu memory')
-parser.add_argument('--magnitude', type=float, default=3.0,
- help='Magnitude of Xavier initialization')
-parser.add_argument('--average_checkpoint', action='store_true',
- help='Turn on to perform final testing based on '
- 'the average of last few checkpoints')
-parser.add_argument('--num_averages', type=int, default=5,
- help='Perform final testing based on the '
- 'average of last num_averages checkpoints. '
- 'This is only used if average_checkpoint is True')
-parser.add_argument('--average_start', type=int, default=5,
- help='Perform average SGD on last average_start epochs')
-parser.add_argument('--full', action='store_true',
- help='In default, we use the test dataset in'
- ' http://statmt.org/wmt14/test-filtered.tgz.'
- ' When the option full is turned on, we use the test dataset in'
- ' http://statmt.org/wmt14/test-full.tgz')
-parser.add_argument('--bleu', type=str, default='tweaked',
- help='Schemes for computing bleu score. It can be: '
- '"tweaked": it uses similar steps in get_ende_bleu.sh in tensor2tensor '
- 'repository, where compound words are put in ATAT format; '
- '"13a": This uses official WMT tokenization and produces the same results'
- ' as official script (mteval-v13a.pl) used by WMT; '
- '"intl": This use international tokenization in mteval-v14a.pl')
-parser.add_argument('--log_interval', type=int, default=100, metavar='N',
- help='report interval')
-parser.add_argument('--save_dir', type=str, default='transformer_out',
- help='directory path to save the final model and training log')
-parser.add_argument('--gpus', type=str,
- help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu.'
- '(using single gpu is suggested)')
-args = parser.parse_args()
-logging_config(args.save_dir)
-logging.info(args)
-
-
-data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab \
- = dataprocessor.load_translation_data(dataset=args.dataset, bleu=args.bleu, args=args)
-
-dataprocessor.write_sentences(val_tgt_sentences, os.path.join(args.save_dir, 'val_gt.txt'))
-dataprocessor.write_sentences(test_tgt_sentences, os.path.join(args.save_dir, 'test_gt.txt'))
-
-data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False)
-data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_val)])
-data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_test)])
-
-ctx = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
- [mx.gpu(int(x)) for x in args.gpus.split(',')]
-num_ctxs = len(ctx)
-
-data_train_lengths, data_val_lengths, data_test_lengths = [dataprocessor.get_data_lengths(x)
- for x in
- [data_train, data_val, data_test]]
-
-if args.src_max_len <= 0 or args.tgt_max_len <= 0:
- max_len = np.max(
- [np.max(data_train_lengths, axis=0), np.max(data_val_lengths, axis=0),
- np.max(data_test_lengths, axis=0)],
- axis=0)
-if args.src_max_len > 0:
- src_max_len = args.src_max_len
-else:
- src_max_len = max_len[0]
-if args.tgt_max_len > 0:
- tgt_max_len = args.tgt_max_len
-else:
- tgt_max_len = max_len[1]
-encoder, decoder, one_step_ahead_decoder = get_transformer_encoder_decoder(
- units=args.num_units, hidden_size=args.hidden_size, dropout=args.dropout,
- num_layers=args.num_layers, num_heads=args.num_heads, max_src_length=max(src_max_len, 500),
- max_tgt_length=max(tgt_max_len, 500), scaled=args.scaled)
-model = NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder, decoder=decoder,
- one_step_ahead_decoder=one_step_ahead_decoder,
- share_embed=args.dataset not in ('TOY', 'IWSLT2015'), embed_size=args.num_units,
- tie_weights=args.dataset not in ('TOY', 'IWSLT2015'), embed_initializer=None,
- prefix='transformer_')
-model.initialize(init=mx.init.Xavier(magnitude=args.magnitude), ctx=ctx)
-static_alloc = True
-model.hybridize(static_alloc=static_alloc)
-logging.info(model)
-
-translator = BeamSearchTranslator(model=model, beam_size=args.beam_size,
- scorer=nlp.model.BeamSearchScorer(alpha=args.lp_alpha,
- K=args.lp_k),
- max_length=200)
-logging.info('Use beam_size={}, alpha={}, K={}'.format(args.beam_size, args.lp_alpha, args.lp_k))
-
-label_smoothing = LabelSmoothing(epsilon=args.epsilon, units=len(tgt_vocab))
-label_smoothing.hybridize(static_alloc=static_alloc)
-
-loss_function = MaskedSoftmaxCELoss(sparse_label=False)
-loss_function.hybridize(static_alloc=static_alloc)
-
-test_loss_function = MaskedSoftmaxCELoss()
-test_loss_function.hybridize(static_alloc=static_alloc)
-
-rescale_loss = 100.
-parallel_model = ParallelTransformer(model, label_smoothing, loss_function, rescale_loss)
-detokenizer = nlp.data.SacreMosesDetokenizer()
-
-
-def evaluate(data_loader, context=ctx[0]):
- """Evaluate given the data loader
+from gluonnlp.models.transformer import TransformerModel
+from gluonnlp.utils.misc import logging_config, AverageSGDTracker, count_parameters,\
+ md5sum, grouper, init_comm
+from gluonnlp.data.sampler import (
+ ConstWidthBucket,
+ LinearWidthBucket,
+ ExpWidthBucket,
+ FixedBucketSampler,
+ BoundedBudgetSampler,
+ ShardedIterator
+)
+import gluonnlp.data.batchify as bf
+from gluonnlp.data import Vocab
+from gluonnlp.data import tokenizers
+from gluonnlp.data.tokenizers import BaseTokenizerWithVocab
+from gluonnlp.lr_scheduler import InverseSquareRootScheduler
+from gluonnlp.loss import LabelSmoothCrossEntropyLoss
+try:
+ import horovod.mxnet as hvd
+except ImportError:
+ hvd = None
+
+mx.npx.set_np()
+
+
+CACHE_PATH = os.path.realpath(os.path.join(os.path.realpath(__file__), '..', 'cached'))
+if not os.path.exists(CACHE_PATH):
+ os.makedirs(CACHE_PATH, exist_ok=True)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Transformer for Neural Machine Translation.')
+ parser.add_argument('--train_src_corpus', type=str,
+ help='The source training corpus.')
+ parser.add_argument('--train_tgt_corpus', type=str,
+ help='The target training corpus.')
+ parser.add_argument('--dev_src_corpus', type=str,
+ help='The source dev corpus.')
+ parser.add_argument('--dev_tgt_corpus', type=str,
+ help='The target dev corpus.')
+ parser.add_argument('--src_tokenizer', choices=['spm',
+ 'subword_nmt',
+ 'yttm',
+ 'hf_bytebpe',
+ 'hf_wordpiece',
+ 'hf_bpe',
+ 'whitespace'],
+ default='whitespace', type=str,
+                        help='The source tokenizer. '
+                             'The whitespace tokenizer supports processing a pre-encoded corpus, '
+                             'while the other tokenizers support online encoding.')
+ parser.add_argument('--tgt_tokenizer', choices=['spm',
+ 'subword_nmt',
+ 'yttm',
+ 'hf_bytebpe',
+ 'hf_wordpiece',
+ 'hf_bpe',
+ 'whitespace'],
+ default='whitespace', type=str,
+ help='The target tokenizer.')
+ parser.add_argument('--src_subword_model_path', type=str,
+ help='Path to the source subword model.')
+ parser.add_argument('--src_vocab_path', type=str,
+ help='Path to the source vocab.')
+ parser.add_argument('--tgt_subword_model_path', type=str,
+ help='Path to the target subword model.')
+ parser.add_argument('--tgt_vocab_path', type=str,
+ help='Path to the target vocab.')
+ parser.add_argument('--seed', type=int, default=100, help='The random seed.')
+    parser.add_argument('--epochs', type=int, default=30, help='Upper epoch limit. '
+                        'The model keeps training indefinitely when both epochs < 0 and max_update < 0.')
+    parser.add_argument('--max_update', type=int, default=-1,
+                        help='Maximum number of update steps. When max_update > 0, epochs is set to -1, '
+                             'and each update step consumes gpu_num * num_accumulated batches.')
+ parser.add_argument('--save_interval_update', type=int, default=500,
+                        help='Interval (in update steps) for saving checkpoints when max_update is used.')
+ parser.add_argument('--cfg', type=str, default='transformer_base',
+ help='Configuration of the transformer model. '
+                             'You may provide a .yml file or use one of the prebuilt configurations.')
+ parser.add_argument('--label_smooth_alpha', type=float, default=0.1,
+ help='Weight of label smoothing')
+ parser.add_argument('--sampler', type=str, choices=['BoundedBudgetSampler', 'FixedBucketSampler'],
+ default='FixedBucketSampler', help='Type of sampler')
+ parser.add_argument('--batch_size', type=int, default=2700,
+ help='Batch size. Number of tokens per gpu in a minibatch.')
+ parser.add_argument('--val_batch_size', type=int, default=16,
+ help='Batch size for evaluation.')
+ parser.add_argument('--num_buckets', type=int, default=20, help='Bucket number.')
+ parser.add_argument('--bucket_scheme', type=str, default='exp',
+ help='Strategy for generating bucket keys. It supports: '
+ '"constant": all the buckets have the same width; '
+ '"linear": the width of bucket increases linearly; '
+ '"exp": the width of bucket increases exponentially')
+ parser.add_argument('--bucket_ratio', type=float, default=0.0,
+ help='Ratio for increasing the throughput of the bucketing')
+    parser.add_argument('--max_num_tokens', type=int, default=-1,
+                        help='Maximum number of tokens per batch. Only used by BoundedBudgetSampler.')
+    parser.add_argument('--max_num_sentences', type=int, default=-1,
+                        help='Maximum number of sentences per batch. Only used by BoundedBudgetSampler.')
+ parser.add_argument('--lr', type=float, default=0.002,
+ help='The learning rate at the end of the warmup stage. '
+ 'If it is not given, we will use the formula suggested in the '
+ 'original Transformer paper:'
+ ' 1.0 / sqrt(d_model) / sqrt(warmup_steps). '
+ 'Otherwise, we will use the given lr as the final learning rate in '
+ 'the warmup phase.')
+ parser.add_argument('--warmup_steps', type=int, default=4000,
+                        help='Number of warmup steps used in the Noam learning rate schedule.')
+ parser.add_argument('--warmup_init_lr', type=float, default=0.0,
+ help='Initial learning rate at the beginning of the warm-up stage')
+ parser.add_argument('--num_accumulated', type=int, default=32,
+ help='Number of steps to accumulate the gradients. '
+ 'This is useful to mimic large batch training with limited gpu memory')
+ parser.add_argument('--magnitude', type=float, default=3.0,
+ help='Magnitude of Xavier initialization')
+ parser.add_argument('--num_averages', type=int, default=-1,
+                        help='Perform final testing based on the '
+                             'average of the last num_averages checkpoints. '
+                             'Enabling checkpoint averaging causes extra GPU memory usage.')
+ parser.add_argument('--log_interval', type=int, default=10, metavar='N',
+ help='report interval')
+ parser.add_argument('--save_dir', type=str, default='transformer_out',
+ help='directory path to save the final model and training log')
+ parser.add_argument('--overwrite_cache', action='store_true')
+ parser.add_argument('--fp16', action='store_true',
+ help='Whether to use dtype float16')
+ parser.add_argument('--comm_backend', type=str, default='device',
+ choices=['horovod', 'dist_sync_device', 'device'],
+ help='Communication backend.')
+ parser.add_argument('--gpus', type=str,
+                        help='List of gpus to run on, e.g. 0 or 0,2,5. Empty means using the CPU.')
+ args = parser.parse_args()
+ if args.max_update > 0:
+ args.epochs = -1
+ logging_config(args.save_dir, console=True)
+ logging.info(args)
+ return args
+
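The --lr, --warmup_steps, and --warmup_init_lr options above describe an inverse-square-root schedule: the learning rate ramps linearly from warmup_init_lr to lr over warmup_steps updates and then decays like 1/sqrt(step). The sketch below is a plain-Python illustration of that shape under those assumptions; it is not the InverseSquareRootScheduler class used later, whose internals may differ.

import math

def inverse_sqrt_lr(step, base_lr=0.002, warmup_steps=4000, warmup_init_lr=0.0):
    # Illustrative schedule: linear warmup, then 1/sqrt(step) decay anchored at base_lr.
    if step < warmup_steps:
        return warmup_init_lr + (base_lr - warmup_init_lr) * step / warmup_steps
    return base_lr * math.sqrt(warmup_steps) / math.sqrt(step)

for step in (1, 1000, 4000, 16000, 64000):
    print(step, round(inverse_sqrt_lr(step), 6))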
+
+def validation(model, data_loader, ctx_l):
+ """Validate the model on the dataset
Parameters
----------
+ model : TransformerModel
+ The transformer model
data_loader : DataLoader
+        The validation data loader.
+    ctx_l : list
+        List of mx.Context, one per device used for evaluation.
+
Returns
-------
- avg_loss : float
- Average loss
- real_translation_out : list of list of str
- The translation output
+ avg_nll_loss : float
+ The average negative log-likelihood loss
"""
- translation_out = []
- all_inst_ids = []
- avg_loss_denom = 0
- avg_loss = 0.0
- for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \
- in enumerate(data_loader):
- src_seq = src_seq.as_in_context(context)
- tgt_seq = tgt_seq.as_in_context(context)
- src_valid_length = src_valid_length.as_in_context(context)
- tgt_valid_length = tgt_valid_length.as_in_context(context)
- # Calculating Loss
- out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
- loss = test_loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar()
- all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
- avg_loss += loss * (tgt_seq.shape[1] - 1)
- avg_loss_denom += (tgt_seq.shape[1] - 1)
- # Translate
- samples, _, sample_valid_length = \
- translator.translate(src_seq=src_seq, src_valid_length=src_valid_length)
- max_score_sample = samples[:, 0, :].asnumpy()
- sample_valid_length = sample_valid_length[:, 0].asnumpy()
- for i in range(max_score_sample.shape[0]):
- translation_out.append(
- [tgt_vocab.idx_to_token[ele] for ele in
- max_score_sample[i][1:(sample_valid_length[i] - 1)]])
- avg_loss = avg_loss / avg_loss_denom
- real_translation_out = [None for _ in range(len(all_inst_ids))]
- for ind, sentence in zip(all_inst_ids, translation_out):
- if args.bleu == 'tweaked':
- real_translation_out[ind] = sentence
- elif args.bleu == '13a' or args.bleu == 'intl':
- real_translation_out[ind] = detokenizer(_bpe_to_words(sentence))
- else:
- raise NotImplementedError
- return avg_loss, real_translation_out
-
-
-def train():
- """Training function."""
- trainer = gluon.Trainer(model.collect_params(), args.optimizer,
- {'learning_rate': args.lr, 'beta2': 0.98, 'epsilon': 1e-9})
+ avg_nll_loss = mx.np.array(0, dtype=np.float32, ctx=mx.cpu())
+ ntokens = 0
+ for sample_data_l in grouper(data_loader, len(ctx_l)):
+ loss_l = []
+ ntokens += sum([ele[3].sum().asnumpy() - ele[0].shape[0] for ele in sample_data_l
+ if ele is not None])
+ for sample_data, ctx in zip(sample_data_l, ctx_l):
+ if sample_data is None:
+ continue
+ src_token_ids, tgt_token_ids, src_valid_length, tgt_valid_length, sample_ids = sample_data
+ src_token_ids = src_token_ids.as_in_ctx(ctx)
+ tgt_token_ids = tgt_token_ids.as_in_ctx(ctx)
+ src_valid_length = src_valid_length.as_in_ctx(ctx)
+ tgt_valid_length = tgt_valid_length.as_in_ctx(ctx)
+ tgt_pred = model(src_token_ids, src_valid_length, tgt_token_ids[:, :-1],
+ tgt_valid_length - 1)
+ tgt_labels = tgt_token_ids[:, 1:]
+ tgt_pred_logits = mx.npx.log_softmax(tgt_pred, axis=-1)
+ nll_loss = - mx.npx.pick(tgt_pred_logits, tgt_labels, axis=-1)
+ loss = mx.npx.sequence_mask(nll_loss,
+ sequence_length=tgt_valid_length - 1,
+ use_sequence_length=True,
+ axis=1)
+ loss_l.append(loss.sum())
+ avg_nll_loss += sum([loss.as_in_ctx(mx.cpu()) for loss in loss_l])
+ mx.npx.waitall()
+ avg_loss = avg_nll_loss.asnumpy() / ntokens
+ return avg_loss
+
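validation() accumulates masked per-token negative log-likelihood across devices and divides by the number of target label tokens; the training loop reports np.exp of that value as perplexity. Below is a standalone NumPy illustration of the same bookkeeping, with made-up shapes and values.

import numpy as np

# Fake per-token NLL for two sequences with 4 label positions each.
nll = np.array([[2.1, 1.7, 0.9, 0.5],
                [1.2, 0.8, 0.0, 0.0]])
valid_length = np.array([4, 2])  # number of label tokens per sequence

# Mask padded positions, mirroring mx.npx.sequence_mask(..., axis=1).
mask = np.arange(nll.shape[1])[None, :] < valid_length[:, None]
avg_nll = (nll * mask).sum() / valid_length.sum()
print('avg nll = %.4f, ppl = %.4f' % (avg_nll, np.exp(avg_nll)))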
+
+def load_dataset_with_cache(src_corpus_path: str,
+ tgt_corpus_path: str,
+ src_tokenizer: BaseTokenizerWithVocab,
+ tgt_tokenizer: BaseTokenizerWithVocab,
+ overwrite_cache: bool,
+ local_rank: int):
+ # TODO online h5py multi processing encode (Tao)
+ src_md5sum = md5sum(src_corpus_path)
+ tgt_md5sum = md5sum(tgt_corpus_path)
+ cache_filepath = os.path.join(CACHE_PATH,
+ '{}_{}.cache.npz'.format(src_md5sum[:6], tgt_md5sum[:6]))
+ if os.path.exists(cache_filepath) and not overwrite_cache:
+ if local_rank == 0:
+ logging.info('Load cache from {}'.format(cache_filepath))
+ npz_data = np.load(cache_filepath, allow_pickle=True)
+ src_data, tgt_data = npz_data['src_data'][:], npz_data['tgt_data'][:]
+ else:
+ assert src_tokenizer.vocab.eos_id is not None,\
+ 'You will need to add the EOS token to the vocabulary used in the tokenizer of ' \
+ 'the source language.'
+ assert tgt_tokenizer.vocab.bos_id is not None and tgt_tokenizer.vocab.eos_id is not None, \
+ 'You will need to add both the BOS token and the EOS tokens to the vocabulary used ' \
+ 'in the tokenizer of the target language.'
+ src_data = []
+ tgt_data = []
+ # TODO(sxjscience) Optimize the speed of converting to cache
+ with open(src_corpus_path) as f:
+ for line in f:
+ sample = np.array(src_tokenizer.encode(line.strip(), output_type=int) +
+ [src_tokenizer.vocab.eos_id], dtype=np.int32)
+ src_data.append(sample)
+ with open(tgt_corpus_path) as f:
+ for line in f:
+ sample = np.array([tgt_tokenizer.vocab.bos_id] +
+ tgt_tokenizer.encode(line.strip(), output_type=int) +
+ [tgt_tokenizer.vocab.eos_id], dtype=np.int32)
+ tgt_data.append(sample)
+ src_data = np.array(src_data)
+ tgt_data = np.array(tgt_data)
+ np.savez(cache_filepath, src_data=src_data, tgt_data=tgt_data)
+ return src_data, tgt_data
+
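load_dataset_with_cache keys its cache file on the first six hex digits of each corpus file's MD5, so unchanged corpora are reloaded from the .npz file while any edit produces a new cache entry. The following standalone sketch shows that naming and save/load round-trip, using throwaway files and a simplified stand-in for gluonnlp's md5sum helper.

import hashlib
import numpy as np

def short_md5(path):
    # Simplified stand-in for gluonnlp.utils.misc.md5sum, truncated to 6 hex chars.
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()[:6]

with open('toy.en', 'w') as f:
    f.write('hello world\n')
with open('toy.de', 'w') as f:
    f.write('hallo welt\n')

cache_file = '{}_{}.cache.npz'.format(short_md5('toy.en'), short_md5('toy.de'))
src_data = np.array([np.array([3, 4, 2], dtype=np.int32),
                     np.array([7, 2], dtype=np.int32)], dtype=object)
tgt_data = np.array([np.array([1, 5, 6, 2], dtype=np.int32),
                     np.array([1, 8, 2], dtype=np.int32)], dtype=object)
np.savez(cache_file, src_data=src_data, tgt_data=tgt_data)

loaded = np.load(cache_file, allow_pickle=True)
print(cache_file, loaded['src_data'][0], loaded['tgt_data'][1])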
+
+def create_tokenizer(tokenizer_type, model_path, vocab_path):
+ if tokenizer_type == 'whitespace':
+ return tokenizers.create(tokenizer_type, vocab=Vocab.load(vocab_path))
+ elif tokenizer_type == 'spm':
+ return tokenizers.create(tokenizer_type, model_path=model_path, vocab=vocab_path)
+ elif tokenizer_type == 'subword_nmt':
+ return tokenizers.create(tokenizer_type, codec_path=model_path, vocab_path=vocab_path)
+ elif tokenizer_type == 'yttm':
+ return tokenizers.create(tokenizer_type, model_path=model_path)
+ elif tokenizer_type == 'hf_bytebpe':
+ return tokenizers.create(tokenizer_type, merges_file=model_path, vocab_file=vocab_path)
+ elif tokenizer_type == 'hf_wordpiece':
+ return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
+ elif tokenizer_type == 'hf_bpe':
+ return tokenizers.create(tokenizer_type, merges_file=model_path, vocab_file=vocab_path)
+ else:
+ raise NotImplementedError
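create_tokenizer simply forwards to gluonnlp.data.tokenizers.create with the keyword names each backend expects. Below is a hedged usage sketch for the whitespace case; 'wmt.vocab' is a placeholder path assumed to contain a vocabulary saved with gluonnlp's Vocab, not a file shipped with this script.

# Hypothetical usage; 'wmt.vocab' is a placeholder path.
tokenizer = create_tokenizer('whitespace', model_path=None, vocab_path='wmt.vocab')
token_ids = tokenizer.encode('ein kleines Beispiel', output_type=int)
print(token_ids, tokenizer.vocab.eos_id)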
- train_data_loader, val_data_loader, test_data_loader \
- = dataprocessor.make_dataloader(data_train, data_val, data_test, args,
- use_average_length=True, num_shards=len(ctx))
- if args.bleu == 'tweaked':
- bpe = bool(args.dataset != 'IWSLT2015' and args.dataset != 'TOY')
- split_compound_word = bpe
- tokenized = True
- elif args.bleu == '13a' or args.bleu == 'intl':
- bpe = False
- split_compound_word = False
- tokenized = False
+def train(args):
+ _, num_parts, rank, local_rank, _, ctx_l = init_comm(
+ args.comm_backend, args.gpus)
+ src_tokenizer = create_tokenizer(args.src_tokenizer,
+ args.src_subword_model_path,
+ args.src_vocab_path)
+ tgt_tokenizer = create_tokenizer(args.tgt_tokenizer,
+ args.tgt_subword_model_path,
+ args.tgt_vocab_path)
+ src_vocab = src_tokenizer.vocab
+ tgt_vocab = tgt_tokenizer.vocab
+ train_src_data, train_tgt_data = load_dataset_with_cache(args.train_src_corpus,
+ args.train_tgt_corpus,
+ src_tokenizer,
+ tgt_tokenizer,
+ args.overwrite_cache,
+ local_rank)
+ dev_src_data, dev_tgt_data = load_dataset_with_cache(args.dev_src_corpus,
+ args.dev_tgt_corpus,
+ src_tokenizer,
+ tgt_tokenizer,
+ args.overwrite_cache,
+ local_rank)
+ data_train = gluon.data.SimpleDataset(
+ [(src_tokens, tgt_tokens, len(src_tokens), len(tgt_tokens), i)
+ for i, (src_tokens, tgt_tokens) in enumerate(zip(train_src_data, train_tgt_data))])
+ data_val = gluon.data.SimpleDataset(
+ [(src_tokens, tgt_tokens, len(src_tokens), len(tgt_tokens), i)
+ for i, (src_tokens, tgt_tokens) in enumerate(zip(dev_src_data, dev_tgt_data))])
+ # Construct the model + loss function
+ if args.cfg.endswith('.yml'):
+ cfg = TransformerModel.get_cfg().clone_merge(args.cfg)
+ else:
+ cfg = TransformerModel.get_cfg(args.cfg)
+ cfg.defrost()
+ cfg.MODEL.src_vocab_size = len(src_vocab)
+ cfg.MODEL.tgt_vocab_size = len(tgt_vocab)
+ if args.fp16:
+ raise NotImplementedError
+# cfg.MODEL.dtype = 'float16'
+ cfg.freeze()
+ model = TransformerModel.from_cfg(cfg)
+ model.initialize(mx.init.Xavier(magnitude=args.magnitude),
+ ctx=ctx_l)
+ model.hybridize()
+ if local_rank == 0:
+ logging.info(model)
+ with open(os.path.join(args.save_dir, 'config.yml'), 'w') as cfg_f:
+ cfg_f.write(cfg.dump())
+ label_smooth_loss = LabelSmoothCrossEntropyLoss(num_labels=len(tgt_vocab),
+ alpha=args.label_smooth_alpha,
+ from_logits=False)
+ label_smooth_loss.hybridize()
+ rescale_loss = 100.0
+
+ if args.comm_backend == 'horovod':
+ hvd.broadcast_parameters(model.collect_params(), root_rank=0)
+
+ # Construct the trainer
+ # TODO(sxjscience) Support AMP
+ if args.lr is None:
+ base_lr = 2.0 / math.sqrt(args.num_units) / math.sqrt(args.warmup_steps)
+ else:
+ base_lr = args.lr
+ lr_scheduler = InverseSquareRootScheduler(warmup_steps=args.warmup_steps, base_lr=base_lr,
+ warmup_init_lr=args.warmup_init_lr)
+ trainer_settings = (model.collect_params(), 'adam',
+ {'learning_rate': args.lr, 'beta1': 0.9,
+ 'beta2': 0.98, 'epsilon': 1e-9, 'lr_scheduler': lr_scheduler})
+ if args.comm_backend == 'horovod':
+ trainer = hvd.DistributedTrainer(*trainer_settings)
+ else:
+ trainer = gluon.Trainer(*trainer_settings)
+ # Load Data
+ if args.sampler == 'BoundedBudgetSampler':
+ train_batch_sampler = BoundedBudgetSampler(lengths=[(ele[2], ele[3]) for ele in data_train],
+ max_num_tokens=args.max_num_tokens,
+ max_num_sentences=args.max_num_sentences,
+ seed=args.seed)
+ if num_parts > 1:
+ train_batch_sampler = ShardedIterator(train_batch_sampler, num_parts=num_parts, part_index=rank)
+ elif args.sampler == 'FixedBucketSampler':
+ if args.comm_backend == 'horovod':
+ raise NotImplementedError('FixedBucketSampler does not support horovod at present')
+
+ if args.bucket_scheme == 'constant':
+ bucket_scheme = ConstWidthBucket()
+ elif args.bucket_scheme == 'linear':
+ bucket_scheme = LinearWidthBucket()
+ elif args.bucket_scheme == 'exp':
+ bucket_scheme = ExpWidthBucket(bucket_len_step=1.2)
+ else:
+ raise NotImplementedError
+ # TODO(sxjscience) Support auto-bucket-size tuning
+ train_batch_sampler = FixedBucketSampler(lengths=[(ele[2], ele[3]) for ele in data_train],
+ batch_size=args.batch_size,
+ num_buckets=args.num_buckets,
+ ratio=args.bucket_ratio,
+ shuffle=True,
+ use_average_length=True,
+ bucket_scheme=bucket_scheme,
+ seed=args.seed)
else:
raise NotImplementedError
- best_valid_bleu = 0.0
- step_num = 0
- warmup_steps = args.warmup_steps
- grad_interval = args.num_accumulated
- model.collect_params().setattr('grad_req', 'add')
- average_start = (len(train_data_loader) // grad_interval) * (args.epochs - args.average_start)
- average_param_dict = None
- model.collect_params().zero_grad()
- parallel = Parallel(num_ctxs, parallel_model)
- for epoch_id in range(args.epochs):
- log_avg_loss = 0
- log_wc = 0
- loss_denom = 0
- step_loss = 0
- log_start_time = time.time()
- for batch_id, seqs \
- in enumerate(train_data_loader):
- if batch_id % grad_interval == 0:
- step_num += 1
- new_lr = args.lr / math.sqrt(args.num_units) \
- * min(1. / math.sqrt(step_num), step_num * warmup_steps ** (-1.5))
- trainer.set_learning_rate(new_lr)
- src_wc, tgt_wc, bs = np.sum([(shard[2].sum(), shard[3].sum(), shard[0].shape[0])
- for shard in seqs], axis=0)
- seqs = [[seq.as_in_context(context) for seq in shard]
- for context, shard in zip(ctx, seqs)]
- Ls = []
- for seq in seqs:
- parallel.put((seq, args.batch_size))
- Ls = [parallel.get() for _ in range(len(ctx))]
- src_wc = src_wc.asscalar()
- tgt_wc = tgt_wc.asscalar()
- loss_denom += tgt_wc - bs
- if batch_id % grad_interval == grad_interval - 1 or\
- batch_id == len(train_data_loader) - 1:
- if average_param_dict is None:
- average_param_dict = {k: v.data(ctx[0]).copy() for k, v in
- model.collect_params().items()}
- trainer.step(float(loss_denom) / args.batch_size / rescale_loss)
- param_dict = model.collect_params()
- param_dict.zero_grad()
- if step_num > average_start:
- alpha = 1. / max(1, step_num - average_start)
- for name, average_param in average_param_dict.items():
- average_param[:] += alpha * (param_dict[name].data(ctx[0]) - average_param)
- step_loss += sum([L.asscalar() for L in Ls])
- if batch_id % grad_interval == grad_interval - 1 or\
- batch_id == len(train_data_loader) - 1:
- log_avg_loss += step_loss / loss_denom * args.batch_size * rescale_loss
+ logging.info(train_batch_sampler)
+
+ batchify_fn = bf.Tuple(bf.Pad(), bf.Pad(), bf.Stack(), bf.Stack(), bf.Stack())
+ train_data_loader = gluon.data.DataLoader(data_train,
+ batch_sampler=train_batch_sampler,
+ batchify_fn=batchify_fn,
+ num_workers=0)
+
+ val_data_loader = gluon.data.DataLoader(data_val,
+ batch_size=args.val_batch_size,
+ batchify_fn=batchify_fn,
+ num_workers=0,
+ shuffle=False)
+ for v in model.collect_params().values():
+ if v.grad_req != 'null':
+ v.grad_req = 'add'
+ model.zero_grad()
+ model_averager = AverageSGDTracker(model.collect_params())
+ log_start_time = time.time()
+ num_params, num_fixed_params = None, None
+ # TODO(sxjscience) Add a log metric class
+ accum_count = 0
+ loss_denom = 0
+ n_train_iters = 0
+ log_wc = 0
+ log_avg_loss = 0.0
+ log_loss_denom = 0
+ epoch_id = 0
+ while (args.epochs < 0 or epoch_id < args.epochs): # when args.epochs < 0, the model will keep training
+ n_epoch_train_iters = 0
+ processed_batch_num = 0
+ train_multi_data_loader = grouper(train_data_loader, len(ctx_l))
+ is_last_batch = False
+ sample_data_l = next(train_multi_data_loader)
+ while not is_last_batch:
+ processed_batch_num += len(sample_data_l)
+ loss_l = []
+ for sample_data, ctx in zip(sample_data_l, ctx_l):
+ if sample_data is None:
+ continue
+ src_token_ids, tgt_token_ids, src_valid_length, tgt_valid_length, sample_ids = sample_data
+ src_wc, tgt_wc, bs = src_valid_length.sum(), tgt_valid_length.sum(), src_token_ids.shape[0]
+ loss_denom += tgt_wc - bs
+ log_loss_denom += tgt_wc - bs
+ log_wc += src_wc + tgt_wc
+ src_token_ids = src_token_ids.as_in_ctx(ctx)
+ tgt_token_ids = tgt_token_ids.as_in_ctx(ctx)
+ src_valid_length = src_valid_length.as_in_ctx(ctx)
+ tgt_valid_length = tgt_valid_length.as_in_ctx(ctx)
+ with mx.autograd.record():
+ tgt_pred = model(src_token_ids, src_valid_length, tgt_token_ids[:, :-1],
+ tgt_valid_length - 1)
+ tgt_labels = tgt_token_ids[:, 1:]
+ loss = label_smooth_loss(tgt_pred, tgt_labels)
+ loss = mx.npx.sequence_mask(loss,
+ sequence_length=tgt_valid_length - 1,
+ use_sequence_length=True,
+ axis=1)
+ loss_l.append(loss.sum() / rescale_loss)
+ for l in loss_l:
+ l.backward()
+ accum_count += 1
+ try:
+ sample_data_l = next(train_multi_data_loader)
+ except StopIteration:
+ is_last_batch = True
+ if local_rank == 0 and num_params is None:
+ num_params, num_fixed_params = count_parameters(model.collect_params())
+ logging.info('Total Number of Parameters (not-fixed/fixed): {}/{}'
+ .format(num_params, num_fixed_params))
+ sum_loss = sum([l.as_in_ctx(mx.cpu()) for l in loss_l]) * rescale_loss
+ log_avg_loss += sum_loss
+ mx.npx.waitall()
+ if accum_count == args.num_accumulated or is_last_batch:
+ # Update the parameters
+ n_train_iters += 1
+ n_epoch_train_iters += 1
+ trainer.step(loss_denom.asnumpy() / rescale_loss)
+ accum_count = 0
loss_denom = 0
- step_loss = 0
- log_wc += src_wc + tgt_wc
- if (batch_id + 1) % (args.log_interval * grad_interval) == 0:
- wps = log_wc / (time.time() - log_start_time)
- logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, '
- 'throughput={:.2f}K wps, wc={:.2f}K'
- .format(epoch_id, batch_id + 1, len(train_data_loader),
- log_avg_loss / args.log_interval,
- np.exp(log_avg_loss / args.log_interval),
- wps / 1000, log_wc / 1000))
- log_start_time = time.time()
- log_avg_loss = 0
- log_wc = 0
- mx.nd.waitall()
- valid_loss, valid_translation_out = evaluate(val_data_loader, ctx[0])
- valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences], valid_translation_out,
- tokenized=tokenized, tokenizer=args.bleu,
- split_compound_word=split_compound_word,
- bpe=bpe)
- logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
- .format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
- test_loss, test_translation_out = evaluate(test_data_loader, ctx[0])
- test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], test_translation_out,
- tokenized=tokenized, tokenizer=args.bleu,
- split_compound_word=split_compound_word,
- bpe=bpe)
- logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
- .format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100))
- dataprocessor.write_sentences(valid_translation_out,
- os.path.join(args.save_dir,
- 'epoch{:d}_valid_out.txt').format(epoch_id))
- dataprocessor.write_sentences(test_translation_out,
- os.path.join(args.save_dir,
- 'epoch{:d}_test_out.txt').format(epoch_id))
- if valid_bleu_score > best_valid_bleu:
- best_valid_bleu = valid_bleu_score
- save_path = os.path.join(args.save_dir, 'valid_best.params')
- logging.info('Save best parameters to {}'.format(save_path))
- model.save_parameters(save_path)
- save_path = os.path.join(args.save_dir, 'epoch{:d}.params'.format(epoch_id))
- model.save_parameters(save_path)
- save_path = os.path.join(args.save_dir, 'average.params')
- mx.nd.save(save_path, average_param_dict)
- if args.average_checkpoint:
- for j in range(args.num_averages):
- params = mx.nd.load(os.path.join(args.save_dir,
- 'epoch{:d}.params'.format(args.epochs - j - 1)))
- alpha = 1. / (j + 1)
- for k, v in model._collect_params_with_prefix().items():
- for c in ctx:
- v.data(c)[:] += alpha * (params[k].as_in_context(c) - v.data(c))
- save_path = os.path.join(args.save_dir,
- 'average_checkpoint_{}.params'.format(args.num_averages))
- model.save_parameters(save_path)
- elif args.average_start > 0:
- for k, v in model.collect_params().items():
- v.set_data(average_param_dict[k])
- save_path = os.path.join(args.save_dir, 'average.params')
- model.save_parameters(save_path)
- else:
- model.load_parameters(os.path.join(args.save_dir, 'valid_best.params'), ctx)
- valid_loss, valid_translation_out = evaluate(val_data_loader, ctx[0])
- valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences], valid_translation_out,
- tokenized=tokenized, tokenizer=args.bleu, bpe=bpe,
- split_compound_word=split_compound_word)
- logging.info('Best model valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
- .format(valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
- test_loss, test_translation_out = evaluate(test_data_loader, ctx[0])
- test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], test_translation_out,
- tokenized=tokenized, tokenizer=args.bleu, bpe=bpe,
- split_compound_word=split_compound_word)
- logging.info('Best model test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
- .format(test_loss, np.exp(test_loss), test_bleu_score * 100))
- dataprocessor.write_sentences(valid_translation_out,
- os.path.join(args.save_dir, 'best_valid_out.txt'))
- dataprocessor.write_sentences(test_translation_out,
- os.path.join(args.save_dir, 'best_test_out.txt'))
+ model.zero_grad()
+ if (args.epochs > 0 and epoch_id >= args.epochs - args.num_averages) or \
+ (args.max_update > 0 and n_train_iters >= args.max_update - args.num_averages * args.save_interval_update):
+ model_averager.step()
+ if local_rank == 0 and \
+ (n_epoch_train_iters % args.log_interval == 0 or is_last_batch):
+ log_end_time = time.time()
+ log_wc = log_wc.asnumpy()
+ wps = log_wc / (log_end_time - log_start_time)
+ log_avg_loss = (log_avg_loss / log_loss_denom).asnumpy()
+ logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, '
+ 'throughput={:.2f}K wps, wc={:.2f}K, LR={}'
+ .format(epoch_id, processed_batch_num * num_parts,
+ len(train_data_loader), log_avg_loss, np.exp(log_avg_loss),
+ wps / 1000, log_wc / 1000, trainer.learning_rate))
+ log_start_time = time.time()
+ log_avg_loss = 0
+ log_loss_denom = 0
+ log_wc = 0
+ if local_rank == 0 and \
+ (args.max_update > 0 and n_train_iters % args.save_interval_update == 0):
+ n_update = n_train_iters // args.save_interval_update
+ model.save_parameters(os.path.join(args.save_dir,
+ 'update{:d}.params'.format(n_update)),
+ deduplicate=True)
+ avg_valid_loss = validation(model, val_data_loader, ctx_l)
+ logging.info('[Update {}] validation loss/ppl={:.4f}/{:.4f}'
+ .format(n_update, avg_valid_loss, np.exp(avg_valid_loss)))
+ if args.max_update > 0 and n_train_iters >= args.max_update:
+ break
+ if local_rank == 0:
+ model.save_parameters(os.path.join(args.save_dir,
+ 'epoch{:d}.params'.format(epoch_id)),
+ deduplicate=True)
+ avg_valid_loss = validation(model, val_data_loader, ctx_l)
+ logging.info('[Epoch {}] validation loss/ppl={:.4f}/{:.4f}'
+ .format(epoch_id, avg_valid_loss, np.exp(avg_valid_loss)))
+
+ if args.max_update > 0 and n_train_iters >= args.max_update:
+ break
+ epoch_id += 1
+
+ if args.num_averages > 0:
+ model_averager.copy_back(model.collect_params()) # TODO(sxjscience) Rewrite using update
+ model.save_parameters(os.path.join(args.save_dir, 'average.params'),
+ deduplicate=True)
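The update logic in train() sets grad_req to 'add' so each backward() accumulates into the gradient buffers, then calls trainer.step(loss_denom / rescale_loss) so the accumulated gradients are normalized by the number of target label tokens seen since the last step. Here is a framework-free sketch of that bookkeeping with a toy scalar gradient per micro-batch (values are made up):

# Toy illustration of the accumulate-then-step pattern used above.
batches = [(12.0, 300), (9.5, 250), (11.0, 280), (8.0, 210)]  # (summed gradient, target tokens)
num_accumulated = 2
rescale_loss = 100.0

grad_buffer = 0.0  # stands in for a parameter gradient with grad_req='add'
loss_denom = 0
for i, (grad, ntokens) in enumerate(batches):
    grad_buffer += grad / rescale_loss  # loss is divided by rescale_loss before backward()
    loss_denom += ntokens
    is_last = (i == len(batches) - 1)
    if (i + 1) % num_accumulated == 0 or is_last:
        # trainer.step(x) divides gradients by x, so the effective per-token gradient is:
        effective_grad = grad_buffer / (loss_denom / rescale_loss)
        print('step: per-token gradient %.6f over %d tokens' % (effective_grad, loss_denom))
        grad_buffer = 0.0  # model.zero_grad()
        loss_denom = 0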
if __name__ == '__main__':
- train()
+ os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
+ args = parse_args()
+ np.random.seed(args.seed)
+ mx.random.seed(args.seed)
+ random.seed(args.seed)
+ train(args)
diff --git a/scripts/machine_translation/translation.py b/scripts/machine_translation/translation.py
deleted file mode 100644
index 34127b6f4c..0000000000
--- a/scripts/machine_translation/translation.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Machine translation models and translators."""
-
-
-__all__ = ['BeamSearchTranslator']
-
-import numpy as np
-import mxnet as mx
-from gluonnlp.model import BeamSearchScorer, BeamSearchSampler
-
-class BeamSearchTranslator:
- """Beam Search Translator
-
- Parameters
- ----------
- model : NMTModel
- The neural machine translation model
- beam_size : int
- Size of the beam
- scorer : BeamSearchScorer
- Score function used in beamsearch
- max_length : int
- The maximum decoding length
- """
- def __init__(self, model, beam_size=1, scorer=BeamSearchScorer(), max_length=100):
- self._model = model
- self._sampler = BeamSearchSampler(
- decoder=self._decode_logprob,
- beam_size=beam_size,
- eos_id=model.tgt_vocab.token_to_idx[model.tgt_vocab.eos_token],
- scorer=scorer,
- max_length=max_length)
-
- def _decode_logprob(self, step_input, states):
- out, states, _ = self._model.decode_step(step_input, states)
- return mx.nd.log_softmax(out), states
-
- def translate(self, src_seq, src_valid_length):
- """Get the translation result given the input sentence.
-
- Parameters
- ----------
- src_seq : mx.nd.NDArray
- Shape (batch_size, length)
- src_valid_length : mx.nd.NDArray
- Shape (batch_size,)
-
- Returns
- -------
- samples : NDArray
- Samples draw by beam search. Shape (batch_size, beam_size, length). dtype is int32.
- scores : NDArray
- Scores of the samples. Shape (batch_size, beam_size). We make sure that scores[i, :] are
- in descending order.
- valid_length : NDArray
- The valid length of the samples. Shape (batch_size, beam_size). dtype will be int32.
- """
- batch_size = src_seq.shape[0]
- encoder_outputs, _ = self._model.encode(src_seq, valid_length=src_valid_length)
- decoder_states = self._model.decoder.init_state_from_encoder(encoder_outputs,
- src_valid_length)
- inputs = mx.nd.full(shape=(batch_size,), ctx=src_seq.context, dtype=np.float32,
- val=self._model.tgt_vocab.token_to_idx[self._model.tgt_vocab.bos_token])
- samples, scores, sample_valid_length = self._sampler(inputs, decoder_states)
- return samples, scores, sample_valid_length
diff --git a/scripts/machine_translation/utils.py b/scripts/machine_translation/utils.py
deleted file mode 100644
index 1494fa43ab..0000000000
--- a/scripts/machine_translation/utils.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility functions."""
-
-import os
-import logging
-import inspect
-
-__all__ = ['logging_config']
-
-
-def logging_config(folder=None, name=None,
- level=logging.DEBUG,
- console_level=logging.INFO,
- no_console=False):
- """ Config the logging.
-
- Parameters
- ----------
- folder : str or None
- name : str or None
- level : int
- console_level
- no_console: bool
- Whether to disable the console log
- Returns
- -------
- folder : str
- Folder that the logging file will be saved into.
- """
- if name is None:
- name = inspect.stack()[1][1].split('.')[0]
- if folder is None:
- folder = os.path.join(os.getcwd(), name)
- if not os.path.exists(folder):
- os.makedirs(folder)
- # Remove all the current handlers
- for handler in logging.root.handlers:
- logging.root.removeHandler(handler)
- logging.root.handlers = []
- logpath = os.path.join(folder, name + '.log')
- print('All Logs will be saved to {}'.format(logpath))
- logging.root.setLevel(level)
- formatter = logging.Formatter('%(asctime)s - %(name)s - %(message)s')
- logfile = logging.FileHandler(logpath)
- logfile.setLevel(level)
- logfile.setFormatter(formatter)
- logging.root.addHandler(logfile)
- if not no_console:
- # Initialze the console logging
- logconsole = logging.StreamHandler()
- logconsole.setLevel(console_level)
- logconsole.setFormatter(formatter)
- logging.root.addHandler(logconsole)
- return folder
diff --git a/scripts/machine_translation/wmt2014_back_translation.sh b/scripts/machine_translation/wmt2014_back_translation.sh
new file mode 100644
index 0000000000..ebe344a773
--- /dev/null
+++ b/scripts/machine_translation/wmt2014_back_translation.sh
@@ -0,0 +1,160 @@
+SUBWORD_ALGO=$1
+SRC=en
+TGT=de
+
+# Prepare the WMT2014 en-de parallel data for the reverse (de->en) model
+cd ../datasets/machine_translation
+bash wmt2014_ende.sh ${SUBWORD_ALGO}
+
+# Fetch the raw mono text
+nlp_data prepare_wmt \
+ --mono \
+ --mono_lang ${TGT} \
+ --dataset newscrawl \
+ --save-path wmt2014_mono
+
+
+# Clean and tokenize the monolingual corpus
+cd wmt2014_mono
+nlp_preprocess clean_tok_mono_corpus \
+ --lang ${TGT} \
+ --corpus train.raw.${TGT} \
+ --min-num-words 1 \
+ --max-num-words 100 \
+ --save-path train.tok.${TGT}
+
+cd ../../../machine_translation
+datapath=../datasets/machine_translation
+
+# Train the reverse model to translate German to English
+python3 train_transformer.py \
+ --train_src_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${TGT} \
+ --train_tgt_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${SRC} \
+ --dev_src_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \
+ --dev_tgt_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --save_dir transformer_wmt2014_de_en_${SUBWORD_ALGO} \
+ --cfg transformer_base \
+ --lr 0.002 \
+ --warmup_steps 4000 \
+ --warmup_init_lr 0.0 \
+ --seed 100 \
+ --gpus 0,1,2,3
+
+# Due to limited memory, we split the monolingual data and process each split separately
+split -l 400000 ${datapath}/wmt2014_mono/train.tok.${TGT} ${datapath}/wmt2014_mono/train.tok.${TGT}.split -d -a 3
+
+# Translate the monolingual data with the reverse model to generate the synthetic source corpus
+# Note that some batches may be too large to fit in GPU memory
+GPUS=(0 1 2 3)
+IDX=0
+for NUM in ` seq -f %03g 0 193 `; do
+ split_corpus=${datapath}/wmt2014_mono/train.tok.${TGT}.split${NUM}
+ if [ ${IDX} -eq ${#GPUS[@]} ]; then
+ let "IDX=0"
+ wait
+ fi
+ {
+ echo processing ${split_corpus}
+ python3 evaluate_transformer.py \
+ --param_path transformer_wmt2014_de_en_${SUBWORD_ALGO}/average.params \
+ --src_lang ${TGT} \
+ --tgt_lang ${SRC} \
+ --cfg transformer_base \
+ --src_tokenizer ${SUBWORD_ALGO} \
+ --tgt_tokenizer ${SUBWORD_ALGO} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --src_corpus ${split_corpus} \
+ --save_dir ${split_corpus/.${TGT}./.${SRC}.} \
+ --beam-size 1 \
+ --inference \
+ --gpus ${GPUS[IDX]}
+ } &
+ let "IDX++"
+done
+wait
+
+cat ` seq -f "${datapath}/wmt2014_mono/train.tok.${SRC}.split%03g/pred_sentences.txt" 0 193 ` \
+ > ${datapath}/wmt2014_mono/syn.train.raw.${SRC}
+cp ${datapath}/wmt2014_mono/train.tok.${TGT} ${datapath}/wmt2014_mono/syn.train.raw.${TGT}
+
+# Clean the synthetic data
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus ${datapath}/wmt2014_mono/syn.train.raw.${SRC} \
+ --tgt-corpus ${datapath}/wmt2014_mono/syn.train.raw.${TGT} \
+ --min-num-words 1 \
+ --max-num-words 250 \
+ --max-ratio 1.5 \
+ --src-save-path ${datapath}/wmt2014_mono/syn.train.tok.${SRC} \
+ --tgt-save-path ${datapath}/wmt2014_mono/syn.train.tok.${TGT}
+
+# Combine the synthetic data with upsampled original data
+# TODO upsample
+rm -rf ${datapath}/wmt2014_backtranslation
+mkdir ${datapath}/wmt2014_backtranslation
+for LANG in ${SRC} ${TGT} ; do
+ cat ${datapath}/wmt2014_ende/train.tok.${LANG} ${datapath}/wmt2014_mono/syn.train.tok.${LANG} \
+ > ${datapath}/wmt2014_backtranslation/bt.train.tok.${LANG}
+done
+
+# Apply the subword model to the combined corpus
+for LANG in ${SRC} ${TGT} ; do
+ nlp_preprocess apply_subword --model ${SUBWORD_ALGO} \
+ --output-type subword \
+ --model-path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --vocab-path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --corpus ${datapath}/wmt2014_backtranslation/bt.train.tok.${LANG} \
+ --save-path ${datapath}/wmt2014_backtranslation/bt.train.tok.${SUBWORD_ALGO}.${LANG}
+done
+
+# Use the combined data to train the new model
+python3 train_transformer.py \
+ --train_src_corpus ${datapath}/wmt2014_backtranslation/bt.train.tok.${SUBWORD_ALGO}.${SRC} \
+ --train_tgt_corpus ${datapath}/wmt2014_backtranslation/bt.train.tok.${SUBWORD_ALGO}.${TGT} \
+ --dev_src_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \
+ --dev_tgt_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --save_dir backtranslation_transformer_wmt2014_ende_${SUBWORD_ALGO} \
+ --cfg transformer_base \
+ --lr 0.003 \
+ --max_num_tokens 4096 \
+ --sampler BoundedBudgetSampler \
+ --comm_backend horovod \
+ --max_update 30000 \
+ --save_interval_update 1000 \
+ --warmup_steps 6000 \
+ --warmup_init_lr 0.0 \
+ --num_averages -1 \
+ --seed 123 \
+ --gpus 0,1,2,3
+
+# TODO nlp_average_checkpoint
+nlp_nmt average_checkpoint --prefix range() \
+ --suffix \
+ --save-path backtranslation_transformer_wmt2014_ende_${SUBWORD_ALGO}/average.params
+
+# Finally, we can evaluate the model
+python3 evaluate_transformer.py \
+ --param_path backtranslation_transformer_wmt2014_ende_${SUBWORD_ALGO}/avg_20_29.params \
+ --src_lang ${SRC} \
+ --tgt_lang ${TGT} \
+ --cfg transformer_base \
+ --src_tokenizer ${SUBWORD_ALGO} \
+ --tgt_tokenizer ${SUBWORD_ALGO} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --src_corpus ${datapath}/wmt2014_ende/test.raw.${SRC} \
+ --tgt_corpus ${datapath}/wmt2014_ende/test.raw.${TGT} \
+ --gpus 0
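The back-translation loop above fans the 194 monolingual splits out over the available GPUs round-robin, waiting for each wave of jobs to finish before launching the next. The same scheduling is expressed below as a short Python sketch; the echo command is only a placeholder for the evaluate_transformer.py invocation in the script.

import subprocess

gpus = [0, 1, 2, 3]
splits = ['train.tok.de.split%03d' % i for i in range(194)]

in_flight = []
for idx, split in enumerate(splits):
    gpu = gpus[idx % len(gpus)]
    # Placeholder command; the real job is the evaluate_transformer.py call above.
    in_flight.append(subprocess.Popen(['echo', 'translating %s on gpu %d' % (split, gpu)]))
    if len(in_flight) == len(gpus):  # one job per GPU, then wait for the wave to drain
        for proc in in_flight:
            proc.wait()
        in_flight = []
for proc in in_flight:
    proc.wait()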
diff --git a/scripts/natural_language_inference/dataset.py b/scripts/natural_language_inference/dataset.py
deleted file mode 100644
index 31496a691e..0000000000
--- a/scripts/natural_language_inference/dataset.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# Copyright 2018 Mengxiao Lin .
-# pylint: disable=logging-format-interpolation
-
-"""
-Data loading and batching.
-"""
-
-import os
-import logging
-from mxnet import gluon
-import gluonnlp as nlp
-import gluonnlp.data.batchify as btf
-
-logger = logging.getLogger('nli')
-LABEL_TO_IDX = {'neutral': 0, 'contradiction': 1, 'entailment': 2}
-
-def read_dataset(args, dataset):
- """
- Read dataset from tokenized files.
- """
- path = os.path.join(vars(args)[dataset])
- logger.info('reading data from {}'.format(path))
- examples = [line.strip().split('\t') for line in open(path)]
- if args.max_num_examples > 0:
- examples = examples[:args.max_num_examples]
- # NOTE: assume data has been tokenized
- dataset = gluon.data.SimpleDataset([(e[0], e[1], LABEL_TO_IDX[e[2]]) for e in examples])
- dataset = dataset.transform(lambda s1, s2, label: (
- ['NULL'] + s1.lower().split(),
- ['NULL'] + s2.lower().split(), label),
- lazy=False)
- logger.info('read {} examples'.format(len(dataset)))
- return dataset
-
-def build_vocab(dataset):
- """
- Build vocab given a dataset.
- """
- counter = nlp.data.count_tokens([w for e in dataset for s in e[:2] for w in s],
- to_lower=True)
- vocab = nlp.Vocab(counter)
- return vocab
-
-def prepare_data_loader(args, dataset, vocab, test=False):
- """
- Read data and build data loader.
- """
- # Preprocess
- dataset = dataset.transform(lambda s1, s2, label: (vocab(s1), vocab(s2), label),
- lazy=False)
-
- # Batching
- batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0), btf.Stack(dtype='int32'))
- data_lengths = [max(len(d[0]), len(d[1])) for d in dataset]
- batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
- batch_size=args.batch_size,
- shuffle=(not test))
- data_loader = gluon.data.DataLoader(dataset=dataset,
- batch_sampler=batch_sampler,
- batchify_fn=batchify_fn)
- return data_loader
diff --git a/scripts/natural_language_inference/decomposable_attention.py b/scripts/natural_language_inference/decomposable_attention.py
deleted file mode 100644
index f991461e03..0000000000
--- a/scripts/natural_language_inference/decomposable_attention.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# Copyright 2018 Mengxiao Lin .
-# pylint: disable=arguments-differ
-
-"""
-Implementation of the decomposable attention model with intra sentence attention.
-"""
-
-from mxnet import gluon
-from mxnet.gluon import nn
-
-
-class DecomposableAttentionModel(gluon.HybridBlock):
- """
- A Decomposable Attention Model for Natural Language Inference
- using intra-sentence attention.
- Arxiv paper: https://arxiv.org/pdf/1606.01933.pdf
- """
- def __init__(self, vocab_size, word_embed_size, hidden_size,
- dropout=0., intra_attention=False, **kwargs):
- super(DecomposableAttentionModel, self).__init__(**kwargs)
- self.word_embed_size = word_embed_size
- self.hidden_size = hidden_size
- self.use_intra_attention = intra_attention
- with self.name_scope():
- self.dropout_layer = nn.Dropout(dropout)
- self.word_emb = nn.Embedding(vocab_size, word_embed_size)
- self.lin_proj = nn.Dense(hidden_size, in_units=word_embed_size,
- flatten=False, use_bias=False)
- if self.use_intra_attention:
- self.intra_attention = IntraSentenceAttention(hidden_size, hidden_size, dropout)
- input_size = hidden_size * 2
- else:
- self.intra_attention = None
- input_size = hidden_size
- self.model = DecomposableAttention(input_size, hidden_size, 3, dropout)
-
- def hybrid_forward(self, F, sentence1, sentence2):
- """
- Predict the relation of two sentences.
-
- Parameters
- ----------
- sentence1 : NDArray
- Shape (batch_size, length)
- sentence2 : NDArray
- Shape (batch_size, length)
-
- Returns
- -------
- pred : NDArray
- Shape (batch_size, num_classes). num_classes == 3.
-
- """
- feature1 = self.lin_proj(self.word_emb(sentence1))
- feature2 = self.lin_proj(self.word_emb(sentence2))
- if self.use_intra_attention:
- feature1 = F.concat(feature1, self.intra_attention(feature1), dim=-1)
- feature2 = F.concat(feature2, self.intra_attention(feature2), dim=-1)
- pred = self.model(feature1, feature2)
- return pred
-
-class IntraSentenceAttention(gluon.HybridBlock):
- """
- Intra Sentence Attention block.
- """
- def __init__(self, inp_size, hidden_size, dropout=0., **kwargs):
- super(IntraSentenceAttention, self).__init__(**kwargs)
- self.hidden_size = hidden_size
- with self.name_scope():
- self.dropout_layer = nn.Dropout(dropout)
- # F_intra in the paper
- self.intra_attn_emb = nn.HybridSequential()
- self.intra_attn_emb.add(self.dropout_layer)
- self.intra_attn_emb.add(nn.Dense(hidden_size, in_units=inp_size,
- activation='relu', flatten=False))
- self.intra_attn_emb.add(self.dropout_layer)
- self.intra_attn_emb.add(nn.Dense(hidden_size, in_units=hidden_size,
- activation='relu', flatten=False))
-
- def hybrid_forward(self, F, feature_a):
- """
- Compute intra-sentence attention given embedded words.
-
- Parameters
- ----------
- feature_a : NDArray
- Shape (batch_size, length, hidden_size)
-
- Returns
- -------
- alpha : NDArray
- Shape (batch_size, length, hidden_size)
- """
- tilde_a = self.intra_attn_emb(feature_a)
- e_matrix = F.batch_dot(tilde_a, tilde_a, transpose_b=True)
- alpha = F.batch_dot(e_matrix.softmax(), tilde_a)
- return alpha
-
-class DecomposableAttention(gluon.HybridBlock):
- """
- Decomposable Attention block.
- """
- def __init__(self, inp_size, hidden_size, num_class, dropout=0., **kwargs):
- super(DecomposableAttention, self).__init__(**kwargs)
- with self.name_scope():
- self.dropout_layer = nn.Dropout(dropout)
- # attention function
- self.f = self._ff_layer(in_units=inp_size, out_units=hidden_size, flatten=False)
- # compare function
- self.g = self._ff_layer(in_units=hidden_size * 2, out_units=hidden_size, flatten=False)
- # predictor
- self.h = self._ff_layer(in_units=hidden_size * 2, out_units=hidden_size, flatten=True)
- self.h.add(nn.Dense(num_class, in_units=hidden_size))
- # extract features
- self.hidden_size = hidden_size
- self.inp_size = inp_size
-
- def _ff_layer(self, in_units, out_units, flatten=True):
- m = nn.HybridSequential()
- m.add(self.dropout_layer)
- m.add(nn.Dense(out_units, in_units=in_units, activation='relu', flatten=flatten))
- m.add(self.dropout_layer)
- m.add(nn.Dense(out_units, in_units=out_units, activation='relu', flatten=flatten))
- return m
-
- def hybrid_forward(self, F, a, b):
- """
- Forward of Decomposable Attention layer
- """
- # a.shape = [B, L1, H]
- # b.shape = [B, L2, H]
- # extract features
- tilde_a = self.f(a) # shape = [B, L1, H]
- tilde_b = self.f(b) # shape = [B, L2, H]
- # attention
- # e.shape = [B, L1, L2]
- e = F.batch_dot(tilde_a, tilde_b, transpose_b=True)
- # beta: b align to a, [B, L1, H]
- beta = F.batch_dot(e.softmax(), tilde_b)
- # alpha: a align to b, [B, L2, H]
- alpha = F.batch_dot(e.transpose([0, 2, 1]).softmax(), tilde_a)
- # compare
- feature1 = self.g(F.concat(tilde_a, beta, dim=2))
- feature2 = self.g(F.concat(tilde_b, alpha, dim=2))
- feature1 = feature1.sum(axis=1)
- feature2 = feature2.sum(axis=1)
- yhat = self.h(F.concat(feature1, feature2, dim=1))
- return yhat
diff --git a/scripts/natural_language_inference/esim.py b/scripts/natural_language_inference/esim.py
deleted file mode 100644
index e6d17b8698..0000000000
--- a/scripts/natural_language_inference/esim.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Build an Enhancing LSTM model for Natural Language Inference
-"""
-
-__all__ = ['ESIMModel']
-
-from mxnet.gluon import nn, rnn
-
-EPS = 1e-12
-
-
-class ESIMModel(nn.HybridBlock):
- """"Enhanced LSTM for Natural Language Inference" Qian Chen,
- Xiaodan Zhu, Zhenhua Ling, Si Wei, Hui Jiang, Diana Inkpen. ACL (2017)
-
- Parameters
- ----------
- vocab_size: int
- Number of words in vocab
- word_embed_size : int
- Dimension of word vector
- hidden_size : int
- Number of hidden units in lstm cell
- dense_size : int
- Number of hidden units in dense layer
- num_classes : int
- Number of categories
- dropout : int
- Dropout prob
- """
-
- def __init__(self, vocab_size, num_classes, word_embed_size, hidden_size, dense_size,
- dropout=0., **kwargs):
- super(ESIMModel, self).__init__(**kwargs)
- with self.name_scope():
- self.word_emb= nn.Embedding(vocab_size, word_embed_size)
- self.embedding_dropout = nn.Dropout(dropout, axes=1)
- self.lstm_encoder1 = rnn.LSTM(hidden_size, input_size=word_embed_size, bidirectional=True, layout='NTC')
- self.ff_proj = nn.Dense(hidden_size, in_units=hidden_size * 2 * 4, flatten=False, activation='relu')
- self.lstm_encoder2 = rnn.LSTM(hidden_size, input_size=hidden_size, bidirectional=True, layout='NTC')
-
- self.classifier = nn.HybridSequential()
- if dropout:
- self.classifier.add(nn.Dropout(rate=dropout))
- self.classifier.add(nn.Dense(units=hidden_size, activation='relu'))
- if dropout:
- self.classifier.add(nn.Dropout(rate=dropout))
- self.classifier.add(nn.Dense(units=num_classes))
-
- def _soft_attention_align(self, F, x1, x2):
- # attention shape: (batch, x1_seq_len, x2_seq_len)
- attention = F.batch_dot(x1, x2, transpose_b=True)
-
- x1_align = F.batch_dot(attention.softmax(), x2)
- x2_align = F.batch_dot(attention.transpose([0, 2, 1]).softmax(), x1)
-
- return x1_align, x2_align
-
- def _submul(self, F, x1, x2):
- mul = x1 * x2
- sub = x1 - x2
-
- return F.concat(mul, sub, dim=-1)
-
- def _pool(self, F, x):
- p1 = x.mean(axis=1)
- p2 = x.max(axis=1)
-
- return F.concat(p1, p2, dim=-1)
-
- def hybrid_forward(self, F, x1, x2):
- # x1_embed x2_embed shape: (batch, seq_len, word_embed_size)
- x1_embed = self.embedding_dropout(self.word_emb(x1))
- x2_embed = self.embedding_dropout(self.word_emb(x2))
-
- x1_lstm_encode = self.lstm_encoder1(x1_embed)
- x2_lstm_encode = self.lstm_encoder1(x2_embed)
-
- # attention
- x1_algin, x2_algin = self._soft_attention_align(F, x1_lstm_encode, x2_lstm_encode)
-
- # compose
- x1_combined = F.concat(x1_lstm_encode, x1_algin,
- self._submul(F, x1_lstm_encode, x1_algin), dim=-1)
- x2_combined = F.concat(x2_lstm_encode, x2_algin,
- self._submul(F, x2_lstm_encode, x2_algin), dim=-1)
-
- x1_compose = self.lstm_encoder2(self.ff_proj(x1_combined))
- x2_compose = self.lstm_encoder2(self.ff_proj(x2_combined))
-
- # aggregate
- x1_agg = self._pool(F, x1_compose)
- x2_agg = self._pool(F, x2_compose)
-
- # fully connection
- output = self.classifier(F.concat(x1_agg, x2_agg, dim=-1))
-
- return output
diff --git a/scripts/natural_language_inference/index.rst b/scripts/natural_language_inference/index.rst
deleted file mode 100644
index 8abf55fd45..0000000000
--- a/scripts/natural_language_inference/index.rst
+++ /dev/null
@@ -1,53 +0,0 @@
-Natural Language Inference
---------------------------
-
-:download:`Download scripts `
-
-Replication of the model described in `A Decomposable Attention Model for Natural Language Inference `_.
-
-Download the SNLI dataset:
-
-.. code-block:: console
-
- $ mkdir data
- $ curl https://nlp.stanford.edu/projects/snli/snli_1.0.zip -o data/snli_1.0.zip
- $ unzip data/snli_1.0.zip -d data
-
-Preprocess the data:
-
-.. code-block:: console
-
- $ for split in train dev test; do python preprocess.py --input data/snli_1.0/snli_1.0_$split.txt --output data/snli_1.0/$split.txt; done
-
-Train the model without intra-sentence attention:
-
-.. code-block:: console
-
- $ python main.py --train-file data/snli_1.0/train.txt --test-file data/snli_1.0/dev.txt --output-dir output/snli-basic --batch-size 32 --print-interval 5000 --lr 0.025 --epochs 300 --gpu-id 0 --dropout 0.2 --weight-decay 1e-5 --fix-embedding
-
-Test:
-
-.. code-block:: console
-
- $ python main.py --test-file data/snli_1.0/test.txt --model-dir output/snli-basic --gpu-id 0 --mode test --output-dir output/snli-basic/test
-
-We achieve 85.0% accuracy on the SNLI test set, comparable to 86.3% reported in the
-original paper. `[Training log] `__
-
-Train the model with intra-sentence attention:
-
-.. code-block:: console
-
- $ python main.py --train-file data/snli_1.0/train.txt --test-file data/snli_1.0/dev.txt --output-dir output/snli-intra --batch-size 32 --print-interval 5000 --lr 0.025 --epochs 300 --gpu-id 0 --dropout 0.2 --weight-decay 1e-5 --intra-attention --fix-embedding
-
-Test:
-
-.. code-block:: console
-
- $ python main.py --test-file data/snli_1.0/test.txt --model-dir output/snli-intra --gpu-id 0 --mode test --output-dir output/snli-intra/test
-
-We achieve 85.5% accuracy on the SNLI test set, compared to 86.8% reported in the
-original paper. `[Training log] `__
-Note that our intra-sentence attention implementation omitted the
-distance-sensitive bias term described in Equation (7) in the original paper.
-
diff --git a/scripts/natural_language_inference/main.py b/scripts/natural_language_inference/main.py
deleted file mode 100644
index 5dc79f2b5c..0000000000
--- a/scripts/natural_language_inference/main.py
+++ /dev/null
@@ -1,254 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# Copyright 2018 Mengxiao Lin .
-# pylint: disable=redefined-outer-name,logging-format-interpolation
-
-"""
-Decomposable Attention Models for Natural Language Inference
-============================================================
-
-This script reproduces results in [Parikh et al., 2016] with the Gluon NLP Toolkit.
-
-@article{parikh2016decomposable,
- title={A decomposable attention model for natural language inference},
- author={Parikh, Ankur P and T{\"a}ckstr{\"o}m, Oscar and Das, Dipanjan and Uszkoreit, Jakob},
- journal={arXiv preprint arXiv:1606.01933},
- year={2016}
-}
-"""
-
-import os
-import argparse
-import json
-import logging
-import numpy as np
-
-import mxnet as mx
-from mxnet import gluon, autograd
-import gluonnlp as nlp
-
-from decomposable_attention import DecomposableAttentionModel
-from esim import ESIMModel
-from dataset import read_dataset, prepare_data_loader, build_vocab
-from utils import logging_config
-
-logger = logging.getLogger('nli')
-
-nlp.utils.check_version('0.7.0')
-
-def parse_args():
- """
- Parse arguments.
- """
- parser = argparse.ArgumentParser()
- parser.add_argument('--gpu-id', type=int, default=0,
- help='GPU id (-1 means CPU)')
- parser.add_argument('--train-file', default='snli_1.0/snli_1.0_train.txt',
- help='training set file')
- parser.add_argument('--test-file', default='snli_1.0/snli_1.0_dev.txt',
- help='validation set file')
- parser.add_argument('--max-num-examples', type=int, default=-1,
- help='maximum number of examples to load (for debugging)')
- parser.add_argument('--batch-size', type=int, default=32,
- help='batch size')
- parser.add_argument('--print-interval', type=int, default=20,
- help='number of batches between two consecutive log prints')
- parser.add_argument('--model', choices=['da', 'esim'], default=None, required=True,
- help='which model to use')
- parser.add_argument('--mode', choices=['train', 'test'], default='train',
- help='train or test')
- parser.add_argument('--lr', type=float, default=0.025,
- help='learning rate')
- parser.add_argument('--epochs', type=int, default=300,
- help='maximum number of epochs to train')
- parser.add_argument('--embedding', default='glove',
- help='word embedding type')
- parser.add_argument('--fix-embedding', action='store_true',
- help='whether to fix pretrained word embedding')
- parser.add_argument('--embedding-source', default='glove.840B.300d',
- help='embedding file source')
- parser.add_argument('--embedding-size', type=int, default=300,
- help='size of pretrained word embedding')
- parser.add_argument('--hidden-size', type=int, default=200,
- help='hidden layer size')
- parser.add_argument('--output-dir', default='./output',
- help='directory for all experiment output')
- parser.add_argument('--model-dir', default='./output',
- help='directory to load model')
- parser.add_argument('--seed', type=int, default=0,
- help='random seed')
- parser.add_argument('--dropout', type=float, default=0.,
- help='dropout rate')
- parser.add_argument('--optimizer', choices=['adam', 'adagrad'], default='adagrad',
- help='optimization method')
- parser.add_argument('--weight-decay', type=float, default=0.,
- help='l2 regularization weight')
- parser.add_argument('--intra-attention', action='store_true',
- help='use intra-sentence attention')
-
- return parser.parse_args()
-
-def train_model(model, train_data_loader, val_data_loader, embedding, ctx, args):
- """
- Train model and validate/save every epoch.
- """
- logger.info(vars(args))
-
- # Initialization
- model.hybridize()
- model.collect_params().initialize(mx.init.Normal(0.01), ctx=ctx)
- model.word_emb.weight.set_data(embedding.idx_to_vec)
- # Fix word embedding
- if args.fix_embedding:
- model.word_emb.weight.grad_req = 'null'
-
- loss_func = gluon.loss.SoftmaxCrossEntropyLoss()
- trainer = gluon.Trainer(model.collect_params(), args.optimizer,
- {'learning_rate': args.lr,
- 'wd': args.weight_decay,
- 'clip_gradient': 5})
-
- checkpoints_dir = os.path.join(args.output_dir, 'checkpoints')
- if not os.path.exists(checkpoints_dir):
- os.makedirs(checkpoints_dir)
-
- best_val_acc = 0.
- for epoch_id in range(args.epochs):
- avg_loss = 0.
- avg_acc = 0.
- for batch_id, example in enumerate(train_data_loader):
- s1, s2, label = example
- s1 = s1.as_in_context(ctx)
- s2 = s2.as_in_context(ctx)
- label = label.as_in_context(ctx)
-
- with autograd.record():
- output = model(s1, s2)
- loss = loss_func(output, label).mean()
- loss.backward()
- trainer.step(1)
- avg_loss += loss.sum().asscalar()
-
- pred = output.argmax(axis=1)
- acc = (pred == label.astype(np.float32)).mean()
- avg_acc += acc.asscalar()
-
- if (batch_id + 1) % args.print_interval == 0:
- avg_loss /= args.print_interval
- avg_acc /= args.print_interval
- logger.info('[Epoch {} Batch {}/{}] loss={:.4f}, acc={:.4f}'
- .format(epoch_id, batch_id + 1, len(train_data_loader),
- avg_loss, avg_acc))
- avg_loss = 0.
- avg_acc = 0.
-
- # Validation
- val_loss, val_acc = test_model(model, val_data_loader, loss_func, ctx)
- if val_acc > best_val_acc:
- best_val_acc = val_acc
- checkpoint_path = os.path.join(args.output_dir, 'checkpoints', 'valid_best.params')
- model.save_parameters(checkpoint_path)
- logger.info('[Epoch {}] valid loss={:.4f}, valid acc={:.4f}, best valid acc={:.4f}'
- .format(epoch_id, val_loss, val_acc, best_val_acc))
-
- # Save checkpoint of last epoch
- checkpoint_path = os.path.join(args.output_dir, 'checkpoints', 'last.params')
- model.save_parameters(checkpoint_path)
-
-def test_model(model, data_loader, loss_func, ctx):
- """
- Test model.
- """
- acc = 0.
- loss = 0.
- for _, example in enumerate(data_loader):
- s1, s2, label = example
- s1 = s1.as_in_context(ctx)
- s2 = s2.as_in_context(ctx)
- label = label.as_in_context(ctx)
- output = model(s1, s2)
- loss += loss_func(output, label).mean().asscalar()
- pred = output.argmax(axis=1)
- acc += (pred == label.astype(np.float32)).mean().asscalar()
- acc /= len(data_loader)
- loss /= len(data_loader)
- return loss, acc
-
-def build_model(args, vocab):
- if args.model == 'da':
- model = DecomposableAttentionModel(len(vocab), args.embedding_size, args.hidden_size,
- args.dropout, args.intra_attention)
- elif args.model == 'esim':
- model = ESIMModel(len(vocab), 3, args.embedding_size, args.hidden_size,
- args.dropout)
- return model
-
-def main(args):
- """
- Entry point: train or test.
- """
- json.dump(vars(args), open(os.path.join(args.output_dir, 'config.json'), 'w'))
-
- if args.gpu_id == -1:
- ctx = mx.cpu()
- else:
- ctx = mx.gpu(args.gpu_id)
-
- mx.random.seed(args.seed, ctx=ctx)
-
- if args.mode == 'train':
- train_dataset = read_dataset(args, 'train_file')
- val_dataset = read_dataset(args, 'test_file')
-
- vocab_path = os.path.join(args.output_dir, 'vocab.jsons')
- if os.path.exists(vocab_path):
- vocab = nlp.Vocab.from_json(open(vocab_path).read())
- else:
- vocab = build_vocab(train_dataset)
- with open(vocab_path, 'w') as fout:
- fout.write(vocab.to_json())
- glove = nlp.embedding.create(args.embedding, source=args.embedding_source)
- vocab.set_embedding(glove)
-
- train_data_loader = prepare_data_loader(args, train_dataset, vocab)
- val_data_loader = prepare_data_loader(args, val_dataset, vocab, test=True)
-
- model = build_model(args, vocab)
- train_model(model, train_data_loader, val_data_loader, vocab.embedding, ctx, args)
- elif args.mode == 'test':
- model_args = argparse.Namespace(**json.load(
- open(os.path.join(args.model_dir, 'config.json'))))
- vocab = nlp.Vocab.from_json(
- open(os.path.join(args.model_dir, 'vocab.jsons')).read())
- val_dataset = read_dataset(args, 'test_file')
- val_data_loader = prepare_data_loader(args, val_dataset, vocab, test=True)
- model = build_model(model_args, vocab)
- model.load_parameters(os.path.join(
- args.model_dir, 'checkpoints', 'valid_best.params'), ctx=ctx)
- loss_func = gluon.loss.SoftmaxCrossEntropyLoss()
- logger.info('Test on {}'.format(args.test_file))
- loss, acc = test_model(model, val_data_loader, loss_func, ctx)
- logger.info('loss={:.4f} acc={:.4f}'.format(loss, acc))
-
-if __name__ == '__main__':
- args = parse_args()
- if not os.path.exists(args.output_dir):
- os.makedirs(args.output_dir)
-
- logging_config(os.path.join(args.output_dir, 'main.log'))
-
- main(args)
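One detail of the removed main.py worth noting: train mode persists the vocabulary as JSON next to the checkpoints, and test mode reads it back so the token-to-index mapping is identical across runs. A minimal sketch of that round trip (toy corpus, not the script's data path):

```python
import gluonnlp as nlp

counter = nlp.data.count_tokens('a man is walking a dog in the park'.split())
vocab = nlp.Vocab(counter)

json_str = vocab.to_json()                 # what train mode writes out
restored = nlp.Vocab.from_json(json_str)   # what test mode reads back
assert restored['dog'] == vocab['dog']
```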
diff --git a/scripts/natural_language_inference/preprocess.py b/scripts/natural_language_inference/preprocess.py
deleted file mode 100644
index d1031bee11..0000000000
--- a/scripts/natural_language_inference/preprocess.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=redefined-outer-name
-
-"""
-Tokenize the SNLI dataset.
-"""
-
-import argparse
-import csv
-import nltk
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument('--input',
- help='.txt file for the SNLI dataset')
- parser.add_argument('--output',
- help='path for tokenized output file')
- args = parser.parse_args()
- return args
-
-def read_tokens(tree_str):
- t = nltk.Tree.fromstring(tree_str)
- return t.leaves()
-
-def main(args):
- """
- Read tokens from the provided parse tree in the SNLI dataset.
- Examples without a valid gold label are removed.
- """
- examples = []
- with open(args.input, 'r') as fin:
- reader = csv.DictReader(fin, delimiter='\t')
- for cols in reader:
- s1 = read_tokens(cols['sentence1_parse'])
- s2 = read_tokens(cols['sentence2_parse'])
- label = cols['gold_label']
- if label in ('neutral', 'contradiction', 'entailment'):
- examples.append((s1, s2, label))
- with open(args.output, 'w') as fout:
- for s1, s2, l in examples:
- fout.write('{}\t{}\t{}\n'.format(' '.join(s1), ' '.join(s2), l))
-
-
-if __name__ == '__main__':
- args = parse_args()
- main(args)
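The removed preprocessing relies on the constituency parses shipped with SNLI rather than re-tokenizing raw text: `read_tokens` simply takes the leaves of the parse tree. A tiny illustration with a made-up parse string:

```python
import nltk

# a labelled constituency parse in the style of the sentence1_parse column
tree_str = '(ROOT (S (NP (DT A) (NN dog)) (VP (VBZ runs))))'
tokens = nltk.Tree.fromstring(tree_str).leaves()
print(tokens)  # ['A', 'dog', 'runs']
```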
diff --git a/scripts/natural_language_inference/utils.py b/scripts/natural_language_inference/utils.py
deleted file mode 100644
index 9e1b848491..0000000000
--- a/scripts/natural_language_inference/utils.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# Copyright 2018 Mengxiao Lin .
-
-"""
-Utility functions.
-"""
-
-import logging
-
-def logging_config(logpath=None,
- level=logging.DEBUG,
- console_level=logging.INFO,
- no_console=False):
- """
- Configure logging.
- """
- logger = logging.getLogger('nli')
- # Remove all the current handlers
- for handler in logger.handlers:
- logger.removeHandler(handler)
- logger.handlers = []
- logger.propagate = False
- logger.setLevel(logging.DEBUG)
-
- formatter = logging.Formatter('%(filename)s:%(funcName)s: %(message)s')
-
- if logpath is not None:
- print('All Logs will be saved to {}'.format(logpath))
- logfile = logging.FileHandler(logpath, mode='w')
- logfile.setLevel(level)
- logfile.setFormatter(formatter)
- logger.addHandler(logfile)
-
- if not no_console:
- # Initialize console logging
- logconsole = logging.StreamHandler()
- logconsole.setLevel(console_level)
- logconsole.setFormatter(formatter)
- logger.addHandler(logconsole)
diff --git a/scripts/ner/data.py b/scripts/ner/data.py
deleted file mode 100644
index f160f607da..0000000000
--- a/scripts/ner/data.py
+++ /dev/null
@@ -1,355 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Data utilities for the named entity recognition task."""
-
-import logging
-from collections import namedtuple
-
-import numpy as np
-import mxnet as mx
-import gluonnlp as nlp
-
-TaggedToken = namedtuple('TaggedToken', ['text', 'tag'])
-PredictedToken = namedtuple('PredictedToken', ['text', 'true_tag', 'pred_tag'])
-
-NULL_TAG = 'X'
-
-def bio_bioes(tokens):
- """Convert a list of TaggedTokens in BIO(2) scheme to BIOES scheme.
-
- Parameters
- ----------
- tokens: List[TaggedToken]
- A list of tokens in BIO(2) scheme
-
- Returns
- -------
- List[TaggedToken]:
- A list of tokens in BIOES scheme
- """
- ret = []
- for index, token in enumerate(tokens):
- if token.tag == 'O':
- ret.append(token)
- elif token.tag.startswith('B'):
- # if a B-tag is continued by other tokens with the same entity,
- # then it is still a B-tag
- if index + 1 < len(tokens) and tokens[index + 1].tag.startswith('I'):
- ret.append(token)
- else:
- ret.append(TaggedToken(text=token.text, tag='S' + token.tag[1:]))
- elif token.tag.startswith('I'):
- # if an I-tag is continued by other tokens with the same entity,
- # then it is still an I-tag
- if index + 1 < len(tokens) and tokens[index + 1].tag.startswith('I'):
- ret.append(token)
- else:
- ret.append(TaggedToken(text=token.text, tag='E' + token.tag[1:]))
- return ret
-
-
-def read_bio_as_bio2(data_path):
- """Read CoNLL-formatted text file in BIO scheme in given path as sentences in BIO2 scheme.
-
- Parameters
- ----------
- data_path: str
- Path of the data file to read
-
- Returns
- -------
- List[List[TaggedToken]]:
- List of sentences, each of which is a List of TaggedTokens
- """
-
- with open(data_path, 'r') as ifp:
- sentence_list = []
- current_sentence = []
- prev_tag = 'O'
-
- for line in ifp:
- if len(line.strip()) > 0:
- word, _, _, tag = line.rstrip().split(' ')
- # convert BIO tag to BIO2 tag
- if tag == 'O':
- bio2_tag = 'O'
- else:
- if prev_tag == 'O' or tag[2:] != prev_tag[2:]:
- bio2_tag = 'B' + tag[1:]
- else:
- bio2_tag = tag
- current_sentence.append(TaggedToken(text=word, tag=bio2_tag))
- prev_tag = tag
- else:
- # the sentence was completed if an empty line occurred; flush the current sentence.
- sentence_list.append(current_sentence)
- current_sentence = []
- prev_tag = 'O'
-
- # check if there is a remaining token. in most CoNLL data files, this does not happen.
- if len(current_sentence) > 0:
- sentence_list.append(current_sentence)
- return sentence_list
-
-
-def remove_docstart_sentence(sentences):
- """Remove -DOCSTART- sentences in the list of sentences.
-
- Parameters
- ----------
- sentences: List[List[TaggedToken]]
- List of sentences, each of which is a List of TaggedTokens.
- This list may contain DOCSTART sentences.
-
- Returns
- -------
- List of sentences, each of which is a List of TaggedTokens.
- This list does not contain DOCSTART sentences.
- """
- ret = []
- for sentence in sentences:
- current_sentence = []
- for token in sentence:
- if token.text != '-DOCSTART-':
- current_sentence.append(token)
- if len(current_sentence) > 0:
- ret.append(current_sentence)
- return ret
-
-
-def bert_tokenize_sentence(sentence, bert_tokenizer):
- """Apply BERT tokenizer on a tagged sentence to break words into sub-words.
- This function assumes the input tags follow IOBES and outputs IOBES tags.
-
- Parameters
- ----------
- sentence: List[TaggedToken]
- List of tagged words
- bert_tokenizer: nlp.data.BertTokenizer
- BERT tokenizer
-
- Returns
- -------
- List[TaggedToken]: list of annotated sub-word tokens
- """
- ret = []
- for token in sentence:
- # break a word into sub-word tokens
- sub_token_texts = bert_tokenizer(token.text)
- # only the first token of a word is going to be tagged
- ret.append(TaggedToken(text=sub_token_texts[0], tag=token.tag))
- ret += [TaggedToken(text=sub_token_text, tag=NULL_TAG)
- for sub_token_text in sub_token_texts[1:]]
-
- return ret
-
-
-def load_segment(file_path, bert_tokenizer):
- """Load CoNLL format NER datafile with BIO-scheme tags.
-
- Tagging scheme is converted into BIOES, and words are tokenized into wordpieces
- using `bert_tokenizer`.
-
- Parameters
- ----------
- file_path: str
- Path of the file
- bert_tokenizer: nlp.data.BERTTokenizer
-
- Returns
- -------
- List[List[TaggedToken]]: List of sentences, each of which is the list of `TaggedToken`s.
- """
- logging.info('Loading sentences in %s...', file_path)
- bio2_sentences = remove_docstart_sentence(read_bio_as_bio2(file_path))
- bioes_sentences = [bio_bioes(sentence) for sentence in bio2_sentences]
- subword_sentences = [bert_tokenize_sentence(sentence, bert_tokenizer)
- for sentence in bioes_sentences]
-
- logging.info('loaded %s, max sequence length: %d',
- file_path, max(len(sentence) for sentence in subword_sentences))
-
- return subword_sentences
-
-
-class BERTTaggingDataset:
- """
-
- Parameters
- ----------
- text_vocab: gluon.nlp.Vocab
- Vocabulary of text tokens.
- train_path: Optional[str]
- Path of the file to locate training data.
- dev_path: Optional[str]
- Path of the file to locate development data.
- test_path: Optional[str]
- Path of the file to locate test data.
- seq_len: int
- Length of the input sequence to BERT.
- is_cased: bool
- Whether to use cased model.
- """
-
- def __init__(self, text_vocab, train_path, dev_path, test_path, seq_len, is_cased,
- tag_vocab=None):
- self.text_vocab = text_vocab
- self.seq_len = seq_len
-
- self.bert_tokenizer = nlp.data.BERTTokenizer(vocab=text_vocab, lower=not is_cased)
-
- train_sentences = [] if train_path is None else load_segment(train_path,
- self.bert_tokenizer)
- dev_sentences = [] if dev_path is None else load_segment(dev_path, self.bert_tokenizer)
- test_sentences = [] if test_path is None else load_segment(test_path, self.bert_tokenizer)
- all_sentences = train_sentences + dev_sentences + test_sentences
-
- if tag_vocab is None:
- logging.info('Indexing tags...')
- tag_counter = nlp.data.count_tokens(token.tag
- for sentence in all_sentences for token in sentence)
- self.tag_vocab = nlp.Vocab(tag_counter, padding_token=NULL_TAG,
- bos_token=None, eos_token=None, unknown_token=None)
- else:
- self.tag_vocab = tag_vocab
- self.null_tag_index = self.tag_vocab[NULL_TAG]
-
- if len(test_sentences) > 0:
- logging.info('example test sentences:')
- for i in range(2):
- logging.info(str(test_sentences[i]))
-
- self.train_inputs = [self._encode_as_input(sentence) for sentence in train_sentences]
- self.dev_inputs = [self._encode_as_input(sentence) for sentence in dev_sentences]
- self.test_inputs = [self._encode_as_input(sentence) for sentence in test_sentences]
-
- logging.info('tag_vocab: %s', self.tag_vocab)
-
- def _encode_as_input(self, sentence):
- """Enocde a single sentence into numpy arrays as input to the BERTTagger model.
-
- Parameters
- ----------
- sentence: List[TaggedToken]
- A sentence as a list of tagged tokens.
-
- Returns
- -------
- np.array: token text ids (batch_size, seq_len)
- np.array: token types (batch_size, seq_len),
- which is all zero because we have only one sentence for tagging.
- np.array: valid_length (batch_size,) the number of tokens until [SEP] token
- np.array: tag_ids (batch_size, seq_len)
- np.array: flag_nonnull_tag (batch_size, seq_len),
- which is simply tag_ids != self.null_tag_index
-
- """
- # check whether the given sequence can be fit into `seq_len`.
- assert len(sentence) <= self.seq_len - 2, \
- 'the number of tokens {} should not be larger than {} - 2. offending sentence: {}' \
- .format(len(sentence), self.seq_len, sentence)
-
- text_tokens = ([self.text_vocab.cls_token] + [token.text for token in sentence] +
- [self.text_vocab.sep_token])
- padded_text_ids = (self.text_vocab.to_indices(text_tokens)
- + ([self.text_vocab[self.text_vocab.padding_token]]
- * (self.seq_len - len(text_tokens))))
-
- tags = [NULL_TAG] + [token.tag for token in sentence] + [NULL_TAG]
- padded_tag_ids = (self.tag_vocab.to_indices(tags)
- + [self.tag_vocab[NULL_TAG]] * (self.seq_len - len(tags)))
-
- assert len(text_tokens) == len(tags)
- assert len(padded_text_ids) == len(padded_tag_ids)
- assert len(padded_text_ids) == self.seq_len
-
- valid_length = len(text_tokens)
-
- # in sequence tagging problems, only one sentence is given
- token_types = [0] * self.seq_len
-
- np_tag_ids = np.array(padded_tag_ids, dtype='int32')
- # gluon batchify cannot batchify numpy.bool? :(
- flag_nonnull_tag = (np_tag_ids != self.null_tag_index).astype('int32')
-
- return (np.array(padded_text_ids, dtype='int32'),
- np.array(token_types, dtype='int32'),
- np.array(valid_length, dtype='int32'),
- np_tag_ids,
- flag_nonnull_tag)
-
- @staticmethod
- def _get_data_loader(inputs, shuffle, batch_size):
- return mx.gluon.data.DataLoader(inputs, batch_size=batch_size, shuffle=shuffle,
- last_batch='keep')
-
- def get_train_data_loader(self, batch_size):
- return self._get_data_loader(self.train_inputs, shuffle=True, batch_size=batch_size)
-
- def get_dev_data_loader(self, batch_size):
- return self._get_data_loader(self.dev_inputs, shuffle=False, batch_size=batch_size)
-
- def get_test_data_loader(self, batch_size):
- return self._get_data_loader(self.test_inputs, shuffle=False, batch_size=batch_size)
-
- @property
- def num_tag_types(self):
- """Returns the number of unique tags.
-
- Returns
- -------
- int: number of tag types.
- """
- return len(self.tag_vocab)
-
-
-def convert_arrays_to_text(text_vocab, tag_vocab,
- np_text_ids, np_true_tags, np_pred_tags, np_valid_length):
- """Convert numpy array data into text
-
- Parameters
- ----------
- np_text_ids: token text ids (batch_size, seq_len)
- np_true_tags: tag_ids (batch_size, seq_len)
- np_pred_tags: tag_ids (batch_size, seq_len)
- np.array: valid_length (batch_size,) the number of tokens until [SEP] token
-
- Returns
- -------
- List[List[PredictedToken]]:
-
- """
- predictions = []
- for sample_index in range(np_valid_length.shape[0]):
- sample_len = np_valid_length[sample_index]
- entries = []
- for i in range(1, sample_len - 1):
- token_text = text_vocab.idx_to_token[np_text_ids[sample_index, i]]
- true_tag = tag_vocab.idx_to_token[int(np_true_tags[sample_index, i])]
- pred_tag = tag_vocab.idx_to_token[int(np_pred_tags[sample_index, i])]
- # we don't need to predict on NULL tags
- if true_tag == NULL_TAG:
- last_entry = entries[-1]
- entries[-1] = PredictedToken(text=last_entry.text + token_text,
- true_tag=last_entry.true_tag,
- pred_tag=last_entry.pred_tag)
- else:
- entries.append(PredictedToken(text=token_text,
- true_tag=true_tag, pred_tag=pred_tag))
-
- predictions.append(entries)
- return predictions
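The key preprocessing step in the removed data.py is the BIO2-to-BIOES conversion done by `bio_bioes`: single-token entities become `S-*` and the final token of a multi-token entity becomes `E-*`. A small, self-contained rendering of the same rule (the example sentence is invented here):

```python
from collections import namedtuple

TaggedToken = namedtuple('TaggedToken', ['text', 'tag'])

def bio2_to_bioes(tokens):
    out = []
    for i, tok in enumerate(tokens):
        next_is_inside = i + 1 < len(tokens) and tokens[i + 1].tag.startswith('I')
        if tok.tag == 'O':
            out.append(tok)
        elif tok.tag.startswith('B'):
            out.append(tok if next_is_inside else TaggedToken(tok.text, 'S' + tok.tag[1:]))
        else:  # I-*
            out.append(tok if next_is_inside else TaggedToken(tok.text, 'E' + tok.tag[1:]))
    return out

sent = [TaggedToken('EU', 'B-ORG'), TaggedToken('rejects', 'O'),
        TaggedToken('Peter', 'B-PER'), TaggedToken('Blackburn', 'I-PER')]
print([t.tag for t in bio2_to_bioes(sent)])  # ['S-ORG', 'O', 'B-PER', 'E-PER']
```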
diff --git a/scripts/ner/dataset_sample/test_sample.txt b/scripts/ner/dataset_sample/test_sample.txt
deleted file mode 100644
index 3db1cb9558..0000000000
--- a/scripts/ner/dataset_sample/test_sample.txt
+++ /dev/null
@@ -1,17 +0,0 @@
--DOCSTART- -X- -X- O
-
-SOCCER NN I-NP O
-- : O O
-JAPAN NNP I-NP I-LOC
-GET VB I-VP O
-LUCKY NNP I-NP O
-WIN NNP I-NP O
-, , O O
-CHINA NNP I-NP I-PER
-IN IN I-PP O
-SURPRISE DT I-NP O
-DEFEAT NN I-NP O
-. . O O
-
-Nadim NNP I-NP I-PER
-Ladki NNP I-NP I-PER
diff --git a/scripts/ner/dataset_sample/train_sample.txt b/scripts/ner/dataset_sample/train_sample.txt
deleted file mode 100644
index d4c0f9f7dd..0000000000
--- a/scripts/ner/dataset_sample/train_sample.txt
+++ /dev/null
@@ -1,14 +0,0 @@
--DOCSTART- -X- O O
-
-EU NNP I-NP I-ORG
-rejects VBZ I-VP O
-German JJ I-NP I-MISC
-call NN I-NP O
-to TO I-VP O
-boycott VB I-VP O
-British JJ I-NP I-MISC
-lamb NN I-NP O
-. . O O
-
-Peter NNP I-NP I-PER
-Blackburn NNP I-NP I-PER
diff --git a/scripts/ner/dataset_sample/validation_sample.txt b/scripts/ner/dataset_sample/validation_sample.txt
deleted file mode 100644
index d3219e9079..0000000000
--- a/scripts/ner/dataset_sample/validation_sample.txt
+++ /dev/null
@@ -1,16 +0,0 @@
--DOCSTART- -X- O O
-
-CRICKET NNP I-NP O
-- : O O
-LEICESTERSHIRE NNP I-NP I-ORG
-TAKE NNP I-NP O
-OVER IN I-PP O
-AT NNP I-NP O
-TOP NNP I-NP O
-AFTER NNP I-NP O
-INNINGS NNP I-NP O
-VICTORY NN I-NP O
-. . O O
-
-LONDON NNP I-NP I-LOC
-1996-08-30 CD I-NP O
diff --git a/scripts/ner/finetune_bert.py b/scripts/ner/finetune_bert.py
deleted file mode 100644
index a0943fd05c..0000000000
--- a/scripts/ner/finetune_bert.py
+++ /dev/null
@@ -1,222 +0,0 @@
-#!/usr/bin/env python
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Provides command-line interace for training BERT-based named entity recognition model."""
-
-import argparse
-import logging
-import random
-
-import numpy as np
-import mxnet as mx
-
-import gluonnlp as nlp
-
-from ner_utils import get_context, get_bert_model, dump_metadata, str2bool
-from data import BERTTaggingDataset, convert_arrays_to_text
-from model import BERTTagger, attach_prediction
-
-# seqeval is a dependency that is specific to named entity recognition.
-import seqeval.metrics
-
-nlp.utils.check_version('0.7.0')
-
-def parse_args():
- """Parse command line arguments."""
- arg_parser = argparse.ArgumentParser(
- description='Train a BERT-based named entity recognition model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
- # data file paths
- arg_parser.add_argument('--train-path', type=str, required=True,
- help='Path to the training data file')
- arg_parser.add_argument('--dev-path', type=str, required=True,
- help='Path to the development data file')
- arg_parser.add_argument('--test-path', type=str, required=True,
- help='Path to the test data file')
-
- arg_parser.add_argument('--save-checkpoint-prefix', type=str, required=False, default=None,
- help='Prefix of model checkpoint file')
-
- # bert options
- arg_parser.add_argument('--bert-model', type=str, default='bert_12_768_12',
- help='Name of the BERT model')
- arg_parser.add_argument('--cased', type=str2bool, default=True,
- help='Whether to use the cased BERT model')
- arg_parser.add_argument('--dropout-prob', type=float, default=0.1,
- help='Dropout probability for the last layer')
-
- # optimization parameters
- arg_parser.add_argument('--seed', type=int, default=13531,
- help='Random number seed.')
- arg_parser.add_argument('--seq-len', type=int, default=180,
- help='The length of the sequence input to BERT.'
- ' An exception will be raised if this is not large enough.')
- arg_parser.add_argument('--gpu', type=int,
- help='Number (index) of GPU to run on, e.g. 0. '
- 'If not specified, uses CPU.')
- arg_parser.add_argument('--batch-size', type=int, default=32, help='Batch size for training')
- arg_parser.add_argument('--num-epochs', type=int, default=4, help='Number of epochs to train')
- arg_parser.add_argument('--optimizer', type=str, default='bertadam',
- help='Optimization algorithm to use')
- arg_parser.add_argument('--learning-rate', type=float, default=5e-5,
- help='Learning rate for optimization')
- arg_parser.add_argument('--warmup-ratio', type=float, default=0.1,
- help='Warmup ratio for learning rate scheduling')
- args = arg_parser.parse_args()
- return args
-
-
-def main(config):
- """Main method for training BERT-based NER model."""
- # provide random seed for every RNGs we use
- np.random.seed(config.seed)
- random.seed(config.seed)
- mx.random.seed(config.seed)
-
- ctx = get_context(config.gpu)
-
- logging.info('Loading BERT model...')
- bert_model, text_vocab = get_bert_model(config.bert_model, config.cased, ctx,
- config.dropout_prob)
-
- dataset = BERTTaggingDataset(text_vocab, config.train_path, config.dev_path, config.test_path,
- config.seq_len, config.cased)
-
- train_data_loader = dataset.get_train_data_loader(config.batch_size)
- dev_data_loader = dataset.get_dev_data_loader(config.batch_size)
- test_data_loader = dataset.get_test_data_loader(config.batch_size)
-
- net = BERTTagger(bert_model, dataset.num_tag_types, config.dropout_prob)
- net.tag_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
- net.hybridize(static_alloc=True)
-
- loss_function = mx.gluon.loss.SoftmaxCrossEntropyLoss()
- loss_function.hybridize(static_alloc=True)
-
- # step size adaptation, adopted from: https://github.com/dmlc/gluon-nlp/blob/
- # 87d36e3cc7c615f93732d01048cf7ce3b3b09eb7/scripts/bert/finetune_classifier.py#L348-L351
- step_size = config.batch_size
- num_train_steps = int(len(dataset.train_inputs) / step_size * config.num_epochs)
- num_warmup_steps = int(num_train_steps * config.warmup_ratio)
-
- optimizer_params = {'learning_rate': config.learning_rate}
- trainer = mx.gluon.Trainer(net.collect_params(), config.optimizer, optimizer_params)
-
- # collect differentiable parameters
- logging.info('Collect params...')
- # do not apply weight decay on LayerNorm and bias terms
- for _, v in net.collect_params('.*beta|.*gamma|.*bias').items():
- v.wd_mult = 0.0
- params = [p for p in net.collect_params().values() if p.grad_req != 'null']
-
- if config.save_checkpoint_prefix is not None:
- logging.info('dumping metadata...')
- dump_metadata(config, tag_vocab=dataset.tag_vocab)
-
- def train(data_loader, start_step_num):
- """Training loop."""
- step_num = start_step_num
- logging.info('current starting step num: %d', step_num)
- for batch_id, (_, _, _, tag_ids, flag_nonnull_tag, out) in \
- enumerate(attach_prediction(data_loader, net, ctx, is_train=True)):
- logging.info('training on batch index: %d/%d', batch_id, len(data_loader))
-
- # step size adjustments
- step_num += 1
- if step_num < num_warmup_steps:
- new_lr = config.learning_rate * step_num / num_warmup_steps
- else:
- offset = ((step_num - num_warmup_steps) * config.learning_rate /
- (num_train_steps - num_warmup_steps))
- new_lr = config.learning_rate - offset
- trainer.set_learning_rate(new_lr)
-
- with mx.autograd.record():
- loss_value = loss_function(out, tag_ids,
- flag_nonnull_tag.expand_dims(axis=2)).mean()
-
- loss_value.backward()
- nlp.utils.clip_grad_global_norm(params, 1)
- trainer.step(1)
-
- pred_tags = out.argmax(axis=-1)
- logging.info('loss_value: %6f', loss_value.asscalar())
-
- num_tag_preds = flag_nonnull_tag.sum().asscalar()
- logging.info(
- 'accuracy: %6f', (((pred_tags == tag_ids) * flag_nonnull_tag).sum().asscalar()
- / num_tag_preds))
- return step_num
-
- def evaluate(data_loader):
- """Eval loop."""
- predictions = []
-
- for batch_id, (text_ids, _, valid_length, tag_ids, _, out) in \
- enumerate(attach_prediction(data_loader, net, ctx, is_train=False)):
- logging.info('evaluating on batch index: %d/%d', batch_id, len(data_loader))
-
- # convert results to numpy arrays for easier access
- np_text_ids = text_ids.astype('int32').asnumpy()
- np_pred_tags = out.argmax(axis=-1).asnumpy()
- np_valid_length = valid_length.astype('int32').asnumpy()
- np_true_tags = tag_ids.asnumpy()
-
- predictions += convert_arrays_to_text(text_vocab, dataset.tag_vocab, np_text_ids,
- np_true_tags, np_pred_tags, np_valid_length)
-
- all_true_tags = [[entry.true_tag for entry in entries] for entries in predictions]
- all_pred_tags = [[entry.pred_tag for entry in entries] for entries in predictions]
- seqeval_f1 = seqeval.metrics.f1_score(all_true_tags, all_pred_tags)
- return seqeval_f1
-
- best_dev_f1 = 0.0
- last_test_f1 = 0.0
- best_epoch = -1
-
- last_epoch_step_num = 0
- for epoch_index in range(config.num_epochs):
- last_epoch_step_num = train(train_data_loader, last_epoch_step_num)
- train_f1 = evaluate(train_data_loader)
- logging.info('train f1: %3f', train_f1)
- dev_f1 = evaluate(dev_data_loader)
- logging.info('dev f1: %3f, previous best dev f1: %3f', dev_f1, best_dev_f1)
- if dev_f1 > best_dev_f1:
- best_dev_f1 = dev_f1
- best_epoch = epoch_index
- logging.info('update the best dev f1 to be: %3f', best_dev_f1)
- test_f1 = evaluate(test_data_loader)
- logging.info('test f1: %3f', test_f1)
- last_test_f1 = test_f1
-
- # save params
- params_file = config.save_checkpoint_prefix + '_{:03d}.params'.format(epoch_index)
- logging.info('saving current checkpoint to: %s', params_file)
- net.save_parameters(params_file)
-
- logging.info('current best epoch: %d', best_epoch)
-
- logging.info('best epoch: %d, best dev f1: %3f, test f1 at that epoch: %3f',
- best_epoch, best_dev_f1, last_test_f1)
-
-
-if __name__ == '__main__':
- logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
- level=logging.DEBUG, datefmt='%Y-%m-%d %I:%M:%S')
- logging.getLogger().setLevel(logging.INFO)
- main(parse_args())
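The removed training loop adjusts the learning rate by hand via `trainer.set_learning_rate`: linear warmup over the first `warmup_ratio` fraction of steps, then linear decay towards zero. The rule in isolation (numbers below are arbitrary):

```python
def scheduled_lr(step_num, base_lr, num_train_steps, num_warmup_steps):
    if step_num < num_warmup_steps:
        return base_lr * step_num / num_warmup_steps   # linear warmup
    offset = ((step_num - num_warmup_steps) * base_lr
              / (num_train_steps - num_warmup_steps))
    return base_lr - offset                            # linear decay

for step in (1, 50, 100, 400, 1000):
    print(step, scheduled_lr(step, base_lr=5e-5, num_train_steps=1000, num_warmup_steps=100))
```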
diff --git a/scripts/ner/index.rst b/scripts/ner/index.rst
deleted file mode 100644
index 3bffe81eeb..0000000000
--- a/scripts/ner/index.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-Named Entity Recognition
-------------------------
-
-:download:`Download scripts `
-
-Reference: Devlin, Jacob, et al. "`Bert: Pre-training of deep bidirectional transformers for language understanding. `_" arXiv preprint arXiv:1810.04805 (2018).
-
-Named Entity Recognition with BERT
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-GluonNLP provides training and prediction scripts for named entity recognition models.
-
-The training script for NER requires the seqeval package:
-
-.. code-block:: console
-
- $ pip install seqeval --user
-
-The dataset should be formatted in the `CoNLL-2003 shared task format `_.
-Assuming the data files are located in `${DATA_DIR}`, the command below trains a BERT model for
-named entity recognition and saves model artifacts to `${MODEL_DIR}` with the `large_bert`
-prefix in file names (assuming `${MODEL_DIR}` exists):
-
-.. code-block:: console
-
- $ python finetune_bert.py \
- --train-path ${DATA_DIR}/train.txt \
- --dev-path ${DATA_DIR}/dev.txt \
- --test-path ${DATA_DIR}/test.txt \
- --gpu 0 --learning-rate 1e-5 --dropout-prob 0.1 --num-epochs 100 --batch-size 8 \
- --optimizer bertadam --bert-model bert_24_1024_16 \
- --save-checkpoint-prefix ${MODEL_DIR}/large_bert --seed 13531
-
-This achieves a test F1 score between `91.5` and `92.2` (`log `_).
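The F1 numbers above are entity-level scores computed with seqeval, which expects one tag sequence per sentence. A toy example of the metric call used by the scripts (tags below are made up):

```python
import seqeval.metrics

y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
y_pred = [['B-PER', 'I-PER', 'O'], ['O', 'O']]
print(seqeval.metrics.f1_score(y_true, y_pred))  # ~0.67: one of the two gold entities is recovered
```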
diff --git a/scripts/ner/model.py b/scripts/ner/model.py
deleted file mode 100644
index 18a2076600..0000000000
--- a/scripts/ner/model.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Gluon model block for the named entity recognition task."""
-
-from contextlib import ExitStack
-
-import mxnet as mx
-from mxnet.gluon import Block, nn
-
-
-class BERTTagger(Block):
- """Model for sequence tagging with BERT
-
- Parameters
- ----------
- bert_model: BERTModel
- Bidirectional encoder with transformer.
- num_tag_types: int
- number of possible tags
- dropout_prob: float
- dropout probability for the last layer
- prefix: str or None
- See document of `mx.gluon.Block`.
- params: ParameterDict or None
- See document of `mx.gluon.Block`.
- """
-
- def __init__(self, bert_model, num_tag_types, dropout_prob, prefix=None, params=None):
- super(BERTTagger, self).__init__(prefix=prefix, params=params)
- self.bert_model = bert_model
- with self.name_scope():
- self.tag_classifier = nn.Dense(units=num_tag_types, flatten=False)
- self.dropout = nn.Dropout(rate=dropout_prob)
-
- def forward(self, token_ids, token_types, valid_length): # pylint: disable=arguments-differ
- """Generate an unnormalized score for the tag of each token
-
- Parameters
- ----------
- token_ids: NDArray, shape (batch_size, seq_length)
- ID of tokens in sentences
- See `input` of `gluonnlp.model.BERTModel`
- token_types: NDArray, shape (batch_size, seq_length)
- See `gluonnlp.model.BERTModel`
- valid_length: NDArray, shape (batch_size,)
- See `gluonnlp.model.BERTModel`
-
- Returns
- -------
- NDArray, shape (batch_size, seq_length, num_tag_types):
- Unnormalized prediction scores for each tag on each position.
- """
- bert_output = self.dropout(self.bert_model(token_ids, token_types, valid_length))
- output = self.tag_classifier(bert_output)
- return output
-
-
-def attach_prediction(data_loader, net, ctx, is_train):
- """Attach the prediction from a model to a data loader as the last field.
-
- Parameters
- ----------
- data_loader: mx.gluon.data.DataLoader
- Input data from `bert_model.BERTTaggingDataset._encode_as_input`.
- net: mx.gluon.Block
- gluon `Block` for making the prediction.
- ctx:
- The context data should be loaded to.
- is_train:
- Whether the forward pass should be made with `mx.autograd.record()`.
-
- Returns
- -------
- All fields from `bert_model.BERTTaggingDataset._encode_as_input`,
- as well as the prediction of the model.
-
- """
- for data in data_loader:
- text_ids, token_types, valid_length, tag_ids, flag_nonnull_tag = \
- [x.astype('float32').as_in_context(ctx) for x in data]
-
- with ExitStack() as stack:
- if is_train:
- stack.enter_context(mx.autograd.record())
- out = net(text_ids, token_types, valid_length)
- yield text_ids, token_types, valid_length, tag_ids, flag_nonnull_tag, out
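`attach_prediction` above uses `contextlib.ExitStack` so the same loop serves training and evaluation: `mx.autograd.record()` is entered only when `is_train` is set. A stripped-down, framework-free illustration of that pattern (the `recording` context manager is a stand-in):

```python
from contextlib import ExitStack, contextmanager

@contextmanager
def recording():  # stand-in for mx.autograd.record()
    print('recording on')
    yield
    print('recording off')

def forward(is_train):
    with ExitStack() as stack:
        if is_train:
            stack.enter_context(recording())
        return 'output'

forward(is_train=True)   # enters and exits the context
forward(is_train=False)  # runs without it
```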
diff --git a/scripts/ner/ner_utils.py b/scripts/ner/ner_utils.py
deleted file mode 100644
index 332b548aa0..0000000000
--- a/scripts/ner/ner_utils.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Common utilities for the named entity recognition task."""
-
-import argparse
-import pickle
-from collections import namedtuple
-
-import mxnet as mx
-import gluonnlp as nlp
-
-__all__ = ['get_bert_model', 'get_bert_dataset_name', 'get_context',
- 'dump_metadata']
-
-BERTModelMetadata = namedtuple('BERTModelMetadata', ['config', 'tag_vocab'])
-
-def _metadata_file_path(checkpoint_prefix):
- """Gets the file path for meta data"""
- return checkpoint_prefix + '_metadata.pkl'
-
-
-def dump_metadata(config, tag_vocab):
- """Dumps meta-data to the configured path"""
- metadata = BERTModelMetadata(config=config, tag_vocab=tag_vocab)
- with open(_metadata_file_path(config.save_checkpoint_prefix), 'wb') as ofp:
- pickle.dump(metadata, ofp)
-
-
-def load_metadata(checkpoint_prefix):
- """Loads meta-data to the configured path"""
- with open(_metadata_file_path(checkpoint_prefix), 'rb') as ifp:
- metadata = pickle.load(ifp)
- return metadata.config, metadata.tag_vocab
-
-
-def get_context(gpu_index):
- """This method gets context of execution"""
- context = None
- if gpu_index is None or gpu_index == '':
- context = mx.cpu()
- if isinstance(gpu_index, int):
- context = mx.gpu(gpu_index)
- return context
-
-
-def str2bool(v):
- """Utility function for parsing boolean in argparse
-
- https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
-
- :param v: value of the argument
- :return:
- """
- if v.lower() in ('yes', 'true', 't', 'y', '1'):
- return True
- elif v.lower() in ('no', 'false', 'f', 'n', '0'):
- return False
- else:
- raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-def get_bert_dataset_name(is_cased):
- """Returns relevant BERT dataset name, depending on whether we are using a cased model.
-
- Parameters
- ----------
- is_cased: bool
- Whether we are using a cased model.
-
- Returns
- -------
- str: Name of the BERT dataset.
-
- """
- if is_cased:
- return 'book_corpus_wiki_en_cased'
- else:
- return 'book_corpus_wiki_en_uncased'
-
-
-def get_bert_model(bert_model, cased, ctx, dropout_prob):
- """Get pre-trained BERT model."""
- bert_dataset_name = get_bert_dataset_name(cased)
-
- return nlp.model.get_model(
- name=bert_model,
- dataset_name=bert_dataset_name,
- pretrained=True,
- ctx=ctx,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False,
- dropout=dropout_prob,
- embed_dropout=dropout_prob)
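The `str2bool` helper removed above exists because argparse treats any non-empty string as truthy; routing the value through a parser lets flags such as `--cased false` behave as expected. A quick usage sketch (the parser below is hypothetical):

```python
import argparse

def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

parser = argparse.ArgumentParser()
parser.add_argument('--cased', type=str2bool, default=True)
print(parser.parse_args(['--cased', 'false']).cased)  # False
```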
diff --git a/scripts/ner/predict_ner.py b/scripts/ner/predict_ner.py
deleted file mode 100644
index abdc3ec535..0000000000
--- a/scripts/ner/predict_ner.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Script for NER prediction."""
-
-import argparse
-import logging
-import os
-
-import mxnet as mx
-from ner_utils import get_bert_model, get_context
-from ner_utils import load_metadata
-from data import BERTTaggingDataset, convert_arrays_to_text
-from model import BERTTagger
-
-# TODO(bikestra): Currently, our evaluation is dependent on this package.
-# Figure out whether to take actual dependency on it.
-try:
- import seqeval.metrics
-except ImportError:
- raise ImportError('seqeval is required to run NER on BERT. Please '
- 'install it via pip3 install seqeval --user')
-
-
-def _find_model_file_from_checkpoint(checkpoint_prefix: str):
- """Load model checkpoint"""
- dirname, file_prefix = os.path.split(checkpoint_prefix)
- # find checkpoint file names and sort by name to find the most recent one.
- checkpoint_filenames = ([f for f in os.listdir(dirname)
- if f.startswith(file_prefix)
- and f.endswith(os.path.extsep + 'params')])
- last_checkpoint_filename = max(checkpoint_filenames)
- logging.info('found checkpoint filename: {:s}'.format(last_checkpoint_filename))
- last_checkpoint_path = os.path.join(dirname, last_checkpoint_filename)
- return last_checkpoint_path
-
-
-def parse_args():
- """Parse command line arguments."""
- arg_parser = argparse.ArgumentParser(
- description='Predict on CoNLL format data using BERT-based named entity recognition model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
- # data file paths
- arg_parser.add_argument('--test-path', type=str, required=True,
- help='Path to the test data file')
- arg_parser.add_argument('--seq-len', type=int, default=200,
- help='The length of the sequence input to BERT.'
- ' An exception will be raised if this is not large enough.')
- arg_parser.add_argument('--load-checkpoint-prefix', type=str, required=False, default=None,
- help='Prefix of model checkpoint file')
-
- arg_parser.add_argument('--gpu', type=int,
- help='Number (index) of GPU to run on, e.g. 0. '
- 'If not specified, CPU context is used.')
- arg_parser.add_argument('--batch-size', type=int, default=32, help='Batch size for training')
- args = arg_parser.parse_args()
- return args
-
-
-def main(config):
- """Main method for predicting BERT-based NER model on CoNLL-formatted test data."""
- train_config, tag_vocab = load_metadata(config.load_checkpoint_prefix)
-
- ctx = get_context(config.gpu)
- bert_model, text_vocab = get_bert_model(train_config.bert_model, train_config.cased, ctx,
- train_config.dropout_prob)
-
- dataset = BERTTaggingDataset(text_vocab, None, None, config.test_path,
- config.seq_len, train_config.cased, tag_vocab=tag_vocab)
-
- test_data_loader = dataset.get_test_data_loader(config.batch_size)
-
- net = BERTTagger(bert_model, dataset.num_tag_types, train_config.dropout_prob)
- model_filename = _find_model_file_from_checkpoint(config.load_checkpoint_prefix)
- net.load_parameters(model_filename, ctx=ctx)
-
- net.hybridize(static_alloc=True)
-
- loss_function = mx.gluon.loss.SoftmaxCrossEntropyLoss()
- loss_function.hybridize(static_alloc=True)
-
- # TODO(bikestra): make it not redundant between train and predict
- def evaluate(data_loader):
- """Eval function"""
- predictions = []
-
- for batch_id, data in enumerate(data_loader):
- logging.info('evaluating on batch index: %d/%d', batch_id, len(data_loader))
- text_ids, token_types, valid_length, tag_ids, _ = \
- [x.astype('float32').as_in_context(ctx) for x in data]
- out = net(text_ids, token_types, valid_length)
-
- # convert results to numpy arrays for easier access
- np_text_ids = text_ids.astype('int32').asnumpy()
- np_pred_tags = out.argmax(axis=-1).asnumpy()
- np_valid_length = valid_length.astype('int32').asnumpy()
- np_true_tags = tag_ids.asnumpy()
-
- predictions += convert_arrays_to_text(text_vocab, dataset.tag_vocab, np_text_ids,
- np_true_tags, np_pred_tags, np_valid_length)
-
- all_true_tags = [[entry.true_tag for entry in entries] for entries in predictions]
- all_pred_tags = [[entry.pred_tag for entry in entries] for entries in predictions]
- seqeval_f1 = seqeval.metrics.f1_score(all_true_tags, all_pred_tags)
- return seqeval_f1
-
- test_f1 = evaluate(test_data_loader)
- logging.info('test f1: {:.3f}'.format(test_f1))
-
-
-if __name__ == '__main__':
- logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
- level=logging.DEBUG, datefmt='%Y-%m-%d %I:%M:%S')
- logging.getLogger().setLevel(logging.INFO)
- main(parse_args())
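`_find_model_file_from_checkpoint` above picks the most recent checkpoint by taking the lexicographic maximum of the matching file names, which works because the training script zero-pads the epoch index (e.g. `_003.params`). In miniature (file names invented):

```python
names = ['large_bert_001.params', 'large_bert_010.params', 'large_bert_002.params']
prefix = 'large_bert'
candidates = [n for n in names if n.startswith(prefix) and n.endswith('.params')]
print(max(candidates))  # large_bert_010.params
```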
diff --git a/scripts/parsing/__init__.py b/scripts/parsing/__init__.py
deleted file mode 100644
index 13a83393a9..0000000000
--- a/scripts/parsing/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
diff --git a/scripts/parsing/common/__init__.py b/scripts/parsing/common/__init__.py
deleted file mode 100644
index 13a83393a9..0000000000
--- a/scripts/parsing/common/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
diff --git a/scripts/parsing/common/config.py b/scripts/parsing/common/config.py
deleted file mode 100644
index 9a9a1dc63a..0000000000
--- a/scripts/parsing/common/config.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Training config."""
-
-import os
-import pickle
-
-from scripts.parsing.common.savable import Savable
-
-
-class _Config(Savable):
- def __init__(self, train_file, dev_file, test_file, save_dir,
- pretrained_embeddings_file=None, min_occur_count=2,
- lstm_layers=3, word_dims=100, tag_dims=100, dropout_emb=0.33, lstm_hiddens=400,
- dropout_lstm_input=0.33,
- dropout_lstm_hidden=0.33, mlp_arc_size=500, mlp_rel_size=100,
- dropout_mlp=0.33, learning_rate=2e-3, decay=.75, decay_steps=5000,
- beta_1=.9, beta_2=.9, epsilon=1e-12,
- num_buckets_train=40,
- num_buckets_valid=10, num_buckets_test=10,
- train_iters=50000, train_batch_size=5000, debug=False):
- """Internal structure for hyper parameters, intended for pickle serialization.
-
- May be replaced by a dict, but this class provides intuitive properties
- and a saving/loading mechanism.
-
- Parameters
- ----------
- train_file
- dev_file
- test_file
- save_dir
- pretrained_embeddings_file
- min_occur_count
- lstm_layers
- word_dims
- tag_dims
- dropout_emb
- lstm_hiddens
- dropout_lstm_input
- dropout_lstm_hidden
- mlp_arc_size
- mlp_rel_size
- dropout_mlp
- learning_rate
- decay
- decay_steps
- beta_1
- beta_2
- epsilon
- num_buckets_train
- num_buckets_valid
- num_buckets_test
- train_iters
- train_batch_size
- debug
- """
- super(_Config, self).__init__()
- self.pretrained_embeddings_file = pretrained_embeddings_file
- self.train_file = train_file
- self.dev_file = dev_file
- self.test_file = test_file
- self.min_occur_count = min_occur_count
- self.save_dir = save_dir
- self.lstm_layers = lstm_layers
- self.word_dims = word_dims
- self.tag_dims = tag_dims
- self.dropout_emb = dropout_emb
- self.lstm_hiddens = lstm_hiddens
- self.dropout_lstm_input = dropout_lstm_input
- self.dropout_lstm_hidden = dropout_lstm_hidden
- self.mlp_arc_size = mlp_arc_size
- self.mlp_rel_size = mlp_rel_size
- self.dropout_mlp = dropout_mlp
- self.learning_rate = learning_rate
- self.decay = decay
- self.decay_steps = decay_steps
- self.beta_1 = beta_1
- self.beta_2 = beta_2
- self.epsilon = epsilon
- self.num_buckets_train = num_buckets_train
- self.num_buckets_valid = num_buckets_valid
- self.num_buckets_test = num_buckets_test
- self.train_iters = train_iters
- self.train_batch_size = train_batch_size
- self.debug = debug
-
- @property
- def save_model_path(self):
- return os.path.join(self.save_dir, 'model.bin')
-
- @property
- def save_vocab_path(self):
- return os.path.join(self.save_dir, 'vocab.pkl')
-
- @property
- def save_config_path(self):
- return os.path.join(self.save_dir, 'config.pkl')
-
- def save(self, path=None):
- if not path:
- path = self.save_config_path
- with open(path, 'wb') as f:
- pickle.dump(self, f)
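The parsing `_Config` above persists itself with pickle via the `Savable` base class. The underlying round trip, reduced to plain Python (paths and values here are placeholders):

```python
import os
import pickle
import tempfile

config = {'lstm_layers': 3, 'word_dims': 100, 'learning_rate': 2e-3}
path = os.path.join(tempfile.mkdtemp(), 'config.pkl')
with open(path, 'wb') as f:
    pickle.dump(config, f)
with open(path, 'rb') as f:
    print(pickle.load(f))
```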
diff --git a/scripts/parsing/common/data.py b/scripts/parsing/common/data.py
deleted file mode 100644
index a2ac0585ad..0000000000
--- a/scripts/parsing/common/data.py
+++ /dev/null
@@ -1,474 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""CoNLL format template."""
-
-from collections import Counter
-import numpy as np
-
-import gluonnlp
-from scripts.parsing.common.k_means import KMeans
-
-from .savable import Savable
-
-
-class ConllWord:
- """CoNLL format template, see http://anthology.aclweb.org/W/W06/W06-2920.pdf
-
- Parameters
- ----------
- id : int
- Token counter, starting at 1 for each new sentence.
- form : str
- Word form or punctuation symbol.
- lemma : str
- Lemma or stem (depending on the particular treebank) of word form,
- or an underscore if not available.
- cpos : str
- Coarse-grained part-of-speech tag, where the tagset depends on the treebank.
- pos : str
- Fine-grained part-of-speech tag, where the tagset depends on the treebank.
- feats : str
- Unordered set of syntactic and/or morphological features
- (depending on the particular treebank), or an underscore if not available.
- head : int
- Head of the current token, which is either a value of ID,
- or zero ('0') if the token links to the virtual root node of the sentence.
- relation : str
- Dependency relation to the HEAD.
- phead : int
- Projective head of current token, which is either a value of ID or zero ('0'),
- or an underscore if not available.
- pdeprel : str
- Dependency relation to the PHEAD, or an underscore if not available.
- """
- def __init__(self, idx, form, lemma=None, cpos=None, pos=None, feats=None,
- head=None, relation=None, phead=None, pdeprel=None):
- self.idx = idx
- self.form = form
- self.cpos = cpos
- self.pos = pos
- self.head = head
- self.relation = relation
- self.lemma = lemma
- self.feats = feats
- self.phead = phead
- self.pdeprel = pdeprel
-
- def __str__(self):
- values = [str(self.idx), self.form, self.lemma, self.cpos, self.pos, self.feats,
- str(self.head), self.relation, self.phead, self.pdeprel]
- return '\t'.join(['_' if v is None else v for v in values])
-
-
-class ConllSentence:
- """A list of ConllWord
-
- Parameters
- ----------
- words : list of ConllWord
- the words of a sentence
- """
- def __init__(self, words):
- super().__init__()
- self.words = words
-
- def __str__(self):
- return '\n'.join([word.__str__() for word in self.words])
-
- def __len__(self):
- return len(self.words)
-
- def __getitem__(self, index):
- return self.words[index]
-
- def __iter__(self):
- return (line for line in self.words)
-
-
-class ParserVocabulary(Savable):
- """Vocabulary, holds word, tag and relation along with their id.
-
- Load from conll file
- Adapted from https://github.com/jcyk/Dynet-Biaffine-dependency-parser with some modifications
-
- Parameters
- ----------
- input_file : str
- conll file
- pret_embeddings : tuple
- (embedding_name, source), used for gluonnlp.embedding.create(embedding_name, source)
- min_occur_count : int
- word frequency threshold; words occurring no more than this many times are replaced by UNK
- """
- def __init__(self, input_file, pret_embeddings=None, min_occur_count=2):
- super().__init__()
- word_counter = Counter()
- tag_set = set()
- rel_set = set()
-
- with open(input_file) as f:
- for line in f:
- info = line.strip().split()
- if info:
- if len(info) == 10:
- rel_offset = 7
- elif len(info) == 8:
- rel_offset = 6
- word, tag = info[1].lower(), info[3]
- rel = info[rel_offset]
- word_counter[word] += 1
- tag_set.add(tag)
- if rel != 'root':
- rel_set.add(rel)
-
- self._id2word = ['<pad>', '<root>', '<unk>']
- self._id2tag = ['<pad>', '<root>', '<unk>']
- self._id2rel = ['<pad>', 'root']
-
- def reverse(x):
- return dict(list(zip(x, list(range(len(x))))))
-
- for word, count in word_counter.most_common():
- if count > min_occur_count:
- self._id2word.append(word)
-
- self._pret_embeddings = pret_embeddings
- self._words_in_train_data = len(self._id2word)
- if pret_embeddings:
- self._add_pret_words(pret_embeddings)
- self._id2tag += list(tag_set)
- self._id2rel += list(rel_set)
-
- self._word2id = reverse(self._id2word)
- self._tag2id = reverse(self._id2tag)
- self._rel2id = reverse(self._id2rel)
-
- PAD, ROOT, UNK = 0, 1, 2 # Padding, Root, Unknown
-
- def log_info(self, logger):
- """Print statistical information via the provided logger
-
- Parameters
- ----------
- logger : logging.Logger
- logger created using logging.getLogger()
- """
- logger.info('#words in training set: %d', self._words_in_train_data)
- logger.info('Vocab info: #words %d, #tags %d #rels %d',
- self.vocab_size, self.tag_size, self.rel_size)
-
- def _add_pret_words(self, pret_embeddings):
- """Read pre-trained embedding file for extending vocabulary
-
- Parameters
- ----------
- pret_embeddings : tuple
- (embedding_name, source), used for gluonnlp.embedding.create(embedding_name, source)
- """
- words_in_train_data = set(self._id2word)
- pret_embeddings = gluonnlp.embedding.create(pret_embeddings[0], source=pret_embeddings[1])
-
- for token in pret_embeddings.idx_to_token:
- if token not in words_in_train_data:
- self._id2word.append(token)
-
- def has_pret_embs(self):
- """Check whether this vocabulary contains words from pre-trained embeddings
-
- Returns
- -------
- bool : Whether this vocabulary contains words from pre-trained embeddings
- """
- return self._pret_embeddings is not None
-
- def get_pret_embs(self, word_dims=None):
- """Read pre-trained embedding file
-
- Parameters
- ----------
- word_dims : int or None
- vector size. Use `None` for auto-infer
-
- Returns
- -------
- numpy.ndarray
- T x C numpy NDArray
- """
- assert self._pret_embeddings is not None, 'No pretrained file provided.'
- pret_embeddings = gluonnlp.embedding.create(self._pret_embeddings[0],
- source=self._pret_embeddings[1])
- embs = [None] * len(self._id2word)
- for idx, vec in enumerate(pret_embeddings.idx_to_vec):
- embs[idx] = vec.asnumpy()
- if word_dims is None:
- word_dims = len(pret_embeddings.idx_to_vec[0])
- for idx, emb in enumerate(embs):
- if emb is None:
- embs[idx] = np.zeros(word_dims)
- pret_embs = np.array(embs, dtype=np.float32)
- return pret_embs / np.std(pret_embs)
-
- def get_word_embs(self, word_dims):
- """Get randomly initialized embeddings when pre-trained embeddings are used,
- otherwise zero vectors.
-
- Parameters
- ----------
- word_dims : int
- word vector size
- Returns
- -------
- numpy.ndarray
- T x C numpy NDArray
- """
- if self._pret_embeddings is not None:
- return np.random.randn(self.words_in_train, word_dims).astype(np.float32)
- return np.zeros((self.words_in_train, word_dims), dtype=np.float32)
-
- def get_tag_embs(self, tag_dims):
- """Randomly initialize embeddings for tag
-
- Parameters
- ----------
- tag_dims : int
- tag vector size
-
- Returns
- -------
- numpy.ndarray
- random embeddings
- """
- return np.random.randn(self.tag_size, tag_dims).astype(np.float32)
-
- def word2id(self, xs):
- """Map word(s) to its id(s)
-
- Parameters
- ----------
- xs : str or list
- word or a list of words
-
- Returns
- -------
- int or list
- id or a list of ids
- """
- if isinstance(xs, list):
- return [self._word2id.get(x, self.UNK) for x in xs]
- return self._word2id.get(xs, self.UNK)
-
- def id2word(self, xs):
- """Map id(s) to word(s)
-
- Parameters
- ----------
- xs : int
- id or a list of ids
-
- Returns
- -------
- str or list
- word or a list of words
- """
- if isinstance(xs, list):
- return [self._id2word[x] for x in xs]
- return self._id2word[xs]
-
- def rel2id(self, xs):
- """Map relation(s) to id(s)
-
- Parameters
- ----------
- xs : str or list
- relation
-
- Returns
- -------
- int or list
- id(s) of relation
- """
- if isinstance(xs, list):
- return [self._rel2id[x] for x in xs]
- return self._rel2id[xs]
-
- def id2rel(self, xs):
- """Map id(s) to relation(s)
-
- Parameters
- ----------
- xs : int
- id or a list of ids
-
- Returns
- -------
- str or list
- relation or a list of relations
- """
- if isinstance(xs, list):
- return [self._id2rel[x] for x in xs]
- return self._id2rel[xs]
-
- def tag2id(self, xs):
- """Map tag(s) to id(s)
-
- Parameters
- ----------
- xs : str or list
- tag or tags
-
- Returns
- -------
- int or list
- id(s) of tag(s)
- """
- if isinstance(xs, list):
- return [self._tag2id.get(x, self.UNK) for x in xs]
- return self._tag2id.get(xs, self.UNK)
-
- @property
- def words_in_train(self):
- """
- get #words in training set
- Returns
- -------
- int
- #words in training set
- """
- return self._words_in_train_data
-
- @property
- def vocab_size(self):
- return len(self._id2word)
-
- @property
- def tag_size(self):
- return len(self._id2tag)
-
- @property
- def rel_size(self):
- return len(self._id2rel)
-
-
-class DataLoader:
- """
- Load CoNLL data
- Adapted from https://github.com/jcyk/Dynet-Biaffine-dependency-parser with some modifications
-
- Parameters
- ----------
- input_file : str
- path to CoNLL file
- n_bkts : int
- number of buckets
- vocab : ParserVocabulary
- vocabulary object
- """
-
- def __init__(self, input_file, n_bkts, vocab):
- self.vocab = vocab
- sents = []
- sent = [[ParserVocabulary.ROOT, ParserVocabulary.ROOT, 0, ParserVocabulary.ROOT]]
- with open(input_file) as f:
- for line in f:
- info = line.strip().split()
- if info:
- arc_offset = 5
- rel_offset = 6
- if len(info) == 10:
- arc_offset = 6
- rel_offset = 7
- assert info[rel_offset] in vocab._rel2id, 'Relation OOV: %s' % line
- word, tag = vocab.word2id(info[1].lower()), vocab.tag2id(info[3])
- head, rel = int(info[arc_offset]), vocab.rel2id(info[rel_offset])
- sent.append([word, tag, head, rel])
- else:
- sents.append(sent)
- sent = [[ParserVocabulary.ROOT, ParserVocabulary.ROOT, 0,
- ParserVocabulary.ROOT]]
- if len(sent) > 1: # last sent in file without '\n'
- sents.append(sent)
-
- self.samples = len(sents)
- len_counter = Counter()
- for sent in sents:
- len_counter[len(sent)] += 1
- self._bucket_lengths = KMeans(n_bkts, len_counter).splits
- self._buckets = [[] for i in range(n_bkts)]
- # bkt_idx x length x sent_idx x 4
- len2bkt = {}
- prev_length = -1
- for bkt_idx, length in enumerate(self._bucket_lengths):
- len2bkt.update(list(zip(list(range(prev_length + 1, length + 1)),
- [bkt_idx] * (length - prev_length))))
- prev_length = length
-
- self._record = []
- for sent in sents:
- bkt_idx = len2bkt[len(sent)]
- idx = len(self._buckets[bkt_idx])
- self._buckets[bkt_idx].append(sent)
- self._record.append((bkt_idx, idx))
-
- for bkt_idx, (bucket, length) in enumerate(zip(self._buckets, self._bucket_lengths)):
- self._buckets[bkt_idx] = np.zeros((length, len(bucket), 4), dtype=np.int32)
- for idx, sent in enumerate(bucket):
- self._buckets[bkt_idx][:len(sent), idx, :] = np.array(sent, dtype=np.int32)
-
- @property
- def idx_sequence(self):
- """Indices of sentences when enumerating data set from batches.
- Useful when retrieving the correct order of sentences
-
- Returns
- -------
- list
- List of ids ranging from 0 to #sent -1
- """
- return [x[1] for x in sorted(zip(self._record, list(range(len(self._record)))))]
-
- def get_batches(self, batch_size, shuffle=True):
- """Get batch iterator
-
- Parameters
- ----------
- batch_size : int
- size of one batch
- shuffle : bool
- whether to shuffle batches. Don't set to True when evaluating on dev or test set.
- Returns
- -------
- tuple
- word_inputs, tag_inputs, arc_targets, rel_targets
- """
- batches = []
- for bkt_idx, bucket in enumerate(self._buckets):
- bucket_size = bucket.shape[1]
- n_tokens = bucket_size * self._bucket_lengths[bkt_idx]
- n_splits = min(max(n_tokens // batch_size, 1), bucket_size)
- range_func = np.random.permutation if shuffle else np.arange
- for bkt_batch in np.array_split(range_func(bucket_size), n_splits):
- batches.append((bkt_idx, bkt_batch))
-
- if shuffle:
- np.random.shuffle(batches)
-
- for bkt_idx, bkt_batch in batches:
- word_inputs = self._buckets[bkt_idx][:, bkt_batch, 0] # word_id x sent_id
- tag_inputs = self._buckets[bkt_idx][:, bkt_batch, 1]
- arc_targets = self._buckets[bkt_idx][:, bkt_batch, 2]
- rel_targets = self._buckets[bkt_idx][:, bkt_batch, 3]
- yield word_inputs, tag_inputs, arc_targets, rel_targets
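For orientation, this is roughly how the vocabulary and loader deleted above fit together; the file path, bucket count and batch size below are hypothetical, and the module is assumed importable:

```python
# Sketch under assumptions: 'train.conllx' is a CoNLL-format file on disk.
vocab = ParserVocabulary('train.conllx', pret_embeddings=None, min_occur_count=2)
loader = DataLoader('train.conllx', n_bkts=10, vocab=vocab)
for words, tags, arcs, rels in loader.get_batches(batch_size=5000, shuffle=True):
    # each array is seq_len x batch_size; index 0 of every sentence is the ROOT token
    pass
```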
diff --git a/scripts/parsing/common/exponential_scheduler.py b/scripts/parsing/common/exponential_scheduler.py
deleted file mode 100644
index 3773a129f2..0000000000
--- a/scripts/parsing/common/exponential_scheduler.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Learning rate schedule for parser training."""
-
-from mxnet.lr_scheduler import LRScheduler
-
-
-class ExponentialScheduler(LRScheduler):
- """A simple learning rate decay scheduler
- lr = base_lr * decay_rate ^ (num_update / decay_every)
-
- Parameters
- ----------
- base_lr : float
- the initial learning rate.
- decay_rate : float
- the factor the learning rate is multiplied by after every `decay_every` updates
- decay_every : float
- the number of updates per decay step
- """
- def __init__(self, base_lr=0.01, decay_rate=0.5, decay_every=1):
- super().__init__(base_lr)
- self.decay_rate = decay_rate
- self.decay_every = decay_every
-
- def __call__(self, num_update):
- return self.base_lr * self.decay_rate ** (num_update / self.decay_every)
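A quick numeric check of the decay formula, using the defaults the parser config above passes in (learning_rate=2e-3, decay=0.75, decay_steps=5000); this assumes MXNet is installed, since the class subclasses mxnet's LRScheduler:

```python
sched = ExponentialScheduler(base_lr=2e-3, decay_rate=0.75, decay_every=5000)
print(sched(0))      # 0.002
print(sched(5000))   # 0.002 * 0.75   = 0.0015
print(sched(10000))  # 0.002 * 0.75^2 = 0.001125
```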
diff --git a/scripts/parsing/common/k_means.py b/scripts/parsing/common/k_means.py
deleted file mode 100755
index 512ee0d2a0..0000000000
--- a/scripts/parsing/common/k_means.py
+++ /dev/null
@@ -1,183 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2016 Timothy Dozat
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""KMeans utility."""
-
-from collections import Counter
-
-import numpy as np
-
-
-class KMeans:
- """
- Cluster sentences by their lengths
-
- Parameters
- ----------
- k : int
- number of clusters
- len_cntr : Counter
- length counter
- """
- def __init__(self, k, len_cntr):
- # Error checking
- if len(len_cntr) < k:
- raise ValueError('Trying to sort %d data points into %d buckets' % (len(len_cntr), k))
-
- # Initialize variables
- self._k = k
- self._len_cntr = len_cntr
- self._lengths = sorted(self._len_cntr.keys())
- self._splits = []
- self._split2len_idx = {}
- self._len2split_idx = {}
- self._split_cntr = Counter()
-
- # Initialize the splits evenly
- lengths = []
- unique_length = []
- for length, count in list(self._len_cntr.items()):
- lengths.extend([length] * count)
- unique_length.append(length)
- lengths.sort()
- unique_length.sort()
- self._splits = [np.max(split) for split in np.array_split(lengths, self._k)]
-
- i = len(self._splits) - 1
- while i > 0:
- while self._splits[i - 1] >= self._splits[i]:
- index = unique_length.index(self._splits[i - 1])
- if index == 0:
- break
- self._splits[i - 1] = unique_length[index - 1]
- i -= 1
-
- unique_length.reverse()
- i = 1
- while i < len(self._splits) - 1:
- while self._splits[i] <= self._splits[i - 1]:
- index = unique_length.index(self._splits[i])
- if index == 0:
- break
- self._splits[i] = unique_length[index - 1]
- i += 1
-
- # Reindex everything
- split_idx = 0
- split = self._splits[split_idx]
- for len_idx, length in enumerate(self._lengths):
- count = self._len_cntr[length]
- self._split_cntr[split] += count
- if length == split:
- self._split2len_idx[split] = len_idx
- split_idx += 1
- if split_idx < len(self._splits):
- split = self._splits[split_idx]
- self._split_cntr[split] = 0
- elif length > split:
- raise IndexError()
-
- # Iterate
- old_splits = None
- # print('0) Initial splits: %s; Initial mass: %d' % (self._splits, self.get_mass()))
- i = 0
- while self._splits != old_splits:
- old_splits = list(self._splits)
- self._recenter()
- i += 1
- # print('%d) Final splits: %s; Final mass: %d' % (i, self._splits, self.get_mass()))
-
- self._reindex()
-
- def _recenter(self):
- """
- one iteration of k-means
- """
- for split_idx in range(len(self._splits)):
- split = self._splits[split_idx]
- len_idx = self._split2len_idx[split]
- if split == self._splits[-1]:
- continue
- right_split = self._splits[split_idx + 1]
-
- # Try shifting the centroid to the left
- if len_idx > 0 and self._lengths[len_idx - 1] not in self._split_cntr:
- new_split = self._lengths[len_idx - 1]
- left_delta = (self._len_cntr[split] * (right_split - new_split)
- - self._split_cntr[split] * (split - new_split))
- if left_delta < 0:
- self._splits[split_idx] = new_split
- self._split2len_idx[new_split] = len_idx - 1
- del self._split2len_idx[split]
- self._split_cntr[split] -= self._len_cntr[split]
- self._split_cntr[right_split] += self._len_cntr[split]
- self._split_cntr[new_split] = self._split_cntr[split]
- del self._split_cntr[split]
-
- # Try shifting the centroid to the right
- elif len_idx < len(self._lengths) - 2 \
- and self._lengths[len_idx + 1] not in self._split_cntr:
- new_split = self._lengths[len_idx + 1]
- right_delta = (self._split_cntr[split] * (new_split - split)
- - self._len_cntr[split] * (new_split - split))
- if right_delta <= 0:
- self._splits[split_idx] = new_split
- self._split2len_idx[new_split] = len_idx + 1
- del self._split2len_idx[split]
- self._split_cntr[split] += self._len_cntr[split]
- self._split_cntr[right_split] -= self._len_cntr[split]
- self._split_cntr[new_split] = self._split_cntr[split]
- del self._split_cntr[split]
-
- def _reindex(self):
- """
- Index every sentence into a cluster
- """
- self._len2split_idx = {}
- last_split = -1
- for split_idx, split in enumerate(self._splits):
- self._len2split_idx.update(
- dict(list(zip(list(range(last_split + 1, split)),
- [split_idx] * (split - (last_split + 1))))))
-
- def __len__(self):
- return self._k
-
- def __iter__(self):
- return (split for split in self.splits)
-
- def __getitem__(self, key):
- return self._splits[key]
-
- @property
- def splits(self):
- """Get clusters
-
- Returns
- -------
- list
- the maximum sentence length (right edge) of each bucket
- """
- return self._splits
-
- @property
- def len2split_idx(self):
- """Get length to bucket mapping
-
- Returns
- -------
- dict
- mapping from sentence length to bucket index
- """
- return self._len2split_idx
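A minimal usage sketch: feed the clustering a Counter of sentence lengths and read back the bucket boundaries (the counts below are made up for illustration):

```python
from collections import Counter

len_cntr = Counter({3: 10, 5: 7, 8: 4, 12: 2, 20: 1})  # length -> #sentences
km = KMeans(3, len_cntr)
print(km.splits)  # the right edge (maximum sentence length) of each bucket
```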
diff --git a/scripts/parsing/common/savable.py b/scripts/parsing/common/savable.py
deleted file mode 100644
index 55dd42909c..0000000000
--- a/scripts/parsing/common/savable.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility base class for saving objects."""
-
-import pickle
-
-
-class Savable:
- """
- A base class providing pickle-based save/load operations.
- """
-
- def __init__(self):
- super().__init__()
-
- def save(self, path):
- """Save to path
-
- Parameters
- ----------
- path : str
- file path
- """
- with open(path, 'wb') as f:
- pickle.dump(self, f)
-
- @staticmethod
- def load(path):
- """Load from path
-
- Parameters
- ----------
- path : str
- file path
-
- Returns
- -------
- Savable
- An object
- """
- with open(path, 'rb') as f:
- return pickle.load(f)
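A round-trip sketch for the base class above (the subclass and file name are hypothetical):

```python
class Stats(Savable):
    def __init__(self, n):
        super().__init__()
        self.n = n

Stats(42).save('stats.pkl')          # pickles the whole object
print(Savable.load('stats.pkl').n)   # 42
```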
diff --git a/scripts/parsing/common/tarjan.py b/scripts/parsing/common/tarjan.py
deleted file mode 100755
index bf60a2adaf..0000000000
--- a/scripts/parsing/common/tarjan.py
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2016 Timothy Dozat
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tarjan's algorithm for strongly connected components."""
-
-from collections import defaultdict
-
-
-# ***************************************************************
-class Tarjan:
- """
- Computes Tarjan's algorithm for finding strongly connected components (cycles) of a graph
-
- Attributes:
- edges: dictionary of edges such that edges[dep] = head
- vertices: set of dependents
- SCCs: list of sets of strongly connected components. Non-singleton sets are cycles.
-
- Parameters
- ----------
- prediction : numpy.ndarray
- a predicted dependency tree where prediction[dep_idx] = head_idx
- tokens : numpy.ndarray
- the tokens we care about (i.e. exclude _GO, _EOS, and _PAD)
- """
- def __init__(self, prediction, tokens):
- self._edges = defaultdict(set)
- self._vertices = set((0,))
- for dep, head in enumerate(prediction[tokens]):
- self._vertices.add(dep + 1)
- self._edges[head].add(dep + 1)
- self._indices = {}
- self._lowlinks = {}
- self._onstack = defaultdict(lambda: False)
- self._SCCs = []
-
- index = 0
- stack = []
- for v in self.vertices:
- if v not in self.indices:
- self.strongconnect(v, index, stack)
-
- # =============================================================
- def strongconnect(self, v, index, stack):
- """Find strongly connected components."""
-
- self._indices[v] = index
- self._lowlinks[v] = index
- index += 1
- stack.append(v)
- self._onstack[v] = True
- for w in self.edges[v]:
- if w not in self.indices:
- self.strongconnect(w, index, stack)
- self._lowlinks[v] = min(self._lowlinks[v], self._lowlinks[w])
- elif self._onstack[w]:
- self._lowlinks[v] = min(self._lowlinks[v], self._indices[w])
-
- if self._lowlinks[v] == self._indices[v]:
- self._SCCs.append(set())
- while stack[-1] != v:
- w = stack.pop()
- self._onstack[w] = False
- self._SCCs[-1].add(w)
- w = stack.pop()
- self._onstack[w] = False
- self._SCCs[-1].add(w)
-
- # ======================
- @property
- def edges(self):
- return self._edges
-
- @property
- def vertices(self):
- return self._vertices
-
- @property
- def indices(self):
- return self._indices
-
- @property
- def SCCs(self):
- return self._SCCs
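An illustrative check of the cycle detection on a tiny hand-made "parse" (the arrays are made up; non-singleton SCCs are the cycles a decoder would need to break):

```python
import numpy as np

prediction = np.array([0, 2, 1, 2])  # prediction[dep] = head; slot 0 is the root
tokens = np.arange(1, 4)             # the real tokens, excluding the root
t = Tarjan(prediction, tokens)
print(t.SCCs)                        # non-singleton sets are cycles, here {1, 2}
```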
diff --git a/scripts/parsing/common/utils.py b/scripts/parsing/common/utils.py
deleted file mode 100644
index 06d7aebdba..0000000000
--- a/scripts/parsing/common/utils.py
+++ /dev/null
@@ -1,526 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility classes."""
-
-import logging
-import math
-import os
-import sys
-import time
-
-import numpy as np
-import mxnet as mx
-from mxnet import nd
-from mxnet.gluon import rnn, contrib
-
-from .data import ParserVocabulary
-from .tarjan import Tarjan
-
-
-class Progbar:
- """Progbar class copied from keras (https://github.com/fchollet/keras/)
-
- Displays a progress bar.
- Small edit: added a `strict` arg to `update`
-
- Parameters
- ----------
- target : int
- Total number of steps expected.
- width : int
- Progress bar width.
- verbose : int
- Verbosity level. Options are 1 and 2.
- """
- def __init__(self, target, width=30, verbose=1):
- self.width = width
- self.target = target
- self.sum_values = {}
- self.unique_values = []
- self.start = time.time()
- self.total_width = 0
- self.seen_so_far = 0
- self.verbose = verbose
-
- def update(self, current, values=None, exact=None, strict=None):
- """
- Updates the progress bar.
-
- Parameters
- ----------
- current : int
- Index of current step.
- values : List of tuples (name, value_for_last_step).
- The progress bar will display averages for these values.
- exact : List of tuples (name, value_for_last_step).
- The progress bar will display these values directly.
- strict : List of tuples (name, value).
- The progress bar will display these values as-is, without averaging.
- """
- values = values or []
- exact = exact or []
- strict = strict or []
-
- for k, v in values:
- if k not in self.sum_values:
- self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far]
- self.unique_values.append(k)
- else:
- self.sum_values[k][0] += v * (current - self.seen_so_far)
- self.sum_values[k][1] += (current - self.seen_so_far)
-
- for cells in exact:
- k, v, w = cells[0], cells[1], 4
- if len(cells) == 3:
- w = cells[2]
- if k not in self.sum_values:
- self.unique_values.append(k)
- self.sum_values[k] = [v, 1, w]
-
- for k, v in strict:
- if k not in self.sum_values:
- self.unique_values.append(k)
- self.sum_values[k] = v
-
- self.seen_so_far = current
-
- now = time.time()
- if self.verbose == 1:
- prev_total_width = self.total_width
- sys.stdout.write('\b' * prev_total_width)
- sys.stdout.write('\r')
-
- numdigits = 0 if self.target == 0 or math.isnan(self.target) \
- else int(np.floor(np.log10(self.target))) + 1
- barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
- bar = barstr % (current, self.target)
- prog = 0 if self.target == 0 else float(current) / self.target
- prog_width = int(self.width * prog)
- if prog_width > 0:
- bar += ('=' * (prog_width - 1))
- if current < self.target:
- bar += '>'
- else:
- bar += '='
- bar += ('.' * (self.width - prog_width))
- bar += ']'
- sys.stdout.write(bar)
- self.total_width = len(bar)
-
- if current:
- time_per_unit = (now - self.start) / current
- else:
- time_per_unit = 0
- eta = time_per_unit * (self.target - current)
- info = ''
- if current < self.target:
- info += ' - ETA: %ds' % eta
- else:
- info += ' - %ds' % (now - self.start)
- for k in self.unique_values:
- if isinstance(self.sum_values[k], list):
- info += (' - %s: %.' + str(self.sum_values[k][2]) + 'f') % (
- k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))
- else:
- info += ' - %s: %s' % (k, self.sum_values[k])
-
- self.total_width += len(info)
- if prev_total_width > self.total_width:
- info += ((prev_total_width - self.total_width) * ' ')
-
- sys.stdout.write(info)
- sys.stdout.flush()
-
- if current >= self.target:
- sys.stdout.write('\n')
-
- if self.verbose == 2:
- if current >= self.target:
- info = '%ds' % (now - self.start)
- for k in self.unique_values:
- info += ' - %s: %.4f' % (k,
- self.sum_values[k][0] / max(1, self.sum_values[k][1]))
- sys.stdout.write(info + '\n')
-
- def add(self, n, values=None):
- values = values or []
- self.update(self.seen_so_far + n, values)
-
-
-def mxnet_prefer_gpu():
- """If gpu available return gpu, else cpu
-
- Returns
- -------
- context : Context
- The preferable GPU context.
- """
- gpu = int(os.environ.get('MXNET_GPU', default=0))
- if gpu in mx.test_utils.list_gpus():
- return mx.gpu(gpu)
- return mx.cpu()
-
-
-def init_logger(root_dir, name='train.log'):
- """Initialize a logger
-
- Parameters
- ----------
- root_dir : str
- directory for saving log
- name : str
- name of logger
-
- Returns
- -------
- logger : logging.Logger
- a logger
- """
- os.makedirs(root_dir, exist_ok=True)
- log_formatter = logging.Formatter('%(message)s')
- logger = logging.getLogger(name)
- file_handler = logging.FileHandler('{0}/{1}'.format(root_dir, name), mode='w')
- file_handler.setFormatter(log_formatter)
- logger.addHandler(file_handler)
- console_handler = logging.StreamHandler()
- console_handler.setFormatter(log_formatter)
- logger.addHandler(console_handler)
- logger.setLevel(logging.INFO)
- return logger
-
-
-def orthonormal_VanillaLSTMBuilder(lstm_layers, input_dims, lstm_hiddens,
- dropout_h=0., debug=False):
- """Build a standard LSTM cell, with variational dropout,
- with weights initialized to be orthonormal (https://arxiv.org/abs/1312.6120)
-
- Parameters
- ----------
- lstm_layers : int
- Currently only a single layer is supported
- input_dims : int
- word vector dimensions
- lstm_hiddens : int
- hidden size
- dropout_h : float
- dropout on hidden states
- debug : bool
- set to True to skip orthonormal initialization
-
- Returns
- -------
- lstm_cell : VariationalDropoutCell
- An LSTM cell wrapped in variational dropout
- """
- assert lstm_layers == 1, 'only accept one layer lstm'
- W = orthonormal_initializer(lstm_hiddens, lstm_hiddens + input_dims, debug)
- W_h, W_x = W[:, :lstm_hiddens], W[:, lstm_hiddens:]
- b = nd.zeros((4 * lstm_hiddens,))
- b[lstm_hiddens:2 * lstm_hiddens] = -1.0
- lstm_cell = rnn.LSTMCell(input_size=input_dims, hidden_size=lstm_hiddens,
- i2h_weight_initializer=mx.init.Constant(np.concatenate([W_x] * 4, 0)),
- h2h_weight_initializer=mx.init.Constant(np.concatenate([W_h] * 4, 0)),
- h2h_bias_initializer=mx.init.Constant(b))
- wrapper = contrib.rnn.VariationalDropoutCell(lstm_cell, drop_states=dropout_h)
- return wrapper
-
-
-def biLSTM(f_lstm, b_lstm, inputs, dropout_x=0.):
- """Feature extraction through BiLSTM
-
- Parameters
- ----------
- f_lstm : VariationalDropoutCell
- Forward cell
- b_lstm : VariationalDropoutCell
- Backward cell
- inputs : NDArray
- seq_len x batch_size
- dropout_x : float
- Variational dropout on inputs
-
- Returns
- -------
- outputs : NDArray
- Outputs of BiLSTM layers, seq_len x 2 hidden_dims x batch_size
- """
- for f, b in zip(f_lstm, b_lstm):
- inputs = nd.Dropout(inputs, dropout_x, axes=[0]) # important for variational dropout
- fo, _ = f.unroll(length=inputs.shape[0], inputs=inputs, layout='TNC', merge_outputs=True)
- bo, _ = b.unroll(length=inputs.shape[0], inputs=inputs.flip(axis=0), layout='TNC',
- merge_outputs=True)
- f.reset()
- b.reset()
- inputs = nd.concat(fo, bo.flip(axis=0), dim=2)
- return inputs
-
-
-def leaky_relu(x):
- """slope=0.1 leaky ReLu
-
- Parameters
- ----------
- x : NDArray
- Input
-
- Returns
- -------
- y : NDArray
- y = x > 0 ? x : 0.1 * x
- """
- return nd.LeakyReLU(x, slope=.1)
-
-
-def bilinear(x, W, y, input_size, seq_len, batch_size, num_outputs=1, bias_x=False, bias_y=False):
- """Do xWy
-
- Parameters
- ----------
- x : NDArray
- (input_size x seq_len) x batch_size
- W : NDArray
- (num_outputs x ny) x nx
- y : NDArray
- (input_size x seq_len) x batch_size
- input_size : int
- input dimension
- seq_len : int
- sequence length
- batch_size : int
- batch size
- num_outputs : int
- number of outputs
- bias_x : bool
- whether concat bias vector to input x
- bias_y : bool
- whether concat bias vector to input y
-
- Returns
- -------
- output : NDArray
- [seq_len_y x seq_len_x if num_outputs == 1 else seq_len_y x num_outputs x seq_len_x]
- x batch_size
- """
- if bias_x:
- x = nd.concat(x, nd.ones((1, seq_len, batch_size)), dim=0)
- if bias_y:
- y = nd.concat(y, nd.ones((1, seq_len, batch_size)), dim=0)
-
- ny = input_size + bias_y
- # W: (num_outputs x ny) x nx
- lin = nd.dot(W, x)
- if num_outputs > 1:
- lin = reshape_fortran(lin, (ny, num_outputs * seq_len, batch_size))
- y = y.transpose([2, 1, 0]) # May cause performance issues
- lin = lin.transpose([2, 1, 0])
- blin = nd.batch_dot(lin, y, transpose_b=True)
- blin = blin.transpose([2, 1, 0])
- if num_outputs > 1:
- blin = reshape_fortran(blin, (seq_len, num_outputs, seq_len, batch_size))
- return blin
-
-
-def orthonormal_initializer(output_size, input_size, debug=False):
- """adopted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/linalg.py
-
- Parameters
- ----------
- output_size : int
- input_size : int
- debug : bool
- Whether to skip the orthonormal initialization and return a plain random matrix
-
- Returns
- -------
- Q : np.ndarray
- The orthonormal weight matrix of input_size x output_size
- """
- print((output_size, input_size))
- if debug:
- Q = np.random.randn(input_size, output_size) / np.sqrt(output_size)
- return np.transpose(Q.astype(np.float32))
- I = np.eye(output_size)
- lr = .1
- eps = .05 / (output_size + input_size)
- success = False
- tries = 0
- while not success and tries < 10:
- Q = np.random.randn(input_size, output_size) / np.sqrt(output_size)
- for _ in range(100):
- QTQmI = Q.T.dot(Q) - I
- loss = np.sum(QTQmI ** 2 / 2)
- Q2 = Q ** 2
- Q -= lr * Q.dot(QTQmI) / (
- np.abs(Q2 + Q2.sum(axis=0, keepdims=True)
- + Q2.sum(axis=1, keepdims=True) - 1) + eps)
- if np.max(Q) > 1e6 or loss > 1e6 or not np.isfinite(loss):
- tries += 1
- lr /= 2
- break
- success = True
- if success:
- print(('Orthogonal pretrainer loss: %.2e' % loss))
- else:
- print('Orthogonal pretrainer failed, using non-orthogonal random matrix')
- Q = np.random.randn(input_size, output_size) / np.sqrt(output_size)
- return np.transpose(Q.astype(np.float32))
-
-
-def arc_argmax(parse_probs, length, tokens_to_keep, ensure_tree=True):
- """MST
- Adopted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/models/nn.py
-
- Parameters
- ----------
- parse_probs : NDArray
- seq_len x seq_len, the probability of arcs
- length : int
- real sentence length
- tokens_to_keep : NDArray
- mask matrix
- ensure_tree : bool
- whether to ensure a tree structure in the output (apply MST)
-
- Returns
- -------
- parse_preds : np.ndarray
- prediction of arc parsing with size of (seq_len,)
- """
- if ensure_tree:
- I = np.eye(len(tokens_to_keep))
- # block loops and pad heads
- parse_probs = parse_probs * tokens_to_keep * (1 - I)
- parse_preds = np.argmax(parse_probs, axis=1)
- tokens = np.arange(1, length)
- roots = np.where(parse_preds[tokens] == 0)[0] + 1
- # ensure at least one root
- if len(roots) < 1:
- # The current root probabilities
- root_probs = parse_probs[tokens, 0]
- # The current head probabilities
- old_head_probs = parse_probs[tokens, parse_preds[tokens]]
- # Get new potential root probabilities
- new_root_probs = root_probs / old_head_probs
- # Select the most probable root
- new_root = tokens[np.argmax(new_root_probs)]
- # Make the change
- parse_preds[new_root] = 0
- # ensure at most one root
- elif len(roots) > 1:
- # The probabilities of the current heads
- root_probs = parse_probs[roots, 0]
- # Set the probability of depending on the root zero
- parse_probs[roots, 0] = 0
- # Get new potential heads and their probabilities
- new_heads = np.argmax(parse_probs[roots][:, tokens], axis=1) + 1
- new_head_probs = parse_probs[roots, new_heads] / root_probs
- # Select the most probable root
- new_root = roots[np.argmin(new_head_probs)]
- # Make the change
- parse_preds[roots] = new_heads
- parse_preds[new_root] = 0
- # remove cycles
- tarjan = Tarjan(parse_preds, tokens)
- for SCC in tarjan.SCCs:
- if len(SCC) > 1:
- dependents = set()
- to_visit = set(SCC)
- while len(to_visit) > 0:
- node = to_visit.pop()
- if not node in dependents:
- dependents.add(node)
- to_visit.update(tarjan.edges[node])
- # The indices of the nodes that participate in the cycle
- cycle = np.array(list(SCC))
- # The probabilities of the current heads
- old_heads = parse_preds[cycle]
- old_head_probs = parse_probs[cycle, old_heads]
- # Set the probability of depending on a non-head to zero
- non_heads = np.array(list(dependents))
- parse_probs[np.repeat(cycle, len(non_heads)),
- np.repeat([non_heads], len(cycle), axis=0).flatten()] = 0
- # Get new potential heads and their probabilities
- new_heads = np.argmax(parse_probs[cycle][:, tokens], axis=1) + 1
- new_head_probs = parse_probs[cycle, new_heads] / old_head_probs
- # Select the most probable change
- change = np.argmax(new_head_probs)
- changed_cycle = cycle[change]
- old_head = old_heads[change]
- new_head = new_heads[change]
- # Make the change
- parse_preds[changed_cycle] = new_head
- tarjan.edges[new_head].add(changed_cycle)
- tarjan.edges[old_head].remove(changed_cycle)
- return parse_preds
- else:
- # block and pad heads
- parse_probs = parse_probs * tokens_to_keep
- parse_preds = np.argmax(parse_probs, axis=1)
- return parse_preds
-
-
-def rel_argmax(rel_probs, length, ensure_tree=True):
- """Fix the relation prediction by heuristic rules
-
- Parameters
- ----------
- rel_probs : NDArray
- seq_len x rel_size
- length : int
- real sentence length
- ensure_tree : bool
- whether to apply the heuristic rules
-
- Returns
- -------
- rel_preds : np.ndarray
- prediction of relations of size (seq_len,)
- """
- if ensure_tree:
- rel_probs[:, ParserVocabulary.PAD] = 0
- root = ParserVocabulary.ROOT
- tokens = np.arange(1, length)
- rel_preds = np.argmax(rel_probs, axis=1)
- roots = np.where(rel_preds[tokens] == root)[0] + 1
- if len(roots) < 1:
- rel_preds[1 + np.argmax(rel_probs[tokens, root])] = root
- elif len(roots) > 1:
- root_probs = rel_probs[roots, root]
- rel_probs[roots, root] = 0
- new_rel_preds = np.argmax(rel_probs[roots], axis=1)
- new_rel_probs = rel_probs[roots, new_rel_preds] / root_probs
- new_root = roots[np.argmin(new_rel_probs)]
- rel_preds[roots] = new_rel_preds
- rel_preds[new_root] = root
- return rel_preds
- else:
- rel_probs[:, ParserVocabulary.PAD] = 0
- rel_preds = np.argmax(rel_probs, axis=1)
- return rel_preds
-
-
-def reshape_fortran(tensor, shape):
- """The missing Fortran reshape for mx.NDArray
-
- Parameters
- ----------
- tensor : NDArray
- source tensor
- shape : tuple
- desired shape
-
- Returns
- -------
- output : NDArray
- reordered result
- """
- return tensor.T.reshape(tuple(reversed(shape))).T
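A quick sanity check that `reshape_fortran` above matches NumPy's column-major reshape (a sketch assuming MXNet is installed):

```python
import numpy as np
from mxnet import nd

x = np.arange(6).reshape(2, 3)
print(reshape_fortran(nd.array(x), (3, 2)).asnumpy())
print(x.reshape((3, 2), order='F'))  # same values, column-major order
```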
diff --git a/scripts/parsing/index.rst b/scripts/parsing/index.rst
deleted file mode 100644
index 878f9d9d2b..0000000000
--- a/scripts/parsing/index.rst
+++ /dev/null
@@ -1,79 +0,0 @@
-Dependency Parsing
----------------------------------
-
-:download:`Download scripts `
-
-Deep Biaffine Dependency Parser
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-This package contains an implementation of `Deep Biaffine Attention for Neural Dependency Parsing `_ proposed by Dozat and Manning (2016), with SOTA accuracy.
-
-Train
-""""""""""
-
-As the Penn Treebank dataset (PTB) is proprietary, we are unable to distribute it.
-If you have a legal copy, please place it in ``tests/data/biaffine/ptb`` and use this `pre-processing script `_ to convert it into the conllx format.
-The tree view of data folder should be as follows.
-
-.. code-block:: console
-
- $ tree tests/data/biaffine
- tests/data/biaffine
- └── ptb
- ├── dev.conllx
- ├── test.conllx
- └── train.conllx
-
-Then run the following code to train the biaffine model.
-
-.. code-block:: python
-
- parser = DepParser()
- parser.train(train_file='tests/data/biaffine/ptb/train.conllx',
- dev_file='tests/data/biaffine/ptb/dev.conllx',
- test_file='tests/data/biaffine/ptb/test.conllx', save_dir='tests/data/biaffine/model',
- pretrained_embeddings=('glove', 'glove.6B.100d'))
- parser.evaluate(test_file='tests/data/biaffine/ptb/test.conllx', save_dir='tests/data/biaffine/model')
-
-
-The expected UAS should be around ``96%`` (see `training log `_ and `evaluation log `_). The trained model will be saved in the following folder.
-
-.. code-block:: console
-
- $ tree tests/data/biaffine/model
- tests/data/biaffine/model
- ├── config.pkl
- ├── model.bin
- ├── test.log
- ├── train.log
- └── vocab.pkl
-
-Note that the embeddings are not kept in ``model.bin``, in order to reduce file size.
-Users need to keep the embeddings in the same place after training.
-A good practice is to place the embeddings in the model folder and distribute them together.
-
-Decode
-""""""""""
-
-Once we have trained a model or downloaded a pre-trained one, we can load it and decode raw sentences.
-
-.. code-block:: python
-
- parser = DepParser()
- parser.load('tests/data/biaffine/model')
- sentence = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'),
- ('music', 'NN'), ('?', '.')]
- print(parser.parse(sentence))
-
-
-The output should be as follows.
-
-.. code-block:: text
-
- 1 Is _ _ VBZ _ 4 cop _ _
- 2 this _ _ DT _ 4 nsubj _ _
- 3 the _ _ DT _ 4 det _ _
- 4 future _ _ NN _ 0 root _ _
- 5 of _ _ IN _ 4 prep _ _
- 6 chamber _ _ NN _ 7 nn _ _
- 7 music _ _ NN _ 5 pobj _ _
- 8 ? _ _ . _ 4 punct _ _
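For readers unfamiliar with the metric quoted above: UAS is simply the fraction of tokens whose predicted head matches the gold head. A rough sketch of that computation over two line-aligned conllx files (hypothetical paths; it ignores the punctuation-exclusion conventions of the official evaluation script):

```python
def uas(gold_path, pred_path):
    correct = total = 0
    with open(gold_path) as gold, open(pred_path) as pred:
        for gline, pline in zip(gold, pred):
            gcols = gline.rstrip('\n').split('\t')
            pcols = pline.rstrip('\n').split('\t')
            if len(gcols) < 8:          # blank line between sentences
                continue
            total += 1
            correct += gcols[6] == pcols[6]   # column 7 (1-based) is HEAD
    return correct / total
```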
diff --git a/scripts/parsing/parser/__init__.py b/scripts/parsing/parser/__init__.py
deleted file mode 100644
index 13a83393a9..0000000000
--- a/scripts/parsing/parser/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
diff --git a/scripts/parsing/parser/biaffine_parser.py b/scripts/parsing/parser/biaffine_parser.py
deleted file mode 100644
index 5c9dfdd5ed..0000000000
--- a/scripts/parsing/parser/biaffine_parser.py
+++ /dev/null
@@ -1,357 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Deep Biaffine Parser Model."""
-
-import numpy as np
-import mxnet as mx
-from mxnet import nd, ndarray, autograd
-from mxnet.gluon import nn, loss
-
-from scripts.parsing.common import utils
-from gluonnlp.model import apply_weight_drop
-
-
-class BiaffineParser(nn.Block):
- """A MXNet replicate of biaffine parser, see following paper
- Dozat, T., & Manning, C. D. (2016). Deep biaffine attention for neural dependency parsing.
- arXiv:1611.01734.
-
- It's a re-implementation of DyNet version
- https://github.com/jcyk/Dynet-Biaffine-dependency-parser
-
- Parameters
- ----------
- vocab : ParserVocabulary
- built from a data set
- word_dims : int
- word vector dimension
- tag_dims : int
- tag vector dimension
- dropout_dim : float
- rate of word dropout (dropping out the entire embedding of a word)
- lstm_layers : int
- number of lstm layers
- lstm_hiddens : int
- size of lstm hidden states
- dropout_lstm_input : float
- dropout on x in variational RNN
- dropout_lstm_hidden : float
- dropout on h in variational RNN
- mlp_arc_size : int
- output size of MLP for arc feature extraction
- mlp_rel_size : int
- output size of MLP for rel feature extraction
- dropout_mlp : float
- dropout on the output of LSTM
- debug : bool
- debug mode
- """
- def __init__(self, vocab,
- word_dims,
- tag_dims,
- dropout_dim,
- lstm_layers,
- lstm_hiddens,
- dropout_lstm_input,
- dropout_lstm_hidden,
- mlp_arc_size,
- mlp_rel_size,
- dropout_mlp,
- debug=False):
- super(BiaffineParser, self).__init__()
-
- def embedding_from_numpy(_we, trainable=True):
- word_embs = nn.Embedding(_we.shape[0], _we.shape[1],
- weight_initializer=mx.init.Constant(_we))
- apply_weight_drop(word_embs, 'weight', dropout_dim, axes=(1,))
- if not trainable:
- word_embs.collect_params().setattr('grad_req', 'null')
- return word_embs
-
- self._vocab = vocab
- self.word_embs = embedding_from_numpy(vocab.get_word_embs(word_dims))
- self.pret_word_embs = embedding_from_numpy(vocab.get_pret_embs(),
- trainable=False) if vocab.has_pret_embs() \
- else None
- self.tag_embs = embedding_from_numpy(vocab.get_tag_embs(tag_dims))
-
- self.f_lstm = nn.Sequential()
- self.b_lstm = nn.Sequential()
- self.f_lstm.add(utils.orthonormal_VanillaLSTMBuilder(1, word_dims + tag_dims,
- lstm_hiddens,
- dropout_lstm_hidden, debug))
- self.b_lstm.add(
- utils.orthonormal_VanillaLSTMBuilder(1, word_dims + tag_dims,
- lstm_hiddens,
- dropout_lstm_hidden, debug))
- for _ in range(lstm_layers - 1):
- self.f_lstm.add(
- utils.orthonormal_VanillaLSTMBuilder(1, 2 * lstm_hiddens,
- lstm_hiddens,
- dropout_lstm_hidden, debug))
- self.b_lstm.add(
- utils.orthonormal_VanillaLSTMBuilder(1, 2 * lstm_hiddens,
- lstm_hiddens,
- dropout_lstm_hidden, debug))
- self.dropout_lstm_input = dropout_lstm_input
- self.dropout_lstm_hidden = dropout_lstm_hidden
-
- mlp_size = mlp_arc_size + mlp_rel_size
- W = utils.orthonormal_initializer(mlp_size, 2 * lstm_hiddens, debug)
- self.mlp_dep_W = self.parameter_from_numpy('mlp_dep_W', W)
- self.mlp_head_W = self.parameter_from_numpy('mlp_head_W', W)
- self.mlp_dep_b = self.parameter_init('mlp_dep_b', (mlp_size,), mx.init.Zero())
- self.mlp_head_b = self.parameter_init('mlp_head_b', (mlp_size,), mx.init.Zero())
- self.mlp_arc_size = mlp_arc_size
- self.mlp_rel_size = mlp_rel_size
- self.dropout_mlp = dropout_mlp
-
- self.arc_W = self.parameter_init('arc_W', (mlp_arc_size, mlp_arc_size + 1),
- init=mx.init.Zero())
- self.rel_W = self.parameter_init('rel_W', (vocab.rel_size * (mlp_rel_size + 1),
- mlp_rel_size + 1),
- init=mx.init.Zero())
- self.softmax_loss = loss.SoftmaxCrossEntropyLoss(axis=0, batch_axis=-1)
-
- self.initialize()
-
- def parameter_from_numpy(self, name, array):
- """ Create parameter with its value initialized according to a numpy tensor
-
- Parameters
- ----------
- name : str
- parameter name
- array : np.ndarray
- initial value
-
- Returns
- -------
- mxnet.gluon.parameter
- a parameter object
- """
- p = self.params.get(name, shape=array.shape, init=mx.init.Constant(array))
- return p
-
- def parameter_init(self, name, shape, init):
- """Create parameter given name, shape and initiator
-
- Parameters
- ----------
- name : str
- parameter name
- shape : tuple
- parameter shape
- init : mxnet.initializer
- an initializer
-
- Returns
- -------
- mxnet.gluon.parameter
- a parameter object
- """
- p = self.params.get(name, shape=shape, init=init)
- return p
-
- def forward(self, word_inputs, tag_inputs, arc_targets=None, rel_targets=None):
- # pylint: disable=arguments-differ
- """Run decoding
-
- Parameters
- ----------
- word_inputs : mxnet.ndarray.NDArray
- word indices of seq_len x batch_size
- tag_inputs : mxnet.ndarray.NDArray
- tag indices of seq_len x batch_size
- arc_targets : mxnet.ndarray.NDArray
- gold arc indices of seq_len x batch_size
- rel_targets : mxnet.ndarray.NDArray
- gold rel indices of seq_len x batch_size
-
- Returns
- -------
- tuple
- (arc_accuracy, rel_accuracy, overall_accuracy, loss) when training;
- (arc_accuracy, rel_accuracy, overall_accuracy, outputs) when gold targets are given at test time;
- otherwise outputs, where outputs is a list of (arcs, rels) pairs, one per sentence.
- """
- def flatten_numpy(arr):
- """Flatten nd-array to 1-d column vector
-
- Parameters
- ----------
- arr : numpy.ndarray
- input tensor
-
- Returns
- -------
- numpy.ndarray
- A column vector
-
- """
- return np.reshape(arr, (-1,), 'F')
-
- is_train = autograd.is_training()
- batch_size = word_inputs.shape[1]
- seq_len = word_inputs.shape[0]
- mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
- num_tokens = int(np.sum(mask)) # non padding, non root token number
-
- if is_train or arc_targets is not None:
- mask_1D = flatten_numpy(mask)
- mask_1D_tensor = nd.array(mask_1D)
-
- unked_words = np.where(word_inputs < self._vocab.words_in_train,
- word_inputs, self._vocab.UNK)
- word_embs = self.word_embs(nd.array(unked_words, dtype='int'))
- if self.pret_word_embs:
- word_embs = word_embs + self.pret_word_embs(nd.array(word_inputs))
- tag_embs = self.tag_embs(nd.array(tag_inputs))
-
- # Dropout
- emb_inputs = nd.concat(word_embs, tag_embs, dim=2) # seq_len x batch_size
-
- top_recur = utils.biLSTM(self.f_lstm, self.b_lstm, emb_inputs,
- dropout_x=self.dropout_lstm_input)
- top_recur = nd.Dropout(data=top_recur, axes=[0], p=self.dropout_mlp)
-
- W_dep, b_dep = self.mlp_dep_W.data(), self.mlp_dep_b.data()
- W_head, b_head = self.mlp_head_W.data(), self.mlp_head_b.data()
- dep = nd.Dropout(data=utils.leaky_relu(nd.dot(top_recur, W_dep.T) + b_dep),
- axes=[0], p=self.dropout_mlp)
- head = nd.Dropout(data=utils.leaky_relu(nd.dot(top_recur, W_head.T) + b_head),
- axes=[0], p=self.dropout_mlp)
- dep, head = nd.transpose(dep, axes=[2, 0, 1]), nd.transpose(head, axes=[2, 0, 1])
- dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
- head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]
-
- W_arc = self.arc_W.data()
- arc_logits = utils.bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size,
- seq_len, batch_size, num_outputs=1, bias_x=True, bias_y=False)
- # (#head x #dep) x batch_size
-
- flat_arc_logits = utils.reshape_fortran(arc_logits, (seq_len, seq_len * batch_size))
- # (#head ) x (#dep x batch_size)
-
- arc_preds = arc_logits.argmax(0)
- # seq_len x batch_size
-
- if is_train or arc_targets is not None:
- correct = np.equal(arc_preds.asnumpy(), arc_targets)
- arc_correct = correct.astype(np.float32) * mask
- arc_accuracy = np.sum(arc_correct) / num_tokens
- targets_1D = flatten_numpy(arc_targets)
- losses = self.softmax_loss(flat_arc_logits, nd.array(targets_1D))
- arc_loss = nd.sum(losses * mask_1D_tensor) / num_tokens
-
- if not is_train:
- arc_probs = np.transpose(
- np.reshape(nd.softmax(flat_arc_logits, axis=0).asnumpy(),
- (seq_len, seq_len, batch_size), 'F'))
- # #batch_size x #dep x #head
-
- W_rel = self.rel_W.data()
- rel_logits = utils.bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size,
- seq_len, batch_size, num_outputs=self._vocab.rel_size,
- bias_x=True, bias_y=True)
- # (#head x rel_size x #dep) x batch_size
-
- flat_rel_logits = utils.reshape_fortran(rel_logits, (seq_len, self._vocab.rel_size,
- seq_len * batch_size))
- # (#head x rel_size) x (#dep x batch_size)
-
- if is_train: # pylint: disable=using-constant-test
- _target_vec = targets_1D
- else:
- _target_vec = flatten_numpy(arc_preds.asnumpy())
- _target_vec = nd.array(_target_vec).reshape(seq_len * batch_size, 1)
- _target_mat = _target_vec * nd.ones((1, self._vocab.rel_size))
-
- partial_rel_logits = nd.pick(flat_rel_logits, _target_mat.T, axis=0)
- # (rel_size) x (#dep x batch_size)
-
- if is_train or arc_targets is not None:
- rel_preds = partial_rel_logits.argmax(0)
- targets_1D = flatten_numpy(rel_targets)
- rel_correct = np.equal(rel_preds.asnumpy(), targets_1D).astype(np.float32) * mask_1D
- rel_accuracy = np.sum(rel_correct) / num_tokens
- losses = self.softmax_loss(partial_rel_logits, nd.array(targets_1D))
- rel_loss = nd.sum(losses * mask_1D_tensor) / num_tokens
-
- if not is_train:
- rel_probs = np.transpose(np.reshape(nd.softmax(flat_rel_logits.transpose([1, 0, 2]),
- axis=0).asnumpy(),
- (self._vocab.rel_size, seq_len,
- seq_len, batch_size), 'F'))
- # batch_size x #dep x #head x #nclasses
-
- if is_train or arc_targets is not None:
- l = arc_loss + rel_loss
- correct = rel_correct * flatten_numpy(arc_correct)
- overall_accuracy = np.sum(correct) / num_tokens
-
- if is_train: # pylint: disable=using-constant-test
- return arc_accuracy, rel_accuracy, overall_accuracy, l
-
- outputs = []
-
- for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs):
- # parse sentences one by one
- msk[0] = 1.
- sent_len = int(np.sum(msk))
- arc_pred = utils.arc_argmax(arc_prob, sent_len, msk)
- rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
- rel_pred = utils.rel_argmax(rel_prob, sent_len)
- outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))
-
- if arc_targets is not None:
- return arc_accuracy, rel_accuracy, overall_accuracy, outputs
- return outputs
-
- def save_parameters(self, filename): # pylint: disable=arguments-differ
- """Save model
-
- Parameters
- ----------
- filename : str
- path to model file
- """
- params = self._collect_params_with_prefix()
- if self.pret_word_embs: # don't save word embeddings inside model
- params.pop('pret_word_embs.weight', None)
- arg_dict = {key: val._reduce() for key, val in params.items()}
- ndarray.save(filename, arg_dict)
-
- def save(self, save_path):
- """Save model
-
- Parameters
- ----------
- save_path : str
- path to model file
- """
- self.save_parameters(save_path)
-
- def load(self, load_path):
- """Load model
-
- Parameters
- ----------
- load_path : str
- path to model file
- """
- self.load_parameters(load_path, allow_missing=True)
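The heart of the model above is the biaffine scorer: the arc score for each (head, dependent) pair is a bilinear product of the two MLP outputs, with a bias term on the dependent side. A plain-NumPy sketch of that idea (shapes only, with made-up sizes; not the model's exact, batched code path):

```python
import numpy as np

seq_len, d = 5, 4                      # toy sequence length and feature size
dep = np.random.randn(seq_len, d)      # dep_arc features, one row per token
head = np.random.randn(seq_len, d)     # head_arc features
W = np.random.randn(d, d + 1)          # extra column for the bias on the dep side
dep_b = np.concatenate([dep, np.ones((seq_len, 1))], axis=1)
scores = head @ W @ dep_b.T            # seq_len x seq_len arc scores
print(scores.shape)                    # (5, 5): score of head i for dependent j
```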
diff --git a/scripts/parsing/parser/dep_parser.py b/scripts/parsing/parser/dep_parser.py
deleted file mode 100644
index f9f1b9fa9b..0000000000
--- a/scripts/parsing/parser/dep_parser.py
+++ /dev/null
@@ -1,310 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Deep Biaffine Dependency Parser driver class and script."""
-
-import math
-import os
-import numpy as np
-
-import mxnet as mx
-from mxnet import gluon, autograd
-
-from scripts.parsing.common.config import _Config
-from scripts.parsing.common.data import ParserVocabulary, DataLoader, ConllWord, ConllSentence
-from scripts.parsing.common.exponential_scheduler import ExponentialScheduler
-from scripts.parsing.common.utils import init_logger, mxnet_prefer_gpu, Progbar
-from scripts.parsing.parser.biaffine_parser import BiaffineParser
-from scripts.parsing.parser.evaluate import evaluate_official_script
-
-
-class DepParser:
- """User interfaces for biaffine dependency parser.
-
- It wraps a biaffine model inside, provides training, evaluating and parsing.
- """
-
- def __init__(self):
- super().__init__()
- self._parser = None
- self._vocab = None
-
- def train(self, train_file, dev_file, test_file, save_dir,
- pretrained_embeddings=None, min_occur_count=2,
- lstm_layers=3, word_dims=100, tag_dims=100, dropout_emb=0.33, lstm_hiddens=400,
- dropout_lstm_input=0.33, dropout_lstm_hidden=0.33,
- mlp_arc_size=500, mlp_rel_size=100,
- dropout_mlp=0.33, learning_rate=2e-3, decay=.75, decay_steps=5000,
- beta_1=.9, beta_2=.9, epsilon=1e-12,
- num_buckets_train=40,
- num_buckets_valid=10, num_buckets_test=10, train_iters=50000, train_batch_size=5000,
- test_batch_size=5000, validate_every=100, save_after=5000, debug=False):
- """Train a deep biaffine dependency parser.
-
- Parameters
- ----------
- train_file : str
- path to training set
- dev_file : str
- path to dev set
- test_file : str
- path to test set
- save_dir : str
- a directory for saving model and related meta-data
- pretrained_embeddings : tuple
- (embedding_name, source), used for gluonnlp.embedding.create(embedding_name, source)
- min_occur_count : int
- threshold of rare words, which will be replaced with UNKs,
- lstm_layers : int
- layers of lstm
- word_dims : int
- dimension of word embedding
- tag_dims : int
- dimension of tag embedding
- dropout_emb : float
- word dropout
- lstm_hiddens : int
- size of lstm hidden states
- dropout_lstm_input : int
- dropout on x in variational RNN
- dropout_lstm_hidden : int
- dropout on h in variational RNN
- mlp_arc_size : int
- output size of MLP for arc feature extraction
- mlp_rel_size : int
- output size of MLP for rel feature extraction
- dropout_mlp : float
- dropout on the output of LSTM
- learning_rate : float
- learning rate
- decay : float
- see ExponentialScheduler
- decay_steps : int
- see ExponentialScheduler
- beta_1 : float
- see ExponentialScheduler
- beta_2 : float
- see ExponentialScheduler
- epsilon : float
- see ExponentialScheduler
- num_buckets_train : int
- number of buckets for training data set
- num_buckets_valid : int
- number of buckets for dev data set
- num_buckets_test : int
- number of buckets for testing data set
- train_iters : int
- training iterations
- train_batch_size : int
- training batch size
- test_batch_size : int
- test batch size
- validate_every : int
- validate on dev set every such number of batches
- save_after : int
- skip saving model in early epochs
- debug : bool
- debug mode
-
- Returns
- -------
- DepParser
- parser itself
- """
- logger = init_logger(save_dir)
- config = _Config(train_file, dev_file, test_file, save_dir, pretrained_embeddings,
- min_occur_count,
- lstm_layers, word_dims, tag_dims, dropout_emb, lstm_hiddens,
- dropout_lstm_input, dropout_lstm_hidden, mlp_arc_size, mlp_rel_size,
- dropout_mlp, learning_rate, decay, decay_steps,
- beta_1, beta_2, epsilon, num_buckets_train, num_buckets_valid,
- num_buckets_test, train_iters,
- train_batch_size, debug)
- config.save()
- self._vocab = vocab = ParserVocabulary(train_file,
- pretrained_embeddings,
- min_occur_count)
- vocab.save(config.save_vocab_path)
- vocab.log_info(logger)
-
- with mx.Context(mxnet_prefer_gpu()):
- self._parser = parser = BiaffineParser(vocab, word_dims, tag_dims,
- dropout_emb,
- lstm_layers,
- lstm_hiddens, dropout_lstm_input,
- dropout_lstm_hidden,
- mlp_arc_size,
- mlp_rel_size, dropout_mlp, debug)
- parser.initialize()
- scheduler = ExponentialScheduler(learning_rate, decay, decay_steps)
- optimizer = mx.optimizer.Adam(learning_rate, beta_1, beta_2, epsilon,
- lr_scheduler=scheduler)
- trainer = gluon.Trainer(parser.collect_params(), optimizer=optimizer)
- data_loader = DataLoader(train_file, num_buckets_train, vocab)
- global_step = 0
- best_UAS = 0.
- batch_id = 0
- epoch = 1
- total_epoch = math.ceil(train_iters / validate_every)
- logger.info('Epoch %d out of %d', epoch, total_epoch)
- bar = Progbar(target=min(validate_every, data_loader.samples))
- while global_step < train_iters:
- for words, tags, arcs, rels in data_loader.get_batches(batch_size=train_batch_size,
- shuffle=True):
- with autograd.record():
- arc_accuracy, _, _, loss = parser.forward(words, tags, arcs, rels)
- loss_value = loss.asscalar()
- loss.backward()
- trainer.step(train_batch_size)
- batch_id += 1
- try:
- bar.update(batch_id,
- exact=[('UAS', arc_accuracy, 2),
- ('loss', loss_value)])
- except OverflowError:
- pass # sometimes loss can be 0 or infinity, crashes the bar
-
- global_step += 1
- if global_step % validate_every == 0:
- bar = Progbar(target=min(validate_every, train_iters - global_step))
- batch_id = 0
- UAS, LAS, speed = evaluate_official_script(parser, vocab,
- num_buckets_valid,
- test_batch_size,
- dev_file,
- os.path.join(save_dir,
- 'valid_tmp'))
- logger.info('Dev: UAS %.2f%% LAS %.2f%% %d sents/s', UAS, LAS, speed)
- epoch += 1
- if global_step < train_iters:
- logger.info('Epoch %d out of %d', epoch, total_epoch)
- if global_step > save_after and UAS > best_UAS:
- logger.info('- new best score!')
- best_UAS = UAS
- parser.save(config.save_model_path)
-
- # When validate_every is too big
- if not os.path.isfile(config.save_model_path) or best_UAS != UAS:
- parser.save(config.save_model_path)
-
- return self
-
- def load(self, path):
- """Load from disk
-
- Parameters
- ----------
- path : str
- path to the directory which typically contains a config.pkl file and a model.bin file
-
- Returns
- -------
- DepParser
- parser itself
- """
- config = _Config.load(os.path.join(path, 'config.pkl'))
- config.save_dir = path # redirect root path to what user specified
- self._vocab = vocab = ParserVocabulary.load(config.save_vocab_path)
- with mx.Context(mxnet_prefer_gpu()):
- self._parser = BiaffineParser(vocab, config.word_dims, config.tag_dims,
- config.dropout_emb,
- config.lstm_layers,
- config.lstm_hiddens, config.dropout_lstm_input,
- config.dropout_lstm_hidden,
- config.mlp_arc_size, config.mlp_rel_size,
- config.dropout_mlp, config.debug)
- self._parser.load(config.save_model_path)
- return self
-
- def evaluate(self, test_file, save_dir=None, logger=None,
- num_buckets_test=10, test_batch_size=5000):
- """Run evaluation on test set
-
- Parameters
- ----------
- test_file : str
- path to test set
- save_dir : str
- where to store intermediate results and log
- logger : logging.logger
- logger for printing results
- num_buckets_test : int
- number of clusters for sentences from test set
- test_batch_size : int
- batch size of test set
-
- Returns
- -------
- tuple
- UAS, LAS
- """
- parser = self._parser
- vocab = self._vocab
- with mx.Context(mxnet_prefer_gpu()):
- UAS, LAS, speed = evaluate_official_script(parser, vocab, num_buckets_test,
- test_batch_size, test_file,
- os.path.join(save_dir, 'valid_tmp'))
- if logger is None:
- logger = init_logger(save_dir, 'test.log')
- logger.info('Test: UAS %.2f%% LAS %.2f%% %d sents/s', UAS, LAS, speed)
-
- return UAS, LAS
-
- def parse(self, sentence):
- """Parse raw sentence into ConllSentence
-
- Parameters
- ----------
- sentence : list
- a list of (word, tag) tuples
-
- Returns
- -------
- ConllSentence
- ConllSentence object
- """
- words = np.zeros((len(sentence) + 1, 1), np.int32)
- tags = np.zeros((len(sentence) + 1, 1), np.int32)
- words[0, 0] = ParserVocabulary.ROOT
- tags[0, 0] = ParserVocabulary.ROOT
- vocab = self._vocab
-
- for i, (word, tag) in enumerate(sentence):
- words[i + 1, 0], tags[i + 1, 0] = vocab.word2id(word.lower()), vocab.tag2id(tag)
-
- with mx.Context(mxnet_prefer_gpu()):
- outputs = self._parser.forward(words, tags)
- words = []
- for arc, rel, (word, tag) in zip(outputs[0][0], outputs[0][1], sentence):
- words.append(ConllWord(idx=len(words) + 1, form=word, pos=tag,
- head=arc, relation=vocab.id2rel(rel)))
- return ConllSentence(words)
-
-
-if __name__ == '__main__':
- dep_parser = DepParser()
- dep_parser.train(train_file='tests/data/biaffine/ptb/train.conllx',
- dev_file='tests/data/biaffine/ptb/dev.conllx',
- test_file='tests/data/biaffine/ptb/test.conllx',
- save_dir='tests/data/biaffine/model',
- pretrained_embeddings=('glove', 'glove.6B.100d'))
- dep_parser.load('tests/data/biaffine/model')
- dep_parser.evaluate(test_file='tests/data/biaffine/ptb/test.conllx',
- save_dir='tests/data/biaffine/model')
-
- sent = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'),
- ('of', 'IN'), ('chamber', 'NN'), ('music', 'NN'), ('?', '.')]
- print(dep_parser.parse(sent))
diff --git a/scripts/parsing/parser/evaluate/__init__.py b/scripts/parsing/parser/evaluate/__init__.py
deleted file mode 100644
index ee3f45778c..0000000000
--- a/scripts/parsing/parser/evaluate/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Official evaluation for parsing."""
-
-from .evaluate import evaluate_official_script
diff --git a/scripts/parsing/parser/evaluate/evaluate.py b/scripts/parsing/parser/evaluate/evaluate.py
deleted file mode 100644
index 4622919d79..0000000000
--- a/scripts/parsing/parser/evaluate/evaluate.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Evaluation module for parsing results."""
-
-import time
-from functools import reduce
-import numpy as np
-import gluonnlp as nlp
-
-from scripts.parsing.common.data import DataLoader
-
-nlp.utils.check_version('0.7.0')
-
-def evaluate_official_script(parser, vocab, num_buckets_test, test_batch_size,
- test_file, output_file, debug=False):
- """Evaluate parser on a data set
-
- Parameters
- ----------
- parser : BiaffineParser
- biaffine parser
- vocab : ParserVocabulary
- vocabulary built from data set
- num_buckets_test : int
- size of buckets (cluster sentences into this number of clusters)
- test_batch_size : int
- batch size
- test_file : str
- gold test file
- output_file : str
- output result to this file
- debug : bool
- only evaluate first 1000 sentences for debugging
-
- Returns
- -------
- tuple
- UAS, LAS, speed
- """
- data_loader = DataLoader(test_file, num_buckets_test, vocab)
- record = data_loader.idx_sequence
- results = [None] * len(record)
- idx = 0
- seconds = time.time()
- uc, lc, total = 0, 0, 0
- for words, tags, arcs, rels in data_loader.get_batches(batch_size=test_batch_size,
- shuffle=False):
- outputs = parser.forward(words, tags)
- for output, gold_arc, gold_rel in zip(
- outputs, arcs.transpose([1, 0]), rels.transpose([1, 0])):
- pred_arc = output[0]
- pred_rel = output[1]
- length = pred_arc.shape[0]
- gold_arc = gold_arc[1:length + 1]
- gold_rel = gold_rel[1:length + 1]
-
- arc_mask = np.equal(pred_arc, gold_arc)
- uc += np.sum(arc_mask)
- total += length
-
- lc += np.sum(np.equal(pred_rel, gold_rel) * arc_mask)
- sent_idx = record[idx]
- results[sent_idx] = output
- idx += 1
- speed = len(record) / seconds
- UAS = uc / total * 100
- LAS = lc / total * 100
- if output_file:
- arcs = reduce(lambda x, y: x + y, [list(result[0]) for result in results])
- rels = reduce(lambda x, y: x + y, [list(result[1]) for result in results])
- idx = 0
- with open(test_file) as f:
- if debug:
- f = f.readlines()[:1000]
- with open(output_file, 'w') as fo:
- for line in f:
- info = line.strip().split()
- if info:
- arc_offset = 5
- rel_offset = 6
- if len(info) == 10: # conll or conllx
- arc_offset = 6
- rel_offset = 7
- # assert len(info) == 10, 'Illegal line: %s' % line
- info[arc_offset] = str(arcs[idx])
- info[rel_offset] = vocab.id2rel(rels[idx])
- fo.write('\t'.join(info) + '\n')
- idx += 1
- else:
- fo.write('\n')
- return UAS, LAS, speed
-
-
-def prf(correct, pred_sum, gold_sum):
- """
- Calculate precision, recall and f1 score
- Parameters
- ----------
- correct : int
- number of correct predictions
- pred_sum : int
- number of predictions
- gold_sum : int
- number of gold answers
- Returns
- -------
- tuple
- (p, r, f)
- """
- if pred_sum:
- p = correct / pred_sum
- else:
- p = 0
- if gold_sum:
- r = correct / gold_sum
- else:
- r = 0
- if p + r:
- f = 2 * p * r / (p + r)
- else:
- f = 0
- return p, r, f
diff --git a/scripts/preprocess/README.md b/scripts/preprocess/README.md
new file mode 100644
index 0000000000..bbecc9ca27
--- /dev/null
+++ b/scripts/preprocess/README.md
@@ -0,0 +1,20 @@
+# Data Preprocessing Toolkit in GluonNLP
+
+## Clean and Tokenize a Parallel Corpus
+
+To clean and tokenize a parallel corpus, use
+```
+nlp_preprocess clean_tok_para_corpus --help
+```
+
+## Learn/Apply Subwords
+
+To learn a subword tokenizer, use
+```
+nlp_preprocess learn_subword --help
+```
+
+To apply the learned subword tokenizer, use
+```
+nlp_preprocess apply_subword --help
+```
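
For illustration, here is a minimal sketch of driving the two scripts programmatically through their `get_parser()`/`main()` entry points instead of the `nlp_preprocess` CLI. It assumes the repository root is on `PYTHONPATH` so that `scripts.preprocess` is importable; the corpus path, vocabulary size and output names are placeholders.

```python
# Hypothetical end-to-end sketch: learn a sentencepiece subword model from a
# tokenized corpus, then apply it to the same corpus.
from scripts.preprocess import learn_subword, apply_subword

learn_args = learn_subword.get_parser().parse_args([
    '--corpus', 'corpus.tok.en',          # placeholder: a cleaned, tokenized corpus
    '--model', 'spm',
    '--vocab-size', '5000',               # placeholder vocabulary size
    '--save-dir', 'subword_model',
])
learn_subword.main(learn_args)            # writes subword_model/spm.model and subword_model/spm.vocab

apply_args = apply_subword.get_parser().parse_args([
    '--corpus', 'corpus.tok.en',
    '--model', 'spm',
    '--model-path', 'subword_model/spm.model',
    '--vocab-path', 'subword_model/spm.vocab',
    '--save-path', 'corpus.tok.spm.en',   # placeholder output path
])
apply_subword.main(apply_args)            # writes the subword-segmented corpus
```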
diff --git a/scripts/preprocess/__init__.py b/scripts/preprocess/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/preprocess/__main__.py b/scripts/preprocess/__main__.py
new file mode 100644
index 0000000000..472d304e58
--- /dev/null
+++ b/scripts/preprocess/__main__.py
@@ -0,0 +1,49 @@
+import argparse
+import textwrap
+
+from . import (
+ clean_tok_para_corpus,
+ clean_tok_mono_corpus,
+ learn_subword,
+ apply_subword
+)
+
+
+SUBCOMMANDS = ['clean_tok_para_corpus', 'clean_tok_mono_corpus',
+ 'learn_subword', 'apply_subword', 'help']
+
+
+def cli_main():
+ parser = argparse.ArgumentParser(
+ description='Sharable data preprocessing utilities in GluonNLP.',
+ prog='nlp_preprocess', add_help=False)
+ parser.add_argument('command', type=str,
+ choices=SUBCOMMANDS,
+ metavar='[subcommand]',
+ help='The subcommand to use. '
+ 'Choices are {}.'.format(SUBCOMMANDS))
+ args, other_args = parser.parse_known_args()
+ if args.command == 'clean_tok_para_corpus':
+ parser = clean_tok_para_corpus.get_parser()
+ sub_args = parser.parse_args(other_args)
+ clean_tok_para_corpus.main(sub_args)
+ elif args.command == 'clean_tok_mono_corpus':
+ parser = clean_tok_mono_corpus.get_parser()
+ sub_args = parser.parse_args(other_args)
+ clean_tok_mono_corpus.main(sub_args)
+ elif args.command == 'learn_subword':
+ parser = learn_subword.get_parser()
+ sub_args = parser.parse_args(other_args)
+ learn_subword.main(sub_args)
+ elif args.command == 'apply_subword':
+ parser = apply_subword.get_parser()
+ sub_args = parser.parse_args(other_args)
+ apply_subword.main(sub_args)
+ elif args.command == 'help':
+ parser.print_help()
+ else:
+ parser.print_help()
+
+
+if __name__ == '__main__':
+ cli_main()
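
The dispatcher above parses only the leading subcommand with `parse_known_args()` and forwards the remaining arguments to the chosen module. A small sketch of exercising it without the console entry point, assuming the package is importable; the `help` subcommand simply prints the top-level usage.

```python
# Hypothetical: invoke the subcommand dispatcher programmatically.
import sys
from scripts.preprocess.__main__ import cli_main

sys.argv = ['nlp_preprocess', 'help']   # equivalent to running: nlp_preprocess help
cli_main()                              # prints the top-level help and returns
```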
diff --git a/scripts/preprocess/apply_subword.py b/scripts/preprocess/apply_subword.py
new file mode 100644
index 0000000000..dc4c0c974a
--- /dev/null
+++ b/scripts/preprocess/apply_subword.py
@@ -0,0 +1,176 @@
+import argparse
+import textwrap
+from multiprocessing import Pool
+import numpy as np
+import time
+from gluonnlp.data import tokenizers
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=textwrap.dedent('''
+ Encode based on different implementations.
+
+ We support the following models:
+
+ "python3 apply_subword.py --model spm" : Encode with Sentencepiece Model;
+ "python3 apply_subword.py --model subword_nmt" : Encode with the subword-nmt package;
+ "python3 apply_subword.py --model yttm" : Encode with YouTokenToMe;
+ "python3 apply_subword.py --model hf_bytebpe" : Encode with the Byte-level BPE Tokenizer Implemented by Huggingface.
+            "python3 apply_subword.py --model hf_wordpiece" : Encode with the Wordpiece Tokenizer Implemented by Huggingface.
+ "python3 apply_subword.py --model hf_bpe" : Encode with the BPE Tokenizer Implemented by Huggingface.
+ ''')
+ )
+ parser.add_argument('--corpus', type=str, nargs='+', required=True,
+ help='Path of the corpus. '
+ 'You may input multiple corpus files separated by space.')
+ parser.add_argument('--save-path', type=str, required=True,
+ help='Path of the output file')
+ parser.add_argument('--model-path', type=str, default=None,
+ help='Path of the model file')
+ parser.add_argument('--vocab-path', type=str, default=None,
+ help='Path of the vocabulary file')
+ parser.add_argument('--model', type=str, choices=['spm',
+ 'subword_nmt',
+ 'yttm',
+ 'hf_bytebpe',
+ 'hf_wordpiece',
+ 'hf_bpe'],
+ required=True, help='Subword model type')
+ parser.add_argument('--num-process', type=int, default=16,
+ help='Number of process')
+ parser.add_argument('--lowercase', action='store_true', default=False,
+ help='Use lowercase, '
+ 'only applicable to hf_bpe, hf_bytebpe and hf_wordpiece')
+ parser.add_argument('--strip-accents', action='store_true', default=False,
+ help='Disable BERT characters normalization, '
+ 'only applicable to hf_wordpiece')
+ parser.add_argument('--output-type', type=str, choices=['subword', 'id'], default='subword',
+                        help='Whether to output subwords or ids')
+ parser.add_argument('--bpe-dropout', type=float, default=None,
+ help='BPE dropout, applicable to subword_nmt, yttm, hf_bpe and hf_bytebpe')
+
+ return parser
+
+
+class ParallelCorpusApplyer:
+ def __init__(self, corpus, tokenizer_model, output_type):
+ self.chunk_size = 1024 * 1024
+ self.corpus = corpus
+ self.tokenizer_model = tokenizer_model
+ self.output_type = output_type
+
+ def chunk_iter(self, step=10):
+ for corpus_path in self.corpus:
+ line_pos = [0]
+ with open(corpus_path, 'rb') as fcb:
+ pos = 0
+ for line in fcb:
+ pos += len(line)
+ line_pos.append(pos)
+ line_pos = np.array(line_pos, dtype=np.int64)
+ line_size = line_pos[1:] - line_pos[:-1]
+ num_lines = line_pos.shape[0] - 1
+ budget = self.chunk_size
+ chunk_start = 0
+ cur_chunk_size = 0
+ for i in range(0, num_lines, step):
+ line_batch_num = min(num_lines - i, step)
+ batch_line_size = line_size[i:(i + line_batch_num)].sum()
+ budget -= batch_line_size
+ cur_chunk_size += batch_line_size
+ if budget <= 0 or i + step >= num_lines:
+ yield corpus_path, chunk_start, cur_chunk_size
+ chunk_start += cur_chunk_size
+ budget = self.chunk_size
+ cur_chunk_size = 0
+
+ def process_chunk(self, args):
+ corpus_path, chunk_start, cur_chunk_size = args
+ with open(corpus_path, 'rb') as fcb:
+ fcb.seek(chunk_start)
+ lines_byte = fcb.read(cur_chunk_size)
+ lines_byte = lines_byte.splitlines()
+ sentences = [line_byte.decode('utf-8').strip() for line_byte in lines_byte]
+ all_tokens = self.tokenizer_model.encode(sentences, self.output_type)
+ tokenized_sentences = []
+ for ele_tokens in all_tokens:
+ if self.output_type == int:
+ ele_tokens = [str(token) for token in ele_tokens]
+ tokenized_sentences.append(' '.join(ele_tokens))
+ sentence_num = len(tokenized_sentences)
+        token_num = sum(len(sentence.split(' ')) for sentence in tokenized_sentences)
+ unk = self.tokenizer_model.vocab.unk_token
+ unk_num = sum(sentence.count(unk) for sentence in tokenized_sentences)
+ return tokenized_sentences, sentence_num, token_num, unk_num
+
+
+def main(args):
+ start = time.time()
+ if args.model == 'spm':
+ tokenizer_model = tokenizers.create('spm',
+ model_path=args.model_path,
+ vocab=args.vocab_path)
+ elif args.model == 'subword_nmt':
+ tokenizer_model = tokenizers.create('subword_nmt',
+ codec_path=args.model_path,
+ vocab_path=args.vocab_path,
+ bpe_dropout=args.bpe_dropout)
+ elif args.model == 'yttm':
+ args.bpe_dropout = 0.0 if not args.bpe_dropout else args.bpe_dropout
+ tokenizer_model = tokenizers.create('yttm',
+ model_path=args.model_path,
+ bpe_dropout=args.bpe_dropout,
+ n_threads=1)
+ elif args.model == 'hf_bytebpe':
+ tokenizer_model = tokenizers.create('hf_bytebpe',
+ merges_file=args.model_path,
+ vocab_file=args.vocab_path,
+ dropout=args.bpe_dropout,
+ lowercase=args.lowercase)
+ elif args.model == 'hf_wordpiece':
+ tokenizer_model = tokenizers.create('hf_wordpiece',
+ vocab_file=args.vocab_path,
+ lowercase=args.lowercase,
+ strip_accents=args.strip_accents)
+ elif args.model == 'hf_bpe':
+ tokenizer_model = tokenizers.create('hf_bpe',
+ merges_file=args.model_path,
+ vocab_file=args.vocab_path,
+ dropout=args.bpe_dropout,
+ lowercase=args.lowercase)
+ else:
+ raise NotImplementedError
+    print('Applying {} to {}'.format(tokenizer_model.__class__.__name__,
+ ', '.join(args.corpus)))
+ output_type = {'subword': str, 'id': int}[args.output_type]
+ applyer = ParallelCorpusApplyer(args.corpus, tokenizer_model, output_type)
+ with open(args.save_path, 'w', encoding='utf-8', newline='\n') as fo:
+ with Pool(args.num_process) as pool:
+ sentence_count = token_count = unk_count = 0
+ for i, (tokenized_sentences, sentence_num, token_num, unk_num) in \
+ enumerate(pool.imap(applyer.process_chunk, applyer.chunk_iter())):
+ fo.write('\n'.join(tokenized_sentences))
+ fo.write('\n')
+ sentence_count += sentence_num
+ token_count += token_num
+ unk_count += unk_num
+ if (i + 1) % 100 == 0:
+                    print('Chunk {}, #Lines processed: {}'
+ .format(i + 1, sentence_count))
+ end = time.time()
+    print('Done, #Lines processed {}, Avg tokens per sentence {:.1f}, '
+          'Unknown rate {:.1f}%, Time spent {}'
+ .format(sentence_count, token_count / sentence_count,
+ unk_count * 100 / token_count, end - start))
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
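
`ParallelCorpusApplyer.chunk_iter` (and the `clean_tok_*` scripts that follow) splits work by byte offsets so that each worker can `seek()` into the file and read its chunk independently. Below is a self-contained sketch of that idea, not part of the patch; `corpus.txt` is a hypothetical input file with at least three lines.

```python
# Minimal sketch of byte-offset chunking for multiprocessing-friendly file reads.
import numpy as np

def line_byte_offsets(path):
    """Cumulative byte offset of each line start, with one trailing entry for EOF."""
    offsets = [0]
    with open(path, 'rb') as f:
        for line in f:
            offsets.append(offsets[-1] + len(line))
    return np.array(offsets, dtype=np.int64)

def read_chunk(path, start, size):
    """Seek to `start`, read `size` bytes and decode them into text lines."""
    with open(path, 'rb') as f:
        f.seek(start)
        return [b.decode('utf-8') for b in f.read(size).splitlines()]

offsets = line_byte_offsets('corpus.txt')   # hypothetical file
# Read the first three lines as a single chunk, the way a worker would.
first_three = read_chunk('corpus.txt', int(offsets[0]), int(offsets[3] - offsets[0]))
```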
diff --git a/scripts/preprocess/clean_tok_mono_corpus.py b/scripts/preprocess/clean_tok_mono_corpus.py
new file mode 100644
index 0000000000..79416b4798
--- /dev/null
+++ b/scripts/preprocess/clean_tok_mono_corpus.py
@@ -0,0 +1,252 @@
+import argparse
+import os
+import multiprocessing
+import time
+import numpy as np
+import warnings
+import re
+from gluonnlp.data.filtering import MosesNormalizer
+from gluonnlp.data.tokenizers import MosesTokenizer, BaseTokenizer,\
+ WhitespaceTokenizer, JiebaTokenizer
+from typing import List, Union, Optional
+re._MAXCACHE = 1024
+
+
+def get_tokenizer(tokenizer, lang=None):
+ if isinstance(tokenizer, BaseTokenizer):
+ return tokenizer
+ else:
+ if tokenizer == 'moses':
+ return MosesTokenizer(lang=lang)
+ elif tokenizer == 'whitespace':
+ return WhitespaceTokenizer()
+ elif tokenizer == 'jieba':
+ return JiebaTokenizer()
+ else:
+ raise NotImplementedError
+
+
+# TODO(sxjscience) Consider whether to
+def check_latin1(sentence: str) -> bool:
+ """Check whether the sentence can be encoded in latin1
+
+ This is used in
+ https://github.com/mlperf/training/blob/master/rnn_translator/pytorch/scripts/filter_dataset.py
+
+ The idea is to filter the sentences with rare unicode glyphs
+
+ Returns
+ -------
+ ret
+ Whether sentences are latin1
+ """
+ try:
+ sentence.encode('latin1')
+ except UnicodeEncodeError:
+ return False
+ else:
+ return True
+
+
+def get_line_byte_start(corpus_path: str) -> np.ndarray:
+    """Get the start position of each line in terms of bytes so that we can use seek + read to
+ load an arbitrary line.
+
+ Parameters
+ ----------
+ corpus_path
+ The path of the corpus
+
+ Returns
+ -------
+ line_pos
+        Shape (#Lines + 1,)
+ """
+ line_pos = [0]
+ with open(corpus_path, 'rb') as in_f:
+ pos = 0
+ for line in in_f:
+ pos += len(line)
+ line_pos.append(pos)
+ return np.array(line_pos, dtype=np.int64)
+
+
+class MonoCorpusProcessor:
+    """Process the sentences of a monolingual corpus.
+
+ This largely recovers the functionality of 'clean-corpus-n.perl' in mosesdecoder.
+ The difference is that it is customizable with pure python.
+
+ By default, we will perform the following pre-processing pipeline.
+ Each stage could be turned on/off and specialized based on the input arguments.
+ Also, you may directly revise the code and write your own processing script.
+
+ 1. Normalize sentence
+ 2. Pre-filter
+ 3. Tokenize the sentence
+ 4. Filter the sentence based on different rules
+        4.1 Remove sentences that are empty after tokenization
+        4.2 Remove sentences where not `min_num_words <= len(sentence) <= max_num_words`
+ """
+ def __init__(self, lang: str,
+ normalize: bool = True,
+ tokenizer: Union[str, BaseTokenizer] = 'whitespace',
+ min_num_words: Optional[int] = None,
+ max_num_words: Optional[int] = None,
+ discard_non_latin1: bool = False):
+ self._lang = lang
+ if normalize:
+ self._normalizer = MosesNormalizer(lang=lang)
+ self._tokenizer = get_tokenizer(tokenizer, lang)
+ self._min_num_words = min_num_words
+ self._max_num_words = max_num_words
+ self._discard_non_latin1 = discard_non_latin1
+
+ def process_chunk(self, args):
+ path, chunk_start, chunk_size = args
+ processed_lines = []
+ with open(path, 'rb') as in_f:
+ # Read chunk
+ in_f.seek(chunk_start)
+ lines = in_f.read(chunk_size)
+ lines = lines.splitlines()
+ unfiltered_line_num = len(lines)
+ for line in lines:
+ line = line.decode('utf-8').strip()
+ # 1. Normalize
+ line = self._normalizer(line)
+ # 2. Filter after normalization.
+ if self._discard_non_latin1:
+ if not check_latin1(line):
+ continue
+ # 3. Tokenize the sentence
+ tokens = self._tokenizer.encode(line)
+ # 4. Filter after tokenization. Filter with multiple rules
+ if len(tokens) == 0:
+ continue
+ if self._max_num_words is not None:
+ if len(tokens) > self._max_num_words:
+ continue
+ if self._min_num_words is not None:
+ if len(tokens) < self._min_num_words:
+ continue
+ processed_lines.append(' '.join(tokens))
+ return processed_lines, unfiltered_line_num
+
+ def process_mono_corpus(self,
+ corpus_paths: List[str],
+ out_path: str,
+ chunk_size: int = 1024 * 1024,
+ num_process: int = 8) -> int:
+ """Preprocess the mono corpus
+
+ Parameters
+ ----------
+ corpus_paths
+ Corpus paths
+ out_path
+ Write the results to the output path
+ chunk_size
+ Approximately split the corpus files into multiple chunks
+ num_process
+ The number of process
+
+ Returns
+ -------
+ line_count
+ The number of lines in the final filtered file
+ """
+ start = time.time()
+ total_line_count = 0
+ filtered_line_count = 0
+
+ def chunk_iterator(step=10):
+ for path in corpus_paths:
+ line_pos = get_line_byte_start(path)
+ line_size = line_pos[1:] - line_pos[:-1]
+ num_lines = line_pos.shape[0] - 1
+ budget = chunk_size
+ chunk_start = 0
+ cur_chunk_size = 0
+ for i in range(0, num_lines, step):
+ line_batch_num = min(num_lines - i, step)
+ batch_line_size = line_size[i:(i + line_batch_num)].sum()
+ budget -= batch_line_size
+ cur_chunk_size += batch_line_size
+ if budget <= 0 or i + step >= num_lines:
+ yield path, chunk_start, cur_chunk_size
+ chunk_start += cur_chunk_size
+ cur_chunk_size = 0
+ budget = chunk_size
+
+ with open(out_path, 'w', encoding='utf-8', newline='\n') as out_f:
+ with multiprocessing.Pool(num_process) as pool:
+ for i, (processed_lines, unfiltered_line_num) in \
+ enumerate(pool.imap(self.process_chunk, chunk_iterator())):
+ out_f.write('\n'.join(processed_lines) + '\n')
+ filtered_line_count += len(processed_lines)
+ total_line_count += unfiltered_line_num
+ if (i + 1) % 100 == 0:
+ print('Chunk {}, #Lines Processed: {}, Filtered: {}, Remain: {}'
+ .format(i + 1, total_line_count,
+ total_line_count - filtered_line_count,
+ filtered_line_count))
+ end = time.time()
+ print('Done, #Lines {}/{}, Time spent {}'.format(filtered_line_count,
+ total_line_count,
+ end - start))
+ return filtered_line_count
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ description='Clean mono corpus used in machine translation.')
+ parser.add_argument('--corpus', type=str, nargs='+', required=True)
+ parser.add_argument('--lang', type=str, required=True)
+ parser.add_argument('--save-path', type=str, default=None,
+ help='Path to save the cleaned and tokenized corpus. If not set, '
+ 'the default is "corpus.tok.{lang}"')
+ parser.add_argument('--tokenizer', type=str, default='moses')
+ parser.add_argument('--min-num-words', type=int, default=None)
+ parser.add_argument('--max-num-words', type=int, default=None)
+ parser.add_argument('--discard-non-latin1', action='store_true',
+                        help='Whether to discard sentences that cannot be '
+                             'encoded into latin1.')
+ parser.add_argument('--num-process', type=int, default=8,
+ help='number of process')
+ parser.add_argument('--overwrite', action='store_true')
+
+ return parser
+
+
+def main(args):
+ corpus_processor = MonoCorpusProcessor(lang=args.lang,
+ tokenizer=args.tokenizer,
+ min_num_words=args.min_num_words,
+ max_num_words=args.max_num_words,
+ discard_non_latin1=args.discard_non_latin1)
+ print('Clean the mono corpus:')
+ print(' {}: {}'.format(args.lang, args.corpus))
+ if args.save_path is None:
+ save_path = 'corpus.tok.{}'.format(args.lang)
+ else:
+ save_path = args.save_path
+ print('Save to {} -> {} \n'.format(args.lang, save_path))
+ if os.path.exists(save_path) and not args.overwrite:
+        warnings.warn('{} exists, skip. If you need to overwrite this file, '
+                      'rerun the script with --overwrite.'.format(save_path))
+ else:
+ corpus_processor.process_mono_corpus(
+ corpus_paths=args.corpus,
+ out_path=save_path,
+ num_process=args.num_process)
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
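
A hedged usage sketch of `MonoCorpusProcessor`, mirroring what `main()` does when the script is invoked via the CLI. The import assumes the repository root is on `PYTHONPATH`; file names and thresholds are placeholders, not a prescribed configuration.

```python
from scripts.preprocess.clean_tok_mono_corpus import MonoCorpusProcessor

processor = MonoCorpusProcessor(lang='en',
                                tokenizer='moses',
                                min_num_words=1,
                                max_num_words=250,           # placeholder length cap
                                discard_non_latin1=False)
kept = processor.process_mono_corpus(corpus_paths=['raw.en'],  # placeholder input file
                                     out_path='corpus.tok.en',
                                     num_process=4)
print('Lines kept after cleaning:', kept)
```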
diff --git a/scripts/preprocess/clean_tok_para_corpus.py b/scripts/preprocess/clean_tok_para_corpus.py
new file mode 100644
index 0000000000..dd4afcb4a6
--- /dev/null
+++ b/scripts/preprocess/clean_tok_para_corpus.py
@@ -0,0 +1,314 @@
+import argparse
+import os
+import multiprocessing
+import time
+import numpy as np
+import warnings
+import re
+from gluonnlp.data.filtering import MosesNormalizer
+from gluonnlp.data.tokenizers import MosesTokenizer, BaseTokenizer,\
+ WhitespaceTokenizer, JiebaTokenizer
+from typing import List, Union, Optional
+re._MAXCACHE = 1024
+
+
+def get_tokenizer(tokenizer, lang=None):
+ if isinstance(tokenizer, BaseTokenizer):
+ return tokenizer
+ else:
+ if tokenizer == 'moses':
+ return MosesTokenizer(lang=lang)
+ elif tokenizer == 'whitespace':
+ return WhitespaceTokenizer()
+ elif tokenizer == 'jieba':
+ return JiebaTokenizer()
+ else:
+ raise NotImplementedError
+
+
+# TODO(sxjscience) Consider whether to
+def check_both_latin1(src_sentence: str, tgt_sentence: str) -> bool:
+ """Check whether the sentence pair can all be encoded in latin1
+
+ This is used in
+ https://github.com/mlperf/training/blob/master/rnn_translator/pytorch/scripts/filter_dataset.py
+
+    The idea is to filter out sentences with rare unicode glyphs, which are unlikely to be en-de
+
+ Returns
+ -------
+ ret
+ Whether both sentences are latin1
+ """
+ try:
+ src_sentence.encode('latin1')
+ tgt_sentence.encode('latin1')
+ except UnicodeEncodeError:
+ return False
+ else:
+ return True
+
+
+def get_line_byte_start(corpus_path: str) -> np.ndarray:
+    """Get the start position of each line in terms of bytes so that we can use seek + read to
+ load an arbitrary line.
+
+ Parameters
+ ----------
+ corpus_path
+ The path of the corpus
+
+ Returns
+ -------
+ line_pos
+        Shape (#Lines + 1,)
+ """
+ line_pos = [0]
+ with open(corpus_path, 'rb') as in_f:
+ pos = 0
+ for line in in_f:
+ pos += len(line)
+ line_pos.append(pos)
+ return np.array(line_pos, dtype=np.int64)
+
+
+class ParallelCorpusProcessor:
+    """Process a pair of parallel corpora.
+
+ This largely recovers the functionality of 'clean-corpus-n.perl' in mosesdecoder.
+ The difference is that it is customizable with pure python.
+
+ By default, we will perform the following pre-processing pipeline.
+ Each stage could be turned on/off and specialized based on the input arguments.
+ Also, you may directly revise the code and write your own processing script.
+
+ 1. Normalize sentence
+ 2. Pre-filter
+ 3. Tokenize the sentence
+    4. Filter the sentence pairs based on different rules
+        4.1 Remove pairs where `max(len(lhs) / len(rhs), len(rhs) / len(lhs)) > max_ratio`
+        4.2 Remove pairs where not `min_num_words <= len(lhs) <= max_num_words` and
+            `min_num_words <= len(rhs) <= max_num_words`
+ """
+ def __init__(self, src_lang: str, tgt_lang: str,
+ normalize: bool = True,
+ src_tokenizer: Union[str, BaseTokenizer] = 'whitespace',
+ tgt_tokenizer: Union[str, BaseTokenizer] = 'whitespace',
+ max_ratio: Optional[float] = None,
+ min_num_words: Optional[int] = None,
+ max_num_words: Optional[int] = None,
+ discard_non_latin1: bool = False):
+ self._src_lang = src_lang
+ self._tgt_lang = tgt_lang
+ if normalize:
+ self._src_normalizer = MosesNormalizer(lang=src_lang)
+ self._tgt_normalizer = MosesNormalizer(lang=tgt_lang)
+ self._src_tokenizer = get_tokenizer(src_tokenizer, src_lang)
+ self._tgt_tokenizer = get_tokenizer(tgt_tokenizer, tgt_lang)
+ self._max_ratio = max_ratio
+ self._min_num_words = min_num_words
+ self._max_num_words = max_num_words
+ self._discard_non_latin1 = discard_non_latin1
+
+ def process_chunk(self, args):
+ src_path, src_chunk_start, src_chunk_size, tgt_path, tgt_chunk_start, tgt_chunk_size = args
+ processed_src_lines = []
+ processed_tgt_lines = []
+ with open(src_path, 'rb') as src_in_f:
+ with open(tgt_path, 'rb') as tgt_in_f:
+ # Read chunk from source and target
+ src_in_f.seek(src_chunk_start)
+ src_lines = src_in_f.read(src_chunk_size)
+ tgt_in_f.seek(tgt_chunk_start)
+ tgt_lines = tgt_in_f.read(tgt_chunk_size)
+ src_lines = src_lines.splitlines()
+ tgt_lines = tgt_lines.splitlines()
+ unfiltered_line_num = len(src_lines)
+ for src_line, tgt_line in zip(src_lines, tgt_lines):
+ src_line = src_line.decode('utf-8').strip()
+ tgt_line = tgt_line.decode('utf-8').strip()
+ # 1. Normalize
+ src_line = self._src_normalizer(src_line)
+ tgt_line = self._tgt_normalizer(tgt_line)
+ # 2. Filter after normalization.
+ if self._discard_non_latin1:
+ if not check_both_latin1(src_line, tgt_line):
+ continue
+ # 3. Tokenize the sentence
+ src_tokens = self._src_tokenizer.encode(src_line)
+ tgt_tokens = self._tgt_tokenizer.encode(tgt_line)
+ # 4. Filter after tokenization. Filter with multiple rules
+ if len(src_tokens) == 0 or len(tgt_tokens) == 0:
+ continue
+ if self._max_ratio is not None:
+ if max(len(src_tokens) / len(tgt_tokens),
+ len(tgt_tokens) / len(src_tokens)) > self._max_ratio:
+ continue
+ if self._max_num_words is not None:
+ if len(src_tokens) > self._max_num_words or\
+ len(tgt_tokens) > self._max_num_words:
+ continue
+ if self._min_num_words is not None:
+ if len(src_tokens) < self._min_num_words\
+ or len(tgt_tokens) < self._min_num_words:
+ continue
+ processed_src_lines.append(' '.join(src_tokens))
+ processed_tgt_lines.append(' '.join(tgt_tokens))
+ return processed_src_lines, processed_tgt_lines, unfiltered_line_num
+
+ def process_parallel_corpus(self, src_corpus_paths: List[str],
+ tgt_corpus_paths: List[str],
+ src_out_path: str, tgt_out_path: str,
+ chunk_size: int = 1024 * 1024,
+ num_process: int = 8) -> int:
+ """Preprocess the parallel corpus
+
+ Parameters
+ ----------
+ src_corpus_paths
+ Source corpus paths
+ tgt_corpus_paths
+ Target corpus paths
+ src_out_path
+ Write the results to the source output path
+ tgt_out_path
+ Write the results to the target output path
+ chunk_size
+ Approximately split the corpus files into multiple chunks
+ num_process
+ The number of process
+
+ Returns
+ -------
+ line_count
+ The number of lines in the final filtered file
+ """
+ start = time.time()
+ total_line_count = 0
+ filtered_line_count = 0
+
+ def chunk_iterator(step=10):
+ for src_path, tgt_path in zip(src_corpus_paths, tgt_corpus_paths):
+ src_line_pos = get_line_byte_start(src_path)
+ tgt_line_pos = get_line_byte_start(tgt_path)
+ src_line_size = src_line_pos[1:] - src_line_pos[:-1]
+ tgt_line_size = tgt_line_pos[1:] - tgt_line_pos[:-1]
+ num_src_lines = src_line_pos.shape[0] - 1
+ num_tgt_lines = tgt_line_pos.shape[0] - 1
+ assert num_src_lines == num_tgt_lines
+ src_budget = chunk_size
+ tgt_budget = chunk_size
+ src_chunk_start = 0
+ tgt_chunk_start = 0
+ src_chunk_size = 0
+ tgt_chunk_size = 0
+ for i in range(0, num_src_lines, step):
+ line_batch_num = min(num_src_lines - i, step)
+ src_batch_line_size = src_line_size[i:(i + line_batch_num)].sum()
+ tgt_batch_line_size = tgt_line_size[i:(i + line_batch_num)].sum()
+ src_budget -= src_batch_line_size
+ tgt_budget -= tgt_batch_line_size
+ src_chunk_size += src_batch_line_size
+ tgt_chunk_size += tgt_batch_line_size
+ if src_budget <= 0 or tgt_budget <= 0 or i + step >= num_src_lines:
+ yield src_path, src_chunk_start, src_chunk_size,\
+ tgt_path, tgt_chunk_start, tgt_chunk_size
+ src_chunk_start += src_chunk_size
+ tgt_chunk_start += tgt_chunk_size
+ src_chunk_size = 0
+ tgt_chunk_size = 0
+ src_budget = chunk_size
+ tgt_budget = chunk_size
+
+ with open(src_out_path, 'w', encoding='utf-8', newline='\n') as src_out_f:
+ with open(tgt_out_path, 'w', encoding='utf-8', newline='\n') as tgt_out_f:
+ with multiprocessing.Pool(num_process) as pool:
+ for i, (processed_src_lines, processed_tgt_lines, unfiltered_line_num) in \
+ enumerate(pool.imap(self.process_chunk, chunk_iterator())):
+ src_out_f.write('\n'.join(processed_src_lines) + '\n')
+ tgt_out_f.write('\n'.join(processed_tgt_lines) + '\n')
+ filtered_line_count += len(processed_src_lines)
+ total_line_count += unfiltered_line_num
+ if (i + 1) % 100 == 0:
+ print('Chunk {}, #Lines Processed: {}, Filtered: {}, Remain: {}'
+ .format(i + 1, total_line_count,
+ total_line_count - filtered_line_count,
+ filtered_line_count))
+ end = time.time()
+ print('Done, #Lines {}/{}, Time spent {}'.format(filtered_line_count,
+ total_line_count,
+ end - start))
+ return filtered_line_count
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ description='Clean parallel corpus used in machine translation.')
+ parser.add_argument('--src-corpus', type=str, nargs='+', required=True)
+ parser.add_argument('--tgt-corpus', type=str, nargs='+', required=True)
+ parser.add_argument('--src-lang', type=str, required=True)
+ parser.add_argument('--tgt-lang', type=str, required=True)
+ parser.add_argument('--src-save-path', type=str, default=None,
+ help='Path to save the cleaned and tokenized source corpus. If not set, '
+ 'the default is "corpus.tok.{src_lang}"')
+ parser.add_argument('--tgt-save-path', type=str, default=None,
+                        help='Path to save the cleaned and tokenized target corpus. If not set, '
+                             'the default is "corpus.tok.{tgt_lang}"')
+ parser.add_argument('--src-tokenizer', type=str, default='moses')
+ parser.add_argument('--tgt-tokenizer', type=str, default='moses')
+ parser.add_argument('--max-ratio', type=float, default=None)
+ parser.add_argument('--min-num-words', type=int, default=None)
+ parser.add_argument('--max-num-words', type=int, default=None)
+ parser.add_argument('--discard-non-latin1', action='store_true',
+ help='Whether to discard the sentence pair if both sentences cannot be '
+ 'encoded into latin1.')
+ parser.add_argument('--num-process', type=int, default=8,
+ help='number of process')
+ parser.add_argument('--overwrite', action='store_true')
+
+ return parser
+
+
+def main(args):
+ src_lang, tgt_lang = args.src_lang, args.tgt_lang
+ corpus_processor = ParallelCorpusProcessor(src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ src_tokenizer=args.src_tokenizer,
+ tgt_tokenizer=args.tgt_tokenizer,
+ max_ratio=args.max_ratio,
+ min_num_words=args.min_num_words,
+ max_num_words=args.max_num_words,
+ discard_non_latin1=args.discard_non_latin1)
+ print('Clean the corpus:')
+ print(' Source {}: {}'.format(src_lang, args.src_corpus))
+ print(' Target {}: {}'.format(tgt_lang, args.tgt_corpus))
+ if args.src_save_path is None:
+ src_save_path = 'corpus.tok.{}'.format(src_lang)
+ else:
+ src_save_path = args.src_save_path
+ if args.tgt_save_path is None:
+ tgt_save_path = 'corpus.tok.{}'.format(tgt_lang)
+ else:
+ tgt_save_path = args.tgt_save_path
+ print('Save to {} -> {} \n'
+ ' {} -> {}'.format(src_lang, src_save_path, tgt_lang, tgt_save_path))
+ if (os.path.exists(src_save_path) or os.path.exists(tgt_save_path)) and not args.overwrite:
+ warnings.warn('{} or {} exists, skip. If you need to overwrite these two files, '
+ 'rerun the script with --overwrite.'.format(src_save_path, tgt_save_path))
+ else:
+ corpus_processor.process_parallel_corpus(
+ src_corpus_paths=args.src_corpus,
+ tgt_corpus_paths=args.tgt_corpus,
+ src_out_path=src_save_path,
+ tgt_out_path=tgt_save_path,
+ num_process=args.num_process)
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
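
The same pattern applies to the parallel cleaner. Again a sketch only, with placeholder paths and thresholds; it mirrors the `main()` function above rather than adding anything new.

```python
from scripts.preprocess.clean_tok_para_corpus import ParallelCorpusProcessor

processor = ParallelCorpusProcessor(src_lang='en', tgt_lang='de',
                                    src_tokenizer='moses', tgt_tokenizer='moses',
                                    max_ratio=1.5,               # placeholder ratio filter
                                    min_num_words=1, max_num_words=250,
                                    discard_non_latin1=True)
kept = processor.process_parallel_corpus(src_corpus_paths=['raw.en'],   # placeholder inputs
                                         tgt_corpus_paths=['raw.de'],
                                         src_out_path='corpus.tok.en',
                                         tgt_out_path='corpus.tok.de',
                                         num_process=4)
print('Sentence pairs kept:', kept)
```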
diff --git a/scripts/preprocess/learn_subword.py b/scripts/preprocess/learn_subword.py
new file mode 100644
index 0000000000..ba0dbde627
--- /dev/null
+++ b/scripts/preprocess/learn_subword.py
@@ -0,0 +1,252 @@
+from gluonnlp.utils.lazy_imports import try_import_sentencepiece,\
+ try_import_subword_nmt, try_import_yttm, try_import_huggingface_tokenizers
+import argparse
+import textwrap
+import os
+from collections import OrderedDict
+import json
+from uuid import uuid4
+from gluonnlp.data import Vocab
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=textwrap.dedent('''
+ Learn BPE based on different implementations.
+
+ We support the following models:
+
+ "python3 learn_subword.py --model spm" : Train a Sentencepiece Model on raw text;
+ "python3 learn_subword.py --model subword_nmt" : Train with the subword-nmt package;
+ "python3 learn_subword.py --model yttm" : Train with YouTokenToMe;
+ "python3 learn_subword.py --model hf_bytebpe" : Train with the Byte-level BPE Tokenizer Implemented by Huggingface.
+            "python3 learn_subword.py --model hf_wordpiece" : Train with the Wordpiece Tokenizer Implemented by Huggingface.
+ "python3 learn_subword.py --model hf_bpe" : Train with the BPE Tokenizer Implemented by Huggingface.
+ ''')
+ )
+ parser.add_argument('--corpus', type=str, nargs='+', required=True,
+ help='Path of the corpus. '
+ 'You may input multiple corpus files separated by space.')
+ parser.add_argument('--vocab-size', type=int, required=True,
+ help='Estimated learned vocabulary size')
+ parser.add_argument('--model', type=str, choices=['spm',
+ 'subword_nmt',
+ 'yttm',
+ 'hf_bytebpe',
+ 'hf_wordpiece',
+ 'hf_bpe'],
+ required=True, help='Subword model type')
+ parser.add_argument('--save-dir', type=str, required=True,
+ help='Directory for saving the model and vocabulary file')
+ parser.add_argument('--coverage', type=float, default=1.0,
+ help='Amount of characters covered by the model, '
+ 'this is only applicable to spm and yttm')
+ parser.add_argument('--n-threads', type=int, default=-1,
+ help='Number of threads, only applicable to yttm')
+ parser.add_argument('--input-sentence-size', type=int, default=1000000,
+                        help='Maximum number of input sentences to use, only applicable to '
+                             'sentencepiece; you can reduce this value when running out of memory')
+ parser.add_argument('--lowercase', action='store_true', default=False,
+ help='Use lowercase, '
+ 'only applicable to hf_bpe, hf_bytebpe and hf_wordpiece')
+ parser.add_argument('--strip-accents', action='store_true', default=False,
+ help='Disable BERT characters normalization, '
+ 'only applicable to hf_wordpiece')
+ parser.add_argument('--disable-bos', action='store_true', default=False,
+ help='Disable bos token (default settings enable bos)')
+ parser.add_argument('--disable-eos', action='store_true', default=False,
+ help='Disable eos token (default settings enable eos)')
+ parser.add_argument('--disable-pad', action='store_true', default=False,
+ help='Disable pad token (default settings enable pad)')
+ parser.add_argument('--custom-special-tokens', type=str, nargs='*', default=[],
+ help='Specified special tokens key value pairs besides unk, '
+ 'bos, eos and pad, for example: '
+                             '--custom-special-tokens cls_token=<CLS> sep_token=<SEP>, '
+ 'this is not applicable to yttm')
+ return parser
+
+def main(args):
+ corpus_path_list = args.corpus
+ if not os.path.exists(args.save_dir):
+ os.makedirs(args.save_dir)
+ model_prefix = os.path.join(args.save_dir, args.model)
+ special_tokens_kv = OrderedDict()
+ # unk is always required
+ special_tokens_kv['unk_token'] = Vocab.UNK_TOKEN
+ if not args.disable_bos:
+ special_tokens_kv['bos_token'] = Vocab.BOS_TOKEN
+ if not args.disable_eos:
+ special_tokens_kv['eos_token'] = Vocab.EOS_TOKEN
+ if not args.disable_pad:
+ special_tokens_kv['pad_token'] = Vocab.PAD_TOKEN
+ # split custom special tokens
+ if args.model in ['yttm'] and len(args.custom_special_tokens) > 0:
+ raise ValueError('model {} do not support custom_special_tokens'.format(args.model))
+ for custom_special_token in args.custom_special_tokens:
+ kv = custom_special_token.split('=')
+ if not len(kv) == 2:
+ raise ValueError('parameter {} has wrong format'.format(custom_special_token))
+ k, v = kv[0], kv[1]
+ if k in special_tokens_kv:
+ raise ValueError('There are overlaps between the custom special tokens and the'
+ ' unk, bos, eos, pad tokens')
+ special_tokens_kv[k] = v
+    # hf_wordpiece must contain mask, cls and sep tokens
+    # the custom-defined mask, cls and sep tokens can overwrite the default settings
+ if args.model == 'hf_wordpiece':
+ if 'mask_token' not in special_tokens_kv:
+ special_tokens_kv['mask_token'] = Vocab.MASK_TOKEN
+ if 'cls_token' not in special_tokens_kv:
+ special_tokens_kv['cls_token'] = Vocab.CLS_TOKEN
+ if 'sep_token' not in special_tokens_kv:
+ special_tokens_kv['sep_token'] = Vocab.SEP_TOKEN
+ special_tokens = list(special_tokens_kv.values())
+ print('special tokens: ' + ', '.join(special_tokens))
+ vocab = []
+ if args.model == 'spm':
+ try_import_sentencepiece()
+ import sentencepiece as spm
+ corpus_path = ','.join(corpus_path_list)
+ script = '--input={} --model_prefix={} --vocab_size={} --character_coverage={} --input_sentence_size={}' \
+ .format(corpus_path, model_prefix, args.vocab_size, args.coverage, args.input_sentence_size)
+ script += (' --unk_id=' + str(special_tokens.index(Vocab.UNK_TOKEN)))
+ script += (' --bos_id=' + ('-1' if args.disable_bos else str(special_tokens.index(Vocab.BOS_TOKEN))))
+ script += (' --eos_id=' + ('-1' if args.disable_eos else str(special_tokens.index(Vocab.EOS_TOKEN))))
+ script += (' --pad_id=' + ('-1' if args.disable_pad else str(special_tokens.index(Vocab.PAD_TOKEN))))
+ if len(args.custom_special_tokens) > 0:
+ ids_in_script = script.count('_id')
+ script += (' --control_symbols=' + ','.join(special_tokens[ids_in_script:]))
+ print(script)
+ spm.SentencePieceTrainer.Train(script)
+        if 'bos_token' in special_tokens_kv:
+            special_tokens_kv['bos_token'] = '<s>'
+        if 'eos_token' in special_tokens_kv:
+            special_tokens_kv['eos_token'] = '</s>'
+ # build spm vocab
+ spm_model = spm.SentencePieceProcessor()
+ spm_model.load(model_prefix + '.model')
+ vocab = [spm_model.id_to_piece(i) for i in range(len(spm_model))]
+ os.remove(model_prefix + '.vocab')
+ elif args.model == 'subword_nmt':
+ try_import_subword_nmt()
+ from subword_nmt import learn_bpe
+ corpus_path = cat_corpus(corpus_path_list)\
+ if len(corpus_path_list) > 1 else corpus_path_list[0]
+ # build model
+ with open(corpus_path, 'r', encoding='utf-8') as fc,\
+ open(model_prefix + '.model', 'w', encoding='utf-8') as fm:
+ learn_bpe.learn_bpe(fc, fm, args.vocab_size - len(special_tokens), total_symbols=True)
+ # build vocab
+ with open(corpus_path, 'r', encoding='utf-8') as fc, \
+ open(model_prefix + '.model', 'r', encoding='utf-8') as fm:
+ vocab.extend(special_tokens)
+ uniq_chars_internal = set()
+ uniq_chars_final = set()
+ uniq_words = set()
+ for line in fc:
+ for word in line.strip('\r\n ').split(' '):
+ if word:
+ uniq_words.add(word)
+                    # the following treatment of word-final symbols is the same as in
+                    # https://github.com/rsennrich/subword-nmt/blob/master/subword_nmt/learn_bpe.py
+            uniq_words = [tuple(x[:-1]) + (x[-1] + '</w>',) for x in uniq_words]
+ for word in uniq_words:
+ for char in word[:-1]:
+ uniq_chars_internal.add(char)
+ uniq_chars_final.add(word[-1])
+ # sort to ensure the same settings produce the same vocab
+ vocab.extend(sorted(list(uniq_chars_internal)))
+ vocab.extend(sorted(list(uniq_chars_final)))
+ fm.readline()
+ pair = fm.readline()
+ while (pair):
+ vocab.append(pair.replace(' ', '', 1).strip())
+ pair = fm.readline()
+ if len(corpus_path_list) > 1:
+ os.remove(corpus_path)
+ elif args.model == 'yttm':
+ try_import_yttm()
+ import youtokentome as yttm
+ corpus_path = cat_corpus(corpus_path_list)\
+ if len(corpus_path_list) > 1 else corpus_path_list[0]
+ tokenizer = yttm.BPE.train(
+ data=corpus_path,
+ model=model_prefix + '.model',
+ vocab_size=args.vocab_size,
+ coverage=args.coverage,
+ n_threads=args.n_threads,
+ unk_id=special_tokens.index(Vocab.UNK_TOKEN),
+ bos_id=-1 if args.disable_bos else special_tokens.index(Vocab.BOS_TOKEN),
+ eos_id=-1 if args.disable_eos else special_tokens.index(Vocab.EOS_TOKEN),
+ pad_id=-1 if args.disable_pad else special_tokens.index(Vocab.PAD_TOKEN))
+ vocab = tokenizer.vocab()
+        if 'unk_token' in special_tokens_kv:
+            special_tokens_kv['unk_token'] = '<UNK>'
+        if 'bos_token' in special_tokens_kv:
+            special_tokens_kv['bos_token'] = '<BOS>'
+        if 'eos_token' in special_tokens_kv:
+            special_tokens_kv['eos_token'] = '<EOS>'
+        if 'pad_token' in special_tokens_kv:
+            special_tokens_kv['pad_token'] = '<PAD>'
+ if len(corpus_path_list) > 1:
+ os.remove(corpus_path)
+ elif args.model in ['hf_bpe', 'hf_bytebpe', 'hf_wordpiece']:
+ tokenizers = try_import_huggingface_tokenizers()
+ if args.model == 'hf_bpe':
+ tokenizer = tokenizers.CharBPETokenizer(lowercase=args.lowercase)
+ elif args.model == 'hf_bytebpe':
+ tokenizer = tokenizers.ByteLevelBPETokenizer(lowercase=args.lowercase)
+ elif args.model == 'hf_wordpiece':
+ tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=args.lowercase,
+ strip_accents=args.strip_accents)
+ else:
+ raise NotImplementedError
+ tokenizer.train(
+ corpus_path_list,
+ vocab_size=args.vocab_size,
+ show_progress=True,
+ special_tokens=special_tokens)
+ tokenizer.save(args.save_dir, args.model)
+ # we replace the huggingface vocab file with our Vocab implementation
+ if args.model == 'hf_wordpiece':
+ hf_vocab_file = model_prefix + '-vocab.txt'
+ with open(hf_vocab_file, 'r', encoding='utf-8') as fv:
+ for line in fv:
+ vocab.append(line.strip())
+ else:
+ # Move the hf_${model}-merges.txt to hf_${model}.models
+ os.rename(os.path.join(args.save_dir, '{}-merges.txt'.format(args.model)),
+ os.path.join(args.save_dir, '{}.model'.format(args.model)))
+ hf_vocab_file = model_prefix + '-vocab.json'
+ with open(hf_vocab_file, 'r', encoding='utf-8') as fv:
+ vocab_kv = json.load(fv)
+ vocab_kv = sorted(list(vocab_kv.items()), key=lambda x: x[1])
+ for kv in vocab_kv:
+ vocab.append(kv[0])
+ os.remove(hf_vocab_file)
+ else:
+ raise NotImplementedError
+ unk_token = special_tokens_kv.pop('unk_token')
+ vocab_obj = Vocab(vocab, unk_token=unk_token, **special_tokens_kv)
+ vocab_obj.save(model_prefix + '.vocab')
+
+
+def cat_corpus(corpus_path_list):
+ # TODO Use temporary file
+ corpus_path = "./" + str(uuid4()) + '.corpus'
+ with open(corpus_path, 'wb') as cat_corpus:
+ for cp in corpus_path_list:
+ with open(cp, 'rb') as corpus:
+ cat_corpus.write(corpus.read())
+ return corpus_path
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
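
For `--model spm`, the id passed to sentencepiece for each enabled special token is simply its position in the ordered special-token list built in `main()` above. A tiny sketch of that mapping; the token strings here are placeholders, not necessarily the values of `Vocab.*_TOKEN`.

```python
from collections import OrderedDict

# With unk, bos, eos and pad all enabled, their list positions become the spm ids.
special_tokens_kv = OrderedDict(unk_token='<unk>', bos_token='<bos>',
                                eos_token='<eos>', pad_token='<pad>')
special_tokens = list(special_tokens_kv.values())

flags = ' --unk_id={} --bos_id={} --eos_id={} --pad_id={}'.format(
    special_tokens.index(special_tokens_kv['unk_token']),
    special_tokens.index(special_tokens_kv['bos_token']),
    special_tokens.index(special_tokens_kv['eos_token']),
    special_tokens.index(special_tokens_kv['pad_token']))
print(flags)   # --unk_id=0 --bos_id=1 --eos_id=2 --pad_id=3
```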
diff --git a/scripts/pretraining/README.md b/scripts/pretraining/README.md
new file mode 100644
index 0000000000..ec2c0a7ea2
--- /dev/null
+++ b/scripts/pretraining/README.md
@@ -0,0 +1,101 @@
+# Datasets
+## OpenWebTextCorpus
+Following the instructions in [Prepare OpenWebTextCorpus](../datasets/pretrain_corpus#openwebtext), download and prepare the dataset, obtaining a total of 20610 text files in the folder `prepared_owt`.
+
+```bash
+python3 data_preprocessing.py --input prepared_owt --output preprocessed_owt --max_seq_length 128 --shuffle
+```
+The above command generates the preprocessed NumPy features, which are saved as `.npz` files.
+# Pretrain Model
+## ELECTRA
+Following the [Official Quickstart](https://github.com/google-research/electra#quickstart-pre-train-a-small-electra-model), pretrain a small model using OpenWebText as the pretraining corpus. Note that [horovod](https://github.com/horovod/horovod) needs to be installed in advance if `comm_backend` is set to `horovod`.
+
+```bash
+horovodrun -np 2 -H localhost:2 python3 -m run_electra \
+ --model_name google_electra_small \
+ --data 'preprocessed_owt/*.npz' \
+ --generator_units_scale 0.25 \
+ --gpus 0,1 \
+ --do_train \
+ --do_eval \
+ --output_dir ${OUTPUT} \
+ --num_accumulated 1 \
+ --batch_size 64 \
+ --lr 5e-4 \
+ --wd 0.01 \
+ --max_seq_len 128 \
+ --max_grad_norm 1 \
+ --warmup_steps 10000 \
+ --num_train_steps 1000000 \
+ --log_interval 50 \
+ --save_interval 10000 \
+ --mask_prob 0.15 \
+ --comm_backend horovod \
+```
+
+Alternatively, we can preprocess the features on the fly and train the model directly on the raw text:
+```bash
+horovodrun -np 2 -H localhost:2 python3 -m run_electra \
+ --model_name google_electra_small \
+ --generator_units_scale 0.25 \
+ --data 'prepared_owt/*.txt' \
+ --from_raw \
+ --gpus 0,1 \
+ --do_train \
+ --do_eval \
+ --output_dir ${OUTPUT} \
+ --num_accumulated 1 \
+ --batch_size 64 \
+ --lr 5e-4 \
+ --wd 0.01 \
+ --max_seq_len 128 \
+ --max_grad_norm 1 \
+ --warmup_steps 10000 \
+ --num_train_steps 1000000 \
+ --log_interval 50 \
+ --save_interval 10000 \
+ --mask_prob 0.15 \
+ --comm_backend horovod \
+```
+
+For ease of verification, the small model pretrained on OpenWebText, named `gluon_electra_small_owt`, has been released and uploaded to S3 with the following directory structure:
+
+```
+gluon_electra_small_owt
+├── vocab-{short_hash}.json
+├── model-{short_hash}.params
+├── model-{short_hash}.yml
+├── gen_model-{short_hash}.params
+├── disc_model-{short_hash}.params
+```
+
+After pretraining, the model can be fine-tuned on downstream NLP tasks such as question answering. Here is an example of fine-tuning a locally pretrained model on [SQuAD 1.1/2.0](../question_answering#squad).
+
+```bash
+python3 run_squad.py \
+ --model_name google_electra_small \
+ --data_dir squad \
+ --backbone_path ${OUTPUT}/model-{short_hash}.params \
+    --output_dir ${FINE_TUNE_OUTPUT} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 32 \
+ --num_accumulated 1 \
+ --gpus 0 \
+ --epochs 2 \
+ --lr 3e-4 \
+ --layerwise_decay 0.8 \
+ --warmup_ratio 0.1 \
+ --max_saved_ckpt 6 \
+ --all_evaluate \
+ --wd 0 \
+ --max_seq_length 128 \
+ --max_grad_norm 0.1 \
+```
+
+This produces the following results:
+
+| Model Name | SQuAD1.1 dev | SQuAD2.0 dev |
+|--------------------------|---------------|--------------|
+|gluon_electra_small_owt | 69.40/76.98 | 67.63/69.89 |
diff --git a/scripts/pretraining/data_preprocessing.py b/scripts/pretraining/data_preprocessing.py
new file mode 100644
index 0000000000..1f75e2f782
--- /dev/null
+++ b/scripts/pretraining/data_preprocessing.py
@@ -0,0 +1,89 @@
+"""
+Prepare the feature for openwebtext dataset
+"""
+import os
+import time
+import math
+import random
+import argparse
+import multiprocessing
+
+import numpy as np
+
+from pretraining_utils import get_all_features
+from gluonnlp.models import get_backbone
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("-i", "--input", required=True,
+                        help="path to the extracted OpenWebText dataset")
+ parser.add_argument("-o", "--output", default="preprocessed_owt",
+ help="directory for preprocessed features")
+ parser.add_argument("--num_process", type=int, default=8,
+ help="number of processes for multiprocessing")
+ parser.add_argument("--max_seq_length", type=int, default=128,
+ help="the maximum length of the pretraining sequence")
+ parser.add_argument("--num_out_files", type=int, default=1000,
+ help="Number of desired output files, where each is processed"
+ " independently by a worker.")
+ parser.add_argument('--model_name', type=str, default='google_electra_small',
+ help='Name of the pretrained model.')
+ parser.add_argument("--shuffle", action="store_true",
+                        help="Whether to shuffle the data order")
+ parser.add_argument("--do_lower_case", dest='do_lower_case',
+ action="store_true", help="Lower case input text.")
+ parser.add_argument("--no_lower_case", dest='do_lower_case',
+ action='store_false', help="Don't lower case input text.")
+ parser.add_argument("--short_seq_prob", type=float, default=0.05,
+ help="The probability of sampling sequences shorter than"
+ " the max_seq_length.")
+ parser.set_defaults(do_lower_case=True)
+ return parser
+
+
+def main(args):
+ num_process = min(multiprocessing.cpu_count(), args.num_process)
+ _, cfg, tokenizer, _, _ = \
+ get_backbone(args.model_name, load_backbone=False)
+
+ fnames = sorted(os.listdir(args.input))
+ fnames = [os.path.join(args.input, fname) for fname in fnames]
+ if args.shuffle:
+ random.shuffle(fnames)
+ num_files = len(fnames)
+ num_out_files = min(args.num_out_files, num_files)
+    split_files = np.array_split(fnames, num_out_files)
+ output_files = [os.path.join(
+ args.output, "owt-pretrain-record-{}.npz".format(str(i).zfill(4))) for i in range(num_out_files)]
+ print("All preprocessed features will be saved in {} npz files".format(num_out_files))
+    os.makedirs(args.output, exist_ok=True)
+ num_process = min(num_process, num_out_files)
+ print('Start preprocessing {} text files with {} cores'.format(
+ num_files, num_process))
+    process_args = [
+        (split_files[i],
+         output_files[i],
+         tokenizer,
+         args.max_seq_length,
+         args.short_seq_prob) for i in range(num_out_files)]
+ start_time = time.time()
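+    # each worker processes one group of input files and writes a single .npz output file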
+    with multiprocessing.Pool(num_process) as pool:
+        feature_iter = pool.imap(get_all_features, process_args)
+        fea_written = 0
+        f_read = 0
+        for i, np_features in enumerate(feature_iter):
+            elapsed = time.time() - start_time
+            fea_written += len(np_features[0])
+            f_read += len(split_files[i])
+            print("Processed {} features from {} files, Elapsed: {:.2f}s, ETA: {:.2f}s".format(
+                fea_written, f_read, elapsed, (num_files - f_read) / (f_read / elapsed)))
+ print("Done processing within {:.2f} seconds".format(elapsed))
+
+
+if __name__ == '__main__':
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
diff --git a/scripts/pretraining/pretraining_utils.py b/scripts/pretraining/pretraining_utils.py
new file mode 100644
index 0000000000..cc84641589
--- /dev/null
+++ b/scripts/pretraining/pretraining_utils.py
@@ -0,0 +1,554 @@
+"""Utilities for pre-training."""
+import io
+import os
+import re
+import random
+import logging
+import collections
+
+import numpy as np
+from mxnet.gluon import HybridBlock
+from mxnet.gluon.data import ArrayDataset
+
+import gluonnlp.data.batchify as bf
+from gluonnlp.utils.misc import glob
+from gluonnlp.data.loading import NumpyDataset, DatasetLoader
+from gluonnlp.data.sampler import SplitSampler, FixedBucketSampler
+from gluonnlp.op import select_vectors_by_position, update_vectors_by_position
+
+PretrainFeature = collections.namedtuple(
+ 'PretrainFeature',
+ ['input_id',
+ 'segment_id',
+ 'valid_length'])
+
+
+def tokenize_lines_to_ids(lines, tokenizer):
+ """
+ Worker function to tokenize lines based on the tokenizer, and perform vocabulary lookup.
+
+ Parameters
+ ----------
+ lines
+ Lines to be tokenized of the whole file
+ tokenizer
+ The trained tokenizer
+
+ Returns
+ -------
+ results
+ A list storing the valid tokenized lines
+ """
+ results = []
+ # tag line delimiters or doc delimiters
+ for line in lines:
+ if not line:
+ break
+ line = line.strip()
+ # Single empty lines are used as line delimiters
+ # Double empty lines are used as document delimiters
+ if not line:
+ results.append([])
+ else:
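+            # tokenize the line and map tokens to vocabulary ids in a single call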
+ token_ids = tokenizer.encode(line, int)
+ if token_ids:
+ results.append(token_ids)
+ return results
+
+
+def get_all_features(x):
+ """
+ Get the feature data in numpy form.
+
+ Parameters
+ ----------
+ x
+ List/tuple that contains:
+
+ - file_list
+ A list of text files
+ - output_file
+            The path to an output file that stores the np_features
+ - tokenizer
+ The trained tokenizer
+ - max_seq_length
+ Maximum sequence length of the training features
+ - short_seq_prob
+ The probability of sampling sequences shorter than the max_seq_length.
+
+ Returns
+ -------
+ np_features
+ A tuple of (input_ids, segment_ids, valid_lengths),
+ in which each item is a list of numpy arrays.
+ """
+ file_list, output_file, tokenizer, max_seq_length, short_seq_prob = x
+ all_features = []
+ for text_file in file_list:
+ features = process_a_text(text_file, tokenizer, max_seq_length, short_seq_prob)
+ all_features.extend(features)
+ np_features = convert_to_npz(all_features, output_file)
+ return np_features
+
+
+def process_a_text(text_file, tokenizer, max_seq_length, short_seq_prob=0.05):
+ """
+ Create features from a single raw text file, in which one line is treated
+ as a sentence, and double blank lines represent document separators.
+
+    In this process, framework-agnostic features are generated so that they can easily be
+    converted to features of a particular deep learning framework in subsequent steps.
+
+ Parameters
+ ----------
+ text_file
+ The path to a single text file
+ tokenizer
+ The trained tokenizer
+ max_seq_length
+ Maximum sequence length of the training features
+ short_seq_prob
+ The probability of sampling sequences shorter than the max_seq_length.
+
+ Returns
+ -------
+ features
+ A list of processed features from a single text file
+ """
+ vocab = tokenizer.vocab
+ features = []
+ # TODO(zheyuye), support whole word masking
+ with io.open(text_file, 'r', encoding='utf-8') as reader:
+ lines = reader.readlines()
+ tokenized_lines = tokenize_lines_to_ids(lines, tokenizer)
+ target_seq_length = max_seq_length
+ current_sentences = []
+ current_length = 0
+ for tokenized_line in tokenized_lines:
+ current_sentences.append(tokenized_line)
+ current_length += len(tokenized_line)
+            # Create a feature when an empty line is encountered or the target length is reached
+ if (not tokenized_line and current_length != 0) or (
+ current_length >= target_seq_length):
+ first_segment, second_segment = \
+ sentenceize(current_sentences, max_seq_length, target_seq_length)
+
+ input_id = [vocab.cls_id] + first_segment + [vocab.sep_id]
+ segment_id = [0] * len(input_id)
+
+ if second_segment:
+ input_id += second_segment + [vocab.sep_id]
+ segment_id += [1] * (len(second_segment) + 1)
+
+ # Padding with zeros for parallel storage
+ valid_length = len(input_id)
+ input_id += [0] * (max_seq_length - len(input_id))
+ segment_id += [0] * (max_seq_length - len(segment_id))
+
+ feature = PretrainFeature(input_id=input_id,
+ segment_id=segment_id,
+ valid_length=valid_length)
+ features.append(feature)
+
+ current_sentences = []
+ current_length = 0
+            # small chance to generate a random-length feature instead of a full max_seq_length one
+ if random.random() < short_seq_prob:
+ target_seq_length = random.randint(5, max_seq_length)
+ else:
+ target_seq_length = max_seq_length
+
+ return features
+
+
+def convert_to_npz(all_features, output_file=None):
+ """
+    Convert features to numpy arrays and store them if output_file is provided
+
+ Parameters
+ ----------
+ all_features
+ A list of processed features.
+ output_file
+        The path to an output file that stores the np_features.
+
+    Returns
+    -------
+    input_ids
+        A list of input token ids as numpy arrays
+ segment_ids
+ The segment ids
+ valid_lengths
+ The valid lengths
+ """
+ input_ids = []
+ segment_ids = []
+ valid_lengths = []
+ for fea_index, feature in enumerate(all_features):
+ input_ids.append(np.ascontiguousarray(feature.input_id, dtype='int32'))
+ segment_ids.append(np.ascontiguousarray(feature.segment_id, dtype='int32'))
+ valid_lengths.append(feature.valid_length)
+ if fea_index < 1:
+ logging.debug('*** Example Feature ***')
+ logging.debug('Generated {}'.format(feature))
+
+ if output_file:
+        # The numpy arrays are fixed to length max_seq_length with zero padding
+ npz_outputs = collections.OrderedDict()
+ npz_outputs['input_ids'] = np.array(input_ids, dtype='int32')
+ npz_outputs['segment_ids'] = np.array(segment_ids, dtype='int32')
+ npz_outputs['valid_lengths'] = np.array(valid_lengths, dtype='int32')
+ np.savez_compressed(output_file, **npz_outputs)
+ logging.info("Saved {} features in {} ".format(len(all_features), output_file))
+ return input_ids, segment_ids, valid_lengths
+
+
+def sentenceize(current_sentences, max_seq_length, target_seq_length):
+ """
+    Generate a pair of sentences based on a segmentation strategy
+    cloned from the official ELECTRA implementation.
+
+ Parameters
+ ----------
+    current_sentences
+        The list of tokenized sentences accumulated so far
+ max_seq_length
+ Maximum sequence length of the training features
+ target_seq_length
+ Target sequence length of the training features
+ Returns
+ -------
+ first_segment
+ The first sentence of the pretraining sequence
+ second_segment
+ The second sentence of the pretraining sequence.
+ Could be None for diversity of training instances.
+ """
+ # 10% chance to only produce one segment
+ if random.random() < 0.1:
+ first_segment_target_length = 100000
+ else:
+        # Reserve space for the [CLS] and two [SEP] tokens; the remainder is split between segments
+ first_segment_target_length = (target_seq_length - 3) // 2
+ first_segment = []
+ second_segment = []
+ for sentence in current_sentences:
+ if sentence:
+ # the sentence goes to the first segment if (1) the first segment is
+ # empty, (2) the sentence doesn't put the first segment over length or
+ # (3) 50% of the time when it does put the first segment over length
+ if (len(first_segment) == 0 or
+ len(first_segment) + len(sentence) < first_segment_target_length or
+ (len(second_segment) == 0 and
+ len(first_segment) < first_segment_target_length and
+ random.random() < 0.5)):
+ first_segment += sentence
+ else:
+ second_segment += sentence
+
+ # trim to max_length while accounting for not-yet-added [CLS]/[SEP] tokens
+ first_segment = first_segment[:max_seq_length - 2]
+ second_segment = second_segment[:max(0, max_seq_length -
+ len(first_segment) - 3)]
+
+ return first_segment, second_segment
+
+
+def prepare_pretrain_npz_dataset(filename, allow_pickle=False):
+ """Create dataset based on the numpy npz file"""
+ if isinstance(filename, (list, tuple)):
+ assert len(filename) == 1, \
+ 'When .npy/.npz data file is loaded, len(filename) must be 1.' \
+ ' Received len(filename)={}.'.format(len(filename))
+ filename = filename[0]
+ logging.debug('start to load file %s ...', filename)
+ return NumpyDataset(filename, allow_pickle=allow_pickle)
+
+
+def prepare_pretrain_text_dataset(
+ filenames,
+ tokenizer,
+ max_seq_length,
+ short_seq_prob,
+ cached_file_path):
+ """Create dataset based on the raw text files"""
+ if not isinstance(filenames, (list, tuple)):
+ filenames = [filenames]
+ if cached_file_path:
+        # derive an output filename from the input filename so that cached files do not collide.
+ # filename example: urlsf_subset00-130_data.txt
+ suffix = re.split(r'\.|/', filenames[0])[-2]
+ output_file = os.path.join(cached_file_path, "{}-pretrain-record.npz".format(suffix))
+ else:
+ output_file = None
+ np_features = get_all_features(
+ (filenames, output_file, tokenizer, max_seq_length, short_seq_prob))
+
+ return ArrayDataset(*np_features)
+
+
+def prepare_pretrain_bucket_sampler(dataset, batch_size, shuffle=False, num_buckets=1):
+ """Create data sampler based on the dataset"""
+ if isinstance(dataset, NumpyDataset):
+ lengths = dataset.get_field('valid_lengths')
+ else:
+ lengths = dataset.transform(lambda input_ids, segment_ids,
+ valid_lengths: valid_lengths, lazy=False)
+ sampler = FixedBucketSampler(lengths,
+ batch_size=batch_size,
+ num_buckets=num_buckets,
+ ratio=0,
+ shuffle=shuffle)
+ logging.debug('Sampler created for a new dataset:\n {}'.format(sampler))
+ return sampler
+
+
+def get_pretrain_data_npz(data, batch_size, shuffle, num_buckets,
+ vocab, num_parts=1, part_idx=0,
+ num_dataset_workers=1, num_batch_workers=1,
+ circle_length=1, repeat=1,
+ dataset_cached=False,
+ num_max_dataset_cached=0):
+ """Get a data iterator from pre-processed npz files.
+
+ Parameters
+ ----------
+ data: str
+ The path to the dataset directory
+ batch_size : int
+ The batch size per GPU.
+ shuffle : bool
+ Whether to shuffle the data.
+ num_buckets : int
+ The number of buckets for the FixedBucketSampler for training.
+ vocab : Vocab
+ The vocabulary.
+ num_parts : int
+ The number of partitions for the dataset.
+ part_idx : int
+ The index of the partition to read.
+ num_dataset_workers : int
+ The number of worker processes for dataset construction.
+ num_batch_workers : int
+        The number of worker processes for batch construction.
+ circle_length : int, default is 1
+ The number of files to be read for a single worker at the same time.
+ When circle_length is larger than 1, we merge circle_length files.
+ repeat : int, default is 1
+ The number of times that files are repeated.
+ dataset_cached : bool, default is False
+ Whether or not to cache last processed dataset.
+ Each processed dataset can only be cached for once.
+ When there is no new available processed dataset to be fetched,
+ we pop a cached processed dataset.
+ num_max_dataset_cached : int, default is 0
+ Maximum number of cached datasets. It is valid only if dataset_cached is True
+ """
+ num_files = len(glob(data))
+ logging.info('%d files are found.', num_files)
+ assert num_files >= num_parts, \
+ 'The number of text files must be no less than the number of ' \
+ 'workers/partitions (%d). Only %d files at %s are found.' % (num_parts, num_files, data)
+ split_sampler = SplitSampler(num_files, num_parts=num_parts,
+ part_index=part_idx, repeat=repeat)
+ dataset_fn = prepare_pretrain_npz_dataset
+ sampler_fn = prepare_pretrain_bucket_sampler
+ dataset_params = {'allow_pickle': True}
+ sampler_params = {'batch_size': batch_size, 'shuffle': shuffle, 'num_buckets': num_buckets}
+ batchify_fn = bf.Tuple(
+ bf.Pad(val=vocab.pad_id), # input_ids
+ bf.Pad(val=0), # segment_ids
+ bf.Stack(), # valid_lengths
+ )
+ dataloader = DatasetLoader(data,
+ file_sampler=split_sampler,
+ dataset_fn=dataset_fn,
+ batch_sampler_fn=sampler_fn,
+ dataset_params=dataset_params,
+ batch_sampler_params=sampler_params,
+ batchify_fn=batchify_fn,
+ num_dataset_workers=num_dataset_workers,
+ num_batch_workers=num_batch_workers,
+ pin_memory=False,
+ circle_length=circle_length)
+ return dataloader
+
+
+def get_pretrain_data_text(data, batch_size, shuffle, num_buckets, tokenizer, vocab,
+ max_seq_length, short_seq_prob=0.05, num_parts=1,
+ part_idx=0, num_dataset_workers=1, num_batch_workers=1,
+ circle_length=1, repeat=1, cached_file_path=None):
+ """Get a data iterator from raw text documents.
+
+ Parameters
+ ----------
+ batch_size : int
+ The batch size per GPU.
+ shuffle : bool
+ Whether to shuffle the data.
+ num_buckets : int
+ The number of buckets for the FixedBucketSampler for training.
+ vocab : Vocab
+ The vocabulary.
+ tokenizer : HuggingFaceWordPieceTokenizer or SentencepieceTokenizer
+ The tokenizer.
+ max_seq_length : int
+ The hard limit of maximum sequence length of sentence pairs.
+ short_seq_prob : float
+ The probability of sampling sequences shorter than the max_seq_length.
+ num_parts : int
+ The number of partitions for the dataset.
+ part_idx : int
+ The index of the partition to read.
+ num_dataset_workers : int
+ The number of worker processes for dataset construction.
+ num_batch_workers : int
+ The number of worker processes for batch construction.
+ circle_length : int, default is 1
+ The number of files to be read for a single worker at the same time.
+ When circle_length is larger than 1, we merge circle_length files.
+ repeat : int, default is 1
+ The number of times that files are repeated.
+ cached_file_path: str, default is None
+ Directory for saving preprocessed features
+ """
+ num_files = len(glob(data))
+ logging.info('%d files are found.', num_files)
+ assert num_files >= num_parts, \
+ 'The number of text files must be no less than the number of ' \
+ 'workers/partitions (%d). Only %d files at %s are found.' % (num_parts, num_files, data)
+ split_sampler = SplitSampler(num_files, num_parts=num_parts,
+ part_index=part_idx, repeat=repeat)
+ dataset_fn = prepare_pretrain_text_dataset
+ sampler_fn = prepare_pretrain_bucket_sampler
+ dataset_params = {'tokenizer': tokenizer, 'max_seq_length': max_seq_length,
+ 'short_seq_prob': short_seq_prob, 'cached_file_path': cached_file_path}
+ sampler_params = {'batch_size': batch_size, 'shuffle': shuffle, 'num_buckets': num_buckets}
+ batchify_fn = bf.Tuple(
+ bf.Pad(val=vocab.pad_id), # input_ids
+ bf.Pad(val=0), # segment_ids
+ bf.Stack(), # valid_lengths
+ )
+
+ dataloader = DatasetLoader(data,
+ file_sampler=split_sampler,
+ dataset_fn=dataset_fn,
+ batch_sampler_fn=sampler_fn,
+ dataset_params=dataset_params,
+ batch_sampler_params=sampler_params,
+ batchify_fn=batchify_fn,
+ num_dataset_workers=num_dataset_workers,
+ num_batch_workers=num_batch_workers,
+ pin_memory=False,
+ circle_length=circle_length)
+ return dataloader
+
+
+class ElectraMasker(HybridBlock):
+    """Process the pre-processed pretraining data by dynamically masking tokens."""
+ MaskedInput = collections.namedtuple('MaskedInput',
+ ['input_ids',
+ 'masks',
+ 'unmasked_tokens',
+ 'masked_positions',
+ 'masked_weights'])
+
+ def __init__(self, tokenizer, max_seq_length, mask_prob,
+ proposal_distribution=1.0):
+ super().__init__()
+ self._max_seq_length = max_seq_length
+ self._mask_prob = mask_prob
+ self._max_num_masked_position = int((self._mask_prob + 0.005) *
+ self._max_seq_length)
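+        # e.g. mask_prob=0.15 and max_seq_length=128 give at most int(0.155 * 128) = 19 masked positions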
+ self._proposal_distribution = proposal_distribution
+ self.vocab = tokenizer.vocab
+
+ def dynamic_masking(self, F, input_ids, valid_lengths):
+ # TODO(zheyuye), two additional flag `disallow_from_mask` and `already_masked`
+ # that control the masking status for each positions in the sequence.
+        """
+        Generate masking positions on-the-fly instead of during preprocessing.
+
+        Parameters
+        ----------
+        input_ids
+            The batchified input_ids with shape (batch_size, max_seq_length)
+        valid_lengths
+            The batchified valid_lengths with shape (batch_size, )
+
+        Returns
+        -------
+        masked_input_ids
+            The input sequence in which the sampled positions are replaced by the [MASK] token.
+            shape (batch_size, max_seq_length)
+        length_masks
+            The mask over the whole sequence marking with 1 the positions that fall within
+            valid_length.
+            shape (batch_size, max_seq_length)
+        unmasked_tokens
+            The original tokens at the masked positions.
+            shape (batch_size, num_masked_positions)
+        masked_positions
+            The indices of the masked positions.
+            shape (batch_size, num_masked_positions)
+        masked_weights
+            The weight matrix containing 0 or 1 to mark which masked positions are in effect.
+            shape (batch_size, num_masked_positions)
+        """
+ N = self._max_num_masked_position
+        # Only valid tokens, excluding the special tokens, are allowed to be masked
+ valid_candidates = F.np.ones_like(input_ids, dtype=np.bool)
+ ignore_tokens = [self.vocab.cls_id, self.vocab.sep_id, self.vocab.pad_id]
+
+ for ignore_token in ignore_tokens:
+ # TODO(zheyuye), Update when operation += supported
+ valid_candidates = valid_candidates * \
+ F.np.not_equal(input_ids, ignore_token)
+ valid_lengths = valid_lengths.astype(np.float32)
+ valid_candidates = valid_candidates.astype(np.float32)
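+        # mask at least one position per sequence, but never more than the cap N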
+ num_masked_position = F.np.maximum(
+ 1, F.np.minimum(N, round(valid_lengths * self._mask_prob)))
+
+ # Get the masking probability of each position
+ sample_probs = self._proposal_distribution * valid_candidates
+ sample_probs /= F.np.sum(sample_probs, axis=-1, keepdims=True)
+ sample_probs = F.npx.stop_gradient(sample_probs)
+ gumbels = F.np.random.gumbel(F.np.zeros_like(sample_probs))
+        # Following the instructions of the official repo, use top-k sampling to avoid
+        # duplicate positions, as in https://github.com/google-research/electra/issues/41
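+        # Adding i.i.d. Gumbel noise to the log-probabilities and taking the top-k indices
+        # draws k distinct positions in proportion to sample_probs (the Gumbel-top-k trick)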
+ masked_positions = F.npx.topk(
+ F.np.log(sample_probs) + gumbels, k=N,
+ axis=-1, ret_typ='indices', dtype=np.int32)
+
+ masked_weights = F.npx.sequence_mask(
+ F.np.ones_like(masked_positions),
+ sequence_length=num_masked_position,
+ use_sequence_length=True, axis=1, value=0)
+ masked_positions = masked_positions * masked_weights
+ length_masks = F.npx.sequence_mask(
+ F.np.ones_like(input_ids, dtype=np.float32),
+ sequence_length=valid_lengths,
+ use_sequence_length=True, axis=1, value=0)
+ unmasked_tokens = select_vectors_by_position(
+ F, input_ids, masked_positions) * masked_weights
+ masked_weights = masked_weights.astype(np.float32)
+ replaced_positions = (
+ F.np.random.uniform(
+ F.np.zeros_like(masked_positions),
+ F.np.ones_like(masked_positions)) > self._mask_prob) * masked_positions
+        # deal with multiple zero values in replaced_positions, which would cause
+        # the [CLS] token to be replaced
+ filled = F.np.where(
+ replaced_positions,
+ self.vocab.mask_id,
+ self.vocab.cls_id).astype(
+ np.int32)
+        # Mask tokens by replacing them with [MASK]
+ masked_input_ids = update_vectors_by_position(F, input_ids, filled, replaced_positions)
+
+        # Note: masked_positions is likely to contain multiple zero values if the number of
+        # masked positions has not reached the maximum. However, this case rarely occurs since
+        # valid_length is almost always equal to max_seq_length
+ masked_input = self.MaskedInput(input_ids=masked_input_ids,
+ masks=length_masks,
+ unmasked_tokens=unmasked_tokens,
+ masked_positions=masked_positions,
+ masked_weights=masked_weights)
+ return masked_input
diff --git a/scripts/pretraining/run_electra.py b/scripts/pretraining/run_electra.py
new file mode 100644
index 0000000000..1678eeae8d
--- /dev/null
+++ b/scripts/pretraining/run_electra.py
@@ -0,0 +1,554 @@
+"""Pretraining Example for Electra Model on the OpenWebText dataset"""
+
+import os
+import time
+import shutil
+import logging
+import argparse
+import functools
+import collections
+
+import mxnet as mx
+import numpy as np
+from mxnet.lr_scheduler import PolyScheduler
+
+from sklearn import metrics
+from pretraining_utils import ElectraMasker, get_pretrain_data_npz, get_pretrain_data_text
+from gluonnlp.utils.misc import repeat, grouper, set_seed, init_comm, logging_config, naming_convention
+from gluonnlp.initializer import TruncNorm
+from gluonnlp.models.electra import ElectraModel, ElectraForPretrain, get_pretrained_electra
+from gluonnlp.utils.parameter import clip_grad_global_norm
+try:
+ import horovod.mxnet as hvd
+except ImportError:
+ pass
+
+mx.npx.set_np()
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument('--model_name', type=str, default='google_electra_small',
+ help='Name of the pretrained model.')
+ parser.add_argument('--do_train', action='store_true',
+ help='Whether to train the model')
+ parser.add_argument('--do_eval', action='store_true',
+ help='Whether to evaluate the model')
+ parser.add_argument('--data', type=str, default=None,
+ help='Path to pretraining corpus file. File name with wildcard such as'
+ ' dir/*.npz is accepted. Or file name with wildcard such as dir/*.txt if'
+ ' --from_raw_text is set.')
+ parser.add_argument('--output_dir', type=str, default='electra_owt',
+ help='The output directory where the model params will be written.'
+                             ' Default is electra_owt.')
+ # Training hyperparameters
+ parser.add_argument('--seed', type=int, default=100, help='Random seed')
+ parser.add_argument('--log_interval', type=int,
+ default=100, help='The logging interval.')
+ parser.add_argument('--save_interval', type=int, default=1000,
+                        help='The number of steps between saving model parameters.'
+                             ' Default is 1000.')
+ # Data Loading from npz, need to be same as pretraining example
+ parser.add_argument('--max_seq_length', type=int, default=128,
+ help='The maximum total input sequence length after tokenization.'
+                             ' Sequences longer than this will be truncated, and sequences shorter '
+ 'than this will be padded. default is 128')
+ parser.add_argument("--do_lower_case", dest='do_lower_case',
+ action="store_true", help="Lower case input text. Default is True")
+ parser.add_argument("--no_lower_case", dest='do_lower_case',
+ action='store_false', help="Don't lower case input text.")
+ parser.add_argument('--mask_prob', type=float, default=0.15,
+ help='mask probability for generator input')
+ parser.set_defaults(do_lower_case=True)
+ parser.add_argument('--num_dataset_workers', type=int, default=4,
+ help='Number of workers to pre-process dataset.')
+ parser.add_argument('--num_batch_workers', type=int, default=2,
+ help='Number of workers to pre-process mini-batch.')
+ parser.add_argument('--num_buckets', type=int, default=1,
+ help='Number of buckets for variable length sequence sampling')
+ # Data pre-processing from raw text. the below flags are only valid if --from_raw_text is set
+ parser.add_argument('--from_raw_text', action='store_true',
+ help='If set, both training and dev samples are generated on-the-fly '
+ 'from raw texts instead of pre-processed npz files. ')
+ parser.add_argument("--short_seq_prob", type=float, default=0.05,
+ help='The probability of sampling sequences '
+ 'shorter than the max_seq_length.')
+ parser.add_argument("--cached_file_path", default=None,
+ help='Directory for saving preprocessed features')
+ parser.add_argument('--circle_length', type=int, default=2,
+ help='Number of files to be read for a single GPU at the same time.')
+ parser.add_argument('--repeat', type=int, default=8,
+ help='Number of times that files are repeated in each shuffle.')
+ # Optimization
+ parser.add_argument('--num_train_steps', type=int, default=1000000,
+ help='The number of training steps. Note that epochs will be ignored '
+ 'if training steps are set')
+ parser.add_argument('--warmup_steps', type=int, default=10000,
+ help='warmup steps. Note that either warmup_steps or warmup_ratio is set.')
+ parser.add_argument('--warmup_ratio', type=float, default=0.1,
+ help='Ratio of warmup steps in the learning rate scheduler.')
+ parser.add_argument('--batch_size', type=int, default=8,
+ help='Batch size. Number of examples per gpu in a minibatch. default is 8')
+ parser.add_argument('--max_grad_norm', type=float, default=1.0,
+ help='Max gradient norm.')
+ parser.add_argument('--optimizer', type=str, default='adamw',
+ help='optimization algorithm. default is adamw')
+ parser.add_argument('--lr_decay_power', type=float, default=1.0,
+ help="Decay power for layer-wise learning rate")
+ parser.add_argument('--num_accumulated', type=int, default=1,
+ help='The number of batches for gradients accumulation to '
+ 'simulate large batch size.')
+ parser.add_argument('--lr', type=float, default=5e-4,
+ help='Initial learning rate. default is 5e-4')
+ parser.add_argument('--wd', type=float, default=0.01, help='weight decay')
+ parser.add_argument('--start_step', type=int, default=0,
+ help='Start optimization step from the checkpoint.')
+    # Model Configuration
+    parser.add_argument('--disc_weight', type=float, default=50.0,
+                        help='loss weight for the discriminator')
+    parser.add_argument('--gen_weight', type=float, default=1.0,
+                        help='loss weight for the generator')
+ parser.add_argument('--hidden_dropout_prob', type=float, default=0.1,
+ help='dropout of hidden layer')
+ parser.add_argument('--attention_dropout_prob', type=float, default=0.1,
+ help='dropout of attention layer')
+ parser.add_argument('--generator_units_scale', type=float, default=None,
+ help='The scale size of the generator units')
+ parser.add_argument('--generator_layers_scale', type=float, default=None,
+ help='The scale size of the generator layer')
+ # Communication
+ parser.add_argument('--comm_backend', type=str, default='device',
+ choices=['horovod', 'dist_sync_device', 'device'],
+ help='Communication backend.')
+ parser.add_argument('--gpus', type=str, default='0',
+ help='list of gpus to run, e.g. 0 or 0,2,5. -1 means using cpu.')
+ args = parser.parse_args()
+ return args
+
+
+def get_pretraining_model(model_name, ctx_l,
+ max_seq_length=128,
+ hidden_dropout_prob=0.1,
+ attention_dropout_prob=0.1,
+ generator_units_scale=None,
+ generator_layers_scale=None):
+ """
+    An ELECTRA pretraining model is built with a generator and a discriminator, in which
+    the generator shares its embeddings with the discriminator but uses a different backbone.
+ """
+ cfg, tokenizer, _, _ = get_pretrained_electra(
+ model_name, load_backbone=False)
+ cfg = ElectraModel.get_cfg().clone_merge(cfg)
+ cfg.defrost()
+ cfg.MODEL.hidden_dropout_prob = hidden_dropout_prob
+ cfg.MODEL.attention_dropout_prob = attention_dropout_prob
+ cfg.MODEL.max_length = max_seq_length
+ # Keep the original generator size if not designated
+ if generator_layers_scale:
+ cfg.MODEL.generator_layers_scale = generator_layers_scale
+ if generator_units_scale:
+ cfg.MODEL.generator_units_scale = generator_units_scale
+ cfg.freeze()
+
+ model = ElectraForPretrain(cfg,
+ uniform_generator=False,
+ tied_generator=False,
+ tied_embeddings=True,
+ disallow_correct=False,
+ weight_initializer=TruncNorm(stdev=0.02))
+ model.initialize(ctx=ctx_l)
+ model.hybridize()
+ return cfg, tokenizer, model
+
+
+ElectraOutput = collections.namedtuple('ElectraOutput',
+ ['mlm_scores',
+ 'rtd_scores',
+ 'rtd_labels',
+ 'corrupted_tokens'])
+
+
+def final_save(model, save_dir, tokenizer):
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir)
+
+ with open(os.path.join(save_dir, 'model.yml'), 'w') as of:
+ of.write(model.disc_cfg.dump())
+ tokenizer.vocab.save(os.path.join(save_dir, 'vocab.json'))
+ model.disc_backbone.save_parameters(os.path.join(save_dir, 'model.params'))
+ model.discriminator.save_parameters(os.path.join(save_dir, 'disc_model.params'))
+ model.generator.save_parameters(os.path.join(save_dir, 'gen_model.params'))
+
+ logging.info('Statistics:')
+
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(save_dir, old_name)
+ old_path = os.path.join(save_dir, old_name)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(save_dir, new_name, long_hash, file_size))
+
+
+def parameters_option(step_num, model, ckpt_dir, option='Saving'):
+ """Save or load the model parameter, marked by step_num."""
+ param_path = os.path.join(
+ ckpt_dir, '{}.params'.format(str(step_num).zfill(7)))
+ logging.info('[step {}], {} model params to/from {}.'.format(
+ step_num, option, param_path))
+ if option == 'Saving':
+ model.save_parameters(param_path)
+ return param_path
+ elif option == 'Loading':
+ model.load_parameters(param_path)
+ return model
+ else:
+ raise NotImplementedError('Unknown Option: {}'.format(option))
+
+
+def states_option(step_num, trainer, ckpt_dir, local_rank=0, option='Saving'):
+ """Save or load the trainer states, marked by step_num and local rank."""
+ state_path = os.path.join(ckpt_dir, '{}.states.{}'.format(
+ str(step_num).zfill(7), str(local_rank).zfill(2)))
+ logging.info('[step {}], {} trainer states to/from {}.'.format(
+ step_num, option, state_path))
+ if option == 'Saving':
+ trainer.save_states(state_path)
+ return state_path
+ elif option == 'Loading':
+ trainer.load_states(state_path)
+ return trainer
+ else:
+ raise NotImplementedError('Unknown Option: {}'.format(option))
+
+
+def train(args):
+ store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm(
+ args.comm_backend, args.gpus)
+ logging_config(args.output_dir,
+ name='pretrain_owt_' + str(rank), # avoid race
+ console=(local_rank == 0))
+ logging.info(args)
+ logging.debug('Random seed set to {}'.format(args.seed))
+ set_seed(args.seed)
+ logging.info('Training info: num_buckets: {}, '
+ 'num_workers: {}, rank: {}'.format(
+ args.num_buckets, num_workers, rank))
+ cfg, tokenizer, model = get_pretraining_model(args.model_name, ctx_l,
+ args.max_seq_length,
+ args.hidden_dropout_prob,
+ args.attention_dropout_prob,
+ args.generator_units_scale,
+ args.generator_layers_scale)
+ data_masker = ElectraMasker(
+ tokenizer, args.max_seq_length, args.mask_prob)
+ if args.from_raw_text:
+ if args.cached_file_path and not os.path.exists(args.cached_file_path):
+ os.mkdir(args.cached_file_path)
+ get_dataset_fn = functools.partial(get_pretrain_data_text,
+ max_seq_length=args.max_seq_length,
+ short_seq_prob=args.short_seq_prob,
+ tokenizer=tokenizer,
+ circle_length=args.circle_length,
+ repeat=args.repeat,
+ cached_file_path=args.cached_file_path)
+
+ logging.info('Processing and loading the training dataset from raw text.')
+
+ else:
+ logging.info('Loading the training dataset from local Numpy file.')
+ get_dataset_fn = get_pretrain_data_npz
+
+ data_train = get_dataset_fn(args.data, args.batch_size, shuffle=True,
+ num_buckets=args.num_buckets, vocab=tokenizer.vocab,
+ num_parts=num_workers, part_idx=rank,
+ num_dataset_workers=args.num_dataset_workers,
+ num_batch_workers=args.num_batch_workers)
+
+ logging.info('Creating distributed trainer...')
+ param_dict = model.collect_params()
+    # Do not apply weight decay to any LayerNorm parameters or biases
+ for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
+ v.wd_mult = 0.0
+ # Collect differentiable parameters
+ params = [p for p in param_dict.values() if p.grad_req != 'null']
+ # Set grad_req if gradient accumulation is required
+ num_accumulated = args.num_accumulated
+ if num_accumulated > 1:
+ logging.info('Using gradient accumulation. Effective global batch size = {}'
+ .format(num_accumulated * args.batch_size * len(ctx_l) * num_workers))
+ for p in params:
+ p.grad_req = 'add'
+ # backend specific implementation
+ if args.comm_backend == 'horovod':
+ # Horovod: fetch and broadcast parameters
+ hvd.broadcast_parameters(param_dict, root_rank=0)
+
+ num_train_steps = args.num_train_steps
+ if args.warmup_steps is not None:
+ warmup_steps = args.warmup_steps
+ else:
+ warmup_steps = int(num_train_steps * args.warmup_ratio)
+ assert warmup_steps is not None, 'Must specify either warmup_steps or warmup_ratio'
+ log_interval = args.log_interval
+ save_interval = args.save_interval if args.save_interval is not None\
+ else num_train_steps // 50
+ logging.info('#Total Training Steps={}, Warmup={}, Save Interval={}'
+ .format(num_train_steps, warmup_steps, save_interval))
+
+ lr_scheduler = PolyScheduler(max_update=num_train_steps,
+ base_lr=args.lr,
+ warmup_begin_lr=0,
+ pwr=1,
+ final_lr=0,
+ warmup_steps=warmup_steps,
+ warmup_mode='linear')
+ optimizer_params = {'learning_rate': args.lr,
+ 'wd': args.wd,
+ 'lr_scheduler': lr_scheduler,
+ }
+ if args.optimizer == 'adamw':
+ optimizer_params.update({'beta1': 0.9,
+ 'beta2': 0.999,
+ 'epsilon': 1e-6,
+ 'correct_bias': False,
+ })
+ if args.comm_backend == 'horovod':
+ trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optimizer_params)
+ else:
+ trainer = mx.gluon.Trainer(param_dict, args.optimizer, optimizer_params,
+ update_on_kvstore=False)
+ if args.start_step:
+ logging.info('Restart training from {}'.format(args.start_step))
+ # TODO(zheyuye), How about data splitting, where to start re-training
+ state_path = states_option(
+ args.start_step, trainer, args.output_dir, local_rank, 'Loading')
+ param_path = parameters_option(
+ args.start_step, model, args.output_dir, 'Loading')
+
+ # prepare the loss function
+ mlm_loss_fn = mx.gluon.loss.SoftmaxCELoss()
+ rtd_loss_fn = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()
+ mlm_loss_fn.hybridize()
+ rtd_loss_fn.hybridize()
+
+ # prepare the records writer
+ writer = None
+ # only one process on each worker will write the tensorboardX's records to avoid race
+ if args.do_eval and local_rank == 0:
+ from tensorboardX import SummaryWriter
+ record_path = os.path.join(args.output_dir, 'records')
+ logging.info('Evaluation records saved in {}'.format(record_path))
+ writer = SummaryWriter(record_path)
+
+ step_num = args.start_step
+ finish_flag = False
+
+ log_total_loss = 0
+ log_mlm_loss = 0
+ log_rtd_loss = 0
+ log_sample_num = 0
+ train_start_time = time.time()
+
+ # start training
+ train_loop_dataloader = grouper(repeat(data_train), len(ctx_l))
+ while step_num < num_train_steps:
+ tic = time.time()
+ for accum_idx in range(num_accumulated):
+ sample_l = next(train_loop_dataloader)
+ loss_l = []
+ mlm_loss_l = []
+ rtd_loss_l = []
+ for sample, ctx in zip(sample_l, ctx_l):
+ if sample is None:
+ continue
+ # prepare data
+ input_ids, segment_ids, valid_lengths = sample
+ input_ids = input_ids.as_in_ctx(ctx)
+ segment_ids = segment_ids.as_in_ctx(ctx)
+ valid_lengths = valid_lengths.as_in_ctx(ctx)
+ masked_input = data_masker.dynamic_masking(mx.nd, input_ids, valid_lengths)
+ masked_input_ids = masked_input.input_ids
+ length_masks = masked_input.masks
+ unmasked_tokens = masked_input.unmasked_tokens
+ masked_positions = masked_input.masked_positions
+ masked_weights = masked_input.masked_weights
+
+ log_sample_num += len(masked_input_ids)
+
+ with mx.autograd.record():
+ mlm_scores, rtd_scores, corrupted_tokens, labels = model(
+ masked_input_ids, segment_ids, valid_lengths, unmasked_tokens, masked_positions)
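+                    # normalize the losses by the number of contributing tokens and by the
+                    # number of accumulation steps and devices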
+ denominator = (masked_weights.sum() + 1e-6) * num_accumulated * len(ctx_l)
+ mlm_loss = mlm_loss_fn(
+ mx.npx.reshape(mlm_scores, (-5, -1)),
+ unmasked_tokens.reshape((-1,)),
+ masked_weights.reshape((-1, 1))).sum() / denominator
+ denominator = (length_masks.sum() + 1e-6) * num_accumulated * len(ctx_l)
+ rtd_loss = rtd_loss_fn(
+ rtd_scores, labels, length_masks).sum() / denominator
+ output = ElectraOutput(mlm_scores=mlm_scores,
+ rtd_scores=rtd_scores,
+ rtd_labels=labels,
+ corrupted_tokens=corrupted_tokens,
+ )
+ mlm_loss_l.append(mlm_loss)
+ rtd_loss_l.append(rtd_loss)
+ loss = (args.gen_weight * mlm_loss + args.disc_weight * rtd_loss)
+ loss_l.append(loss)
+
+ for loss in loss_l:
+ loss.backward()
+ # All Reduce the Step Loss
+ log_mlm_loss += sum([ele.as_in_ctx(ctx_l[0])
+ for ele in mlm_loss_l]).asnumpy()
+ log_rtd_loss += sum([ele.as_in_ctx(ctx_l[0])
+ for ele in rtd_loss_l]).asnumpy()
+ log_total_loss += sum([ele.as_in_ctx(ctx_l[0])
+ for ele in loss_l]).asnumpy()
+
+ # update
+ trainer.allreduce_grads()
+
+ total_norm, ratio, is_finite = clip_grad_global_norm(
+ params, args.max_grad_norm * num_workers)
+
+ if args.comm_backend == 'horovod':
+            # Note that horovod.trainer._scale defaults to num_workers,
+            # so trainer.update(1) will scale the gradients by 1./num_workers
+ trainer.update(1, ignore_stale_grad=True)
+ else:
+            # gluon.trainer._scale defaults to 1
+ trainer.update(num_workers, ignore_stale_grad=True)
+
+ total_norm = total_norm / num_workers
+ step_num += 1
+ if num_accumulated > 1:
+ # set grad to zero for gradient accumulation
+ model.zero_grad()
+
+ # saving
+ if step_num % save_interval == 0 or step_num >= num_train_steps:
+ if is_master_node:
+ states_option(
+ step_num, trainer, args.output_dir, local_rank, 'Saving')
+ if local_rank == 0:
+ param_path = parameters_option(
+ step_num, model, args.output_dir, 'Saving')
+
+ # logging
+ if step_num % log_interval == 0:
+            # Log the average loss over the last log_interval steps
+ log_mlm_loss /= log_interval
+ log_rtd_loss /= log_interval
+ log_total_loss /= log_interval
+ toc = time.time()
+ logging.info(
+ '[step {}], Loss mlm/rtd/total={:.4f}/{:.4f}/{:.4f},'
+ ' LR={:.6f}, grad_norm={:.4f}. Time cost={:.2f},'
+ ' Throughput={:.2f} samples/s, ETA={:.2f}h'.format(
+ step_num, log_mlm_loss, log_rtd_loss, log_total_loss,
+ trainer.learning_rate, total_norm, toc - tic, log_sample_num / (toc - tic),
+ (num_train_steps - step_num) / (step_num / (toc - train_start_time)) / 3600))
+ tic = time.time()
+
+ if args.do_eval:
+ evaluation(writer, step_num, masked_input, output)
+ if writer is not None:
+ writer.add_scalars('loss',
+ {'total_loss': log_total_loss,
+ 'mlm_loss': log_mlm_loss,
+ 'rtd_loss': log_rtd_loss},
+ step_num)
+ log_mlm_loss = 0
+ log_rtd_loss = 0
+ log_total_loss = 0
+ log_sample_num = 0
+
+
+ logging.info('Finish training step: %d', step_num)
+ if is_master_node:
+ state_path = states_option(step_num, trainer, args.output_dir, local_rank, 'Saving')
+ if local_rank == 0:
+ param_path = parameters_option(step_num, model, args.output_dir, 'Saving')
+
+ mx.npx.waitall()
+ train_end_time = time.time()
+ logging.info('Train cost={:.1f}s'.format(train_end_time - train_start_time))
+
+ if writer is not None:
+ writer.close()
+
+ if local_rank == 0:
+ model_name = args.model_name.replace('google', 'gluon')
+ save_dir = os.path.join(args.output_dir, model_name)
+ final_save(model, save_dir, tokenizer)
+
+
+# TODO(zheyuye), Directly implement a metric for weighted accuracy
+def accuracy(labels, predictions, weights=None):
+ if weights is None:
+ weights = mx.np.ones_like(labels)
+ is_correct = mx.np.equal(labels, predictions)
+ acc = (is_correct * weights).sum() / (weights.sum() + 1e-6)
+ return acc.asnumpy().item()
+
+# TODO(zheyuye), Directly implement a metric for weighted AUC
+def auc(labels, probs, weights=None):
+ if isinstance(labels, mx.np.ndarray):
+ labels = labels.asnumpy()
+ if isinstance(probs, mx.np.ndarray):
+ probs = probs.asnumpy()
+ if isinstance(weights, mx.np.ndarray):
+ weights = weights.asnumpy()
+ labels = labels.reshape(-1)
+ probs = probs.reshape(-1)
+ weights = weights.reshape(-1)
+
+ fpr, tpr, thresholds = metrics.roc_curve(labels, probs, sample_weight=weights)
+ return metrics.auc(fpr, tpr)
+
+
+def evaluation(writer, step_num, masked_input, eval_input):
+ length_masks = masked_input.masks
+ unmasked_tokens = masked_input.unmasked_tokens
+ masked_weights = masked_input.masked_weights
+ mlm_scores = eval_input.mlm_scores
+ rtd_scores = eval_input.rtd_scores
+ rtd_labels = eval_input.rtd_labels
+ corrupted_tokens = eval_input.corrupted_tokens
+
+ mlm_log_probs = mx.npx.log_softmax(mlm_scores)
+ mlm_preds = mx.np.argmax(mlm_log_probs, axis=-1).astype(np.int32)
+ rtd_probs = mx.npx.sigmoid(rtd_scores)
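+    # threshold the replaced-token-detection scores at zero, i.e. at probability 0.5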
+ rtd_preds = mx.np.round((mx.np.sign(rtd_scores) + 1) / 2).astype(np.int32)
+
+ mlm_accuracy = accuracy(unmasked_tokens, mlm_preds, masked_weights)
+ corrupted_mlm_accuracy = accuracy(unmasked_tokens, corrupted_tokens, masked_weights)
+ rtd_accuracy = accuracy(rtd_labels, rtd_preds, length_masks)
+ rtd_precision = accuracy(rtd_labels, rtd_preds, length_masks * rtd_preds)
+ rtd_recall = accuracy(rtd_labels, rtd_preds, rtd_labels * rtd_preds)
+ rtd_auc = auc(rtd_labels, rtd_probs, length_masks)
+ logging.info(
+ 'Eval [step {}], mlm_accuracy={:.4f}, corrupted_mlm_accuracy={:.4f},'
+ ' rtd_accuracy={:.4f}, rtd_precision={:.4f}, rtd_recall={:.4f},'
+ ' rtd_auc={:.4f}.'.format(step_num,
+ mlm_accuracy, corrupted_mlm_accuracy,
+ rtd_accuracy, rtd_precision, rtd_recall, rtd_auc))
+ if writer is not None:
+ writer.add_scalars('results',
+ {'mlm_accuracy': mlm_accuracy,
+ 'corrupted_mlm_accuracy': corrupted_mlm_accuracy,
+ 'rtd_accuracy': rtd_accuracy,
+ 'rtd_precision': rtd_precision,
+ 'rtd_recall': rtd_recall,
+ 'rtd_auc': rtd_auc},
+ step_num)
+
+
+if __name__ == '__main__':
+ os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
+ args = parse_args()
+ if args.do_train:
+ train(args)
diff --git a/scripts/question_answering/README.md b/scripts/question_answering/README.md
new file mode 100644
index 0000000000..c6b8bd790f
--- /dev/null
+++ b/scripts/question_answering/README.md
@@ -0,0 +1,177 @@
+# Question Answering Examples
+
+## SQuAD
+The finetuning scripts for the [Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/) are available,
+supporting a variety of pretrained models including [BERT](https://github.com/google-research/bert), [ALBERT](https://github.com/google-research/albert),
+and [ELECTRA](https://github.com/google-research/electra). Feel free to choose one of them as `model_name` from the list below.
+
+| BERT | ALBERT | ELECTRA |
+|:--------------------------------:|:------------------------:|:--------------------:|
+| google_en_cased_bert_base | google_albert_base_v2 | google_electra_small |
+| google_en_uncased_bert_base | google_albert_large_v2 | google_electra_base |
+| google_en_cased_bert_large       | google_albert_xlarge_v2  | google_electra_large |
+| google_en_uncased_bert_large | google_albert_xxlarge_v2 | |
+| google_zh_bert_base | | |
+| google_multi_cased_bert_base | | |
+| google_en_cased_bert_wwm_large | | |
+| google_en_uncased_bert_wwm_large | | |
+
+### Data and official evaluation scripts
+
+* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+* [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
+* [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
+* [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
+
+Download the files and move them under `$DATA_DIR`.
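+
+For example, assuming `DATA_DIR` points to the target directory, the files can be fetched with `wget`:
+
+```bash
+mkdir -p ${DATA_DIR}
+wget -P ${DATA_DIR} https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
+wget -P ${DATA_DIR} https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
+wget -P ${DATA_DIR} https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
+wget -P ${DATA_DIR} https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
+wget -O ${DATA_DIR}/evaluate-v2.0.py 'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/'
+```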
+
+### Running Script
+We provide the script to train on the SQuAD dataset.
+
+```bash
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_albert_base_v2
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 4 \
+ --num_accumulated 3 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 2e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
+```
+or evaluate SQuAD1.1 based on a SQuAD2.0 fine-tuned checkpoint as
+
+```bash
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir ${OUT_DIR} \
+ --param_checkpoint ${CKPT_PATH} \
+ --version 2.0 \
+ --do_eval \
+ --gpus 0,1,2,3 \
+ --eval_batch_size 16 \
+ --overwrite_cache \
+```
+
+We can speed up multi-GPU training via Horovod.
+Compared to KVStore, training the RoBERTa large model on SQuAD 2.0 for 3 epochs saves roughly 1/4 of the training time (8.48 vs. 11.32 hours). Results may vary depending on the training instances.
+
+```bash
+horovodrun -np 4 -H localhost:4 python3 run_squad.py \
+ --comm_backend horovod \
+ ...
+```
+For the ELECTRA models, we fine-tune with layer-wise learning rate decay as follows:
+
+```bash
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_electra_small
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 32 \
+ --num_accumulated 1 \
+ --gpus 0 \
+ --epochs 2 \
+ --lr 3e-4 \
+ --layerwise_decay 0.8 \
+ --warmup_ratio 0.1 \
+ --wd 0 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+```
+
+For RoBERTa and XLMR, we remove 'segment_ids' and replace `[CLS]` and `[SEP]` with
+`<s>` and `</s>`, which stand for the beginning and end of sentences respectively.
+
+```bash
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=fairseq_roberta_large
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 2 \
+ --num_accumulated 6 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 3e-5 \
+ --warmup_ratio 0.2 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+```
+
+### Results
+We reproduced the ALBERT models released by Google and fine-tuned them on SQuAD with single models. ALBERT version 2 models are pre-trained without the dropout mechanism but with extra training steps compared to version 1 (see the [original paper](https://arxiv.org/abs/1909.11942) for details).
+
+The listed models are fine-tuned with learning rate 2e-5, 3 epochs, warmup ratio 0.1 and max gradient norm 0.1 (as shown in the commands). Notice that `batch_size` is set per GPU and the global batch size is 48 for all experiments (e.g. `batch_size` 4 x `num_accumulated` 3 x 4 GPUs for ALBERT base); gradient accumulation (`num_accumulated`) can be increased in case of out-of-memory errors.
+
+Performance is shown in the table below, in which SQuAD 1.1 is evaluated with SQuAD 2.0 checkpoints.
+Notice that the standard SQuAD metrics are EM and F1. The former is an exact-match score between predictions and references, while the latter is a token-level F1 score in which common tokens are counted as true positives.
+
+|Reproduced ALBERT Models (F1/EM) | SQuAD 1.1 dev | SQuAD 2.0 dev | Json | Log | Command |
+|----------------------------------|---------------|---------------|------|-----| --------|
+|ALBERT base | 90.55/83.83 | 82.09/79.40 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_base_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_base_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_base.sh) |
+|ALBERT large | 92.66/86.43 | 84.98/82.19 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_large_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_large_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_large.sh) |
+|ALBERT xlarge | 93.85/87.71 | 87.92/85.04 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xlarge_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xlarge_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_xlarge.sh) |
+|ALBERT xxlarge | 95.00/89.01 | 89.91/86.87 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xxlarge_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xxlarge_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_xxlarge.sh) |
+
+For reference, we've included the results from Google's original experiments.
+
+| Model Name | SQuAD 1.1 dev | SQuAD 2.0 dev|
+|------------|---------------|--------------|
+|ALBERT base (googleresearch/albert) | 90.2/83.2 | 82.1/79.3 |
+|ALBERT large (googleresearch/albert) | 91.8/85.2 | 84.9/81.8 |
+|ALBERT xlarge (googleresearch/albert) | 92.9/86.4 | 87.9/84.1 |
+|ALBERT xxlarge (googleresearch/albert) | 94.6/89.1 | 89.8/86.9 |
+
+For the rest of the pretrained models, the results on SQuAD 1.1 and SQuAD 2.0 are given as follows.
+
+| Model Name | SQuAD1.1 dev | SQuAD2.0 dev | Json | Log | Command |
+|--------------------------|---------------|--------------|------|-----|--------|
+|BERT base | 88.40/81.24 | 76.43/73.59 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_base_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_base_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_uncased_bert_base.sh) |
+|BERT large | 90.45/83.55 | 81.41/78.46 | [json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_uncased_bert_large.sh) |
+|ELECTRA small | 85.42/78.95 | 73.93/71.36 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_small_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_small_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_small.sh) |
+|ELECTRA base              | 92.63/87.34   | 86.65/83.95  |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_base_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_base_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_base.sh) |
+|ELECTRA large             | 94.95/89.94   | 90.67/88.32  |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_large.sh) |
+|Mobile BERT | 82.45/88.99 | 79.60/74.11 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_uncased_mobilebert_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_uncased_mobilebert_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_mobilebert.sh) |
+|RoBERTa large | 94.58/88.86 | 89.69/86.80 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_fairseq_roberta_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_fairseq_roberta_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_large.sh) |
+
+For reference, we have also included the results of the original versions from Google and Fairseq.
+
+| Model Name | SQuAD1.1 dev | SQuAD2.0 dev |
+|--------------------------|----------------|---------------|
+|Google BERT base | 88.5/80.8 | - / - |
+|Google BERT large | 90.9/84.1 | - / - |
+|Google ELECTRA small | - /75.8 | - /70.1 |
+|Google ELECTRA base | - /86.8 | - /83.7 |
+|Google ELECTRA large | - /89.7 | - /88.1 |
+|Google Mobile BERT | 81.4/88.6 | 74.4/77.1 |
+|Fairseq RoBERTa large | 94.6/88.9 | 89.4/86.5 |
diff --git a/scripts/question_answering/__init__.py b/scripts/question_answering/__init__.py
deleted file mode 100644
index 4f3fef8cc4..0000000000
--- a/scripts/question_answering/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Question answering example."""
diff --git a/scripts/question_answering/commands/run_squad2_albert_base.sh b/scripts/question_answering/commands/run_squad2_albert_base.sh
new file mode 100644
index 0000000000..69bee438f8
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_albert_base.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_albert_base_v2
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 4 \
+ --num_accumulated 3 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 2e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/commands/run_squad2_albert_large.sh b/scripts/question_answering/commands/run_squad2_albert_large.sh
new file mode 100644
index 0000000000..f4c9d069c5
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_albert_large.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_albert_large_v2
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 3 \
+ --num_accumulated 4 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 2e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/commands/run_squad2_albert_xlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh
new file mode 100644
index 0000000000..d14994422d
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_albert_xlarge_v2
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 1 \
+ --num_accumulated 12 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 2e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh
new file mode 100644
index 0000000000..fdb6e89658
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_albert_xxlarge_v2
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 1 \
+ --num_accumulated 12 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 2e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/commands/run_squad2_electra_base.sh b/scripts/question_answering/commands/run_squad2_electra_base.sh
new file mode 100644
index 0000000000..a500a3ae50
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_electra_base.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_electra_base
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 8 \
+ --num_accumulated 1 \
+ --gpus 0,1,2,3 \
+ --epochs 2 \
+ --lr 1e-4 \
+ --layerwise_decay 0.8 \
+ --warmup_ratio 0.1 \
+ --wd 0 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
diff --git a/scripts/question_answering/commands/run_squad2_electra_large.sh b/scripts/question_answering/commands/run_squad2_electra_large.sh
new file mode 100644
index 0000000000..61872f110b
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_electra_large.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_electra_large
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 2 \
+ --num_accumulated 4 \
+ --gpus 0,1,2,3 \
+ --epochs 2 \
+ --lr 5e-5 \
+ --layerwise_decay 0.9 \
+ --warmup_ratio 0.1 \
+ --wd 0 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
diff --git a/scripts/question_answering/commands/run_squad2_electra_small.sh b/scripts/question_answering/commands/run_squad2_electra_small.sh
new file mode 100644
index 0000000000..e174258c17
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_electra_small.sh
@@ -0,0 +1,24 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_electra_small
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 32 \
+ --num_accumulated 1 \
+ --gpus 0 \
+ --epochs 2 \
+ --lr 3e-4 \
+ --layerwise_decay 0.8 \
+ --warmup_ratio 0.1 \
+ --wd 0 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
diff --git a/scripts/question_answering/commands/run_squad2_mobilebert.sh b/scripts/question_answering/commands/run_squad2_mobilebert.sh
new file mode 100644
index 0000000000..cfeee56356
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_mobilebert.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_uncased_mobilebert
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 8 \
+ --num_accumulated 1 \
+ --gpus 0,1,2,3 \
+ --epochs 5 \
+ --lr 4e-5 \
+ --warmup_steps 1400 \
+ --wd 0.0 \
+ --max_seq_length 384 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/commands/run_squad2_roberta_large.sh b/scripts/question_answering/commands/run_squad2_roberta_large.sh
new file mode 100644
index 0000000000..3cdf2cb6ea
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_roberta_large.sh
@@ -0,0 +1,23 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=fairseq_roberta_large
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 2 \
+ --num_accumulated 6 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 3e-5 \
+ --warmup_ratio 0.2 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh
new file mode 100644
index 0000000000..f087860014
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_en_uncased_bert_base
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 6 \
+ --num_accumulated 2 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 3e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh
new file mode 100644
index 0000000000..0e80da7688
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_en_uncased_bert_large
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 2 \
+ --num_accumulated 6 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 3e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/data_pipeline.py b/scripts/question_answering/data_pipeline.py
deleted file mode 100644
index bd42d05c2b..0000000000
--- a/scripts/question_answering/data_pipeline.py
+++ /dev/null
@@ -1,946 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=
-"""SQuAD data data preprocessing pipeline."""
-import collections
-import contextlib
-import itertools
-import json
-import multiprocessing as mp
-import os
-import re
-import time
-
-import nltk
-import numpy as np
-import tqdm
-from mxnet.gluon.data import Dataset
-
-import gluonnlp as nlp
-from gluonnlp import data, Vocab
-from gluonnlp.data import SQuAD
-
-
-class SQuADDataPipeline:
- """Main data processing pipeline class, which encapsulate all preprocessing logic. The class
- process the data in multiprocessing mode using Pool. It can save/load the result of processing,
- but since it happens in a single thread, it is usually faster to just process data from scratch.
- """
-
- def __init__(self, train_para_limit, train_ques_limit, dev_para_limit, dev_ques_limit,
- ans_limit, char_limit, emb_file_name, num_workers=None, save_load_data=False,
- data_root_path='./data'):
- """Method that creates a new instance. If an example is longer that provided limits it will
- be truncated for the dev set and filtered out for the training set.
-
- Parameters
- ----------
- train_para_limit : int
- Maximum characters of a paragraph for training dataset
- train_ques_limit : int
- Maximum characters of a question for training dataset
- dev_para_limit : int
- Maximum characters of a paragraph for dev dataset
- dev_ques_limit
- Maximum characters of a question for dev dataset
- ans_limit : int
- Maximum characters of an answer
- char_limit : int
- Maximum token (word) length of a paragraph, question or answer
- emb_file_name : str
- Glove embedding file name
- num_workers : int, default None
- Number of workers to use for multiprocessing. Default uses all available cores
- data_root_path : str
- Path to store the processed data or load existing processed data, if needed (depends on
- save_load_data flag)
- save_load_data : bool
- Shall save or load data from the ``data_root_path``
- """
- self._train_para_limit = train_para_limit
- self._train_ques_limit = train_ques_limit
- self._dev_para_limit = dev_para_limit
- self._dev_ques_limit = dev_ques_limit
- self._ans_limit = ans_limit
- self._char_limit = char_limit
- self._emb_file_name = emb_file_name
- self._is_cased_embedding = emb_file_name.startswith('glove.840')
- self._num_workers = num_workers
- self._save_load_data = save_load_data
- self._data_root_path = data_root_path
-
- self._processed_train_data_file_name = 'train_processed.json'
- self._processed_dev_data_file_name = 'dev_processed.json'
- self._word_vocab_file_name = 'word_vocab.bin'
- self._char_vocab_file_name = 'char_vocab.bin'
-
- def get_processed_data(self, use_spacy=True, shrink_word_vocab=True, squad_data_root=None):
- """Main method to start data processing
-
- Parameters
- ----------
- use_spacy : bool, default True
- Shall use Spacy as a tokenizer. If not, uses NLTK
- shrink_word_vocab : bool, default True
- When True, only tokens that have embeddings in the embedding file are remained in the
- word_vocab. Otherwise tokens with no embedding also stay
- squad_data_root : str, default None
- Data path to store downloaded original SQuAD data
- Returns
- -------
- train_json_data : dict
- Train JSON data of SQuAD dataset as is to run official evaluation script
- dev_json_data : dict
- Dev JSON data of SQuAD dataset as is to run official evaluation script
- train_examples : SQuADQADataset
- Processed examples to be used for training
- dev_examples : SQuADQADataset
- Processed examples to be used for evaluation
- word_vocab : Vocab
- Word vocabulary
- char_vocab : Vocab
- Char vocabulary
-
- """
- if self._save_load_data and self._has_processed_data():
- return self._load_processed_data()
-
- train_dataset = SQuAD(segment='train', root=squad_data_root) \
- if squad_data_root else SQuAD(segment='train')
- dev_dataset = SQuAD(segment='dev', root=squad_data_root) \
- if squad_data_root else SQuAD(segment='dev')
-
- with contextlib.closing(mp.Pool(processes=self._num_workers)) as pool:
- train_examples, dev_examples = SQuADDataPipeline._tokenize_data(train_dataset,
- dev_dataset,
- use_spacy, pool)
- word_vocab, char_vocab = SQuADDataPipeline._get_vocabs(train_examples, dev_examples,
- self._emb_file_name,
- self._is_cased_embedding,
- shrink_word_vocab,
- pool)
-
- filter_provider = SQuADDataFilter(self._train_para_limit,
- self._train_ques_limit,
- self._ans_limit)
- train_examples = list(filter(filter_provider.filter, train_examples))
-
- train_featurizer = SQuADDataFeaturizer(word_vocab,
- char_vocab,
- self._train_para_limit,
- self._train_ques_limit,
- self._char_limit,
- self._is_cased_embedding)
-
- dev_featuarizer = SQuADDataFeaturizer(word_vocab,
- char_vocab,
- self._dev_para_limit,
- self._dev_ques_limit,
- self._char_limit,
- self._is_cased_embedding)
-
- train_examples, dev_examples = SQuADDataPipeline._featurize_data(train_examples,
- dev_examples,
- train_featurizer,
- dev_featuarizer)
-
- if self._save_load_data:
- self._save_processed_data(train_examples, dev_examples, word_vocab, char_vocab)
-
- return train_dataset._read_data(), dev_dataset._read_data(), \
- SQuADQADataset(train_examples), SQuADQADataset(dev_examples), word_vocab, char_vocab
-
- @staticmethod
- def _tokenize_data(train_dataset, dev_dataset, use_spacy, pool):
- """Tokenize incoming paragpraphs and questions in incoming datsets using provided
- tokenizer withing the processes of the provided multiprocessing pool
-
- Parameters
- ----------
- train_dataset : SQuAD
- training dataset
- dev_dataset : SQuAD
- Dev dataset
- use_spacy : bool
- Use Spacy as a tokenizer. Otherwise uses NLTK
- pool : Pool
- Multiprocessing pool to use for the tokenization
-
- Returns
- -------
- train_examples : List[dict]
- List of tokenized training examples
- dev_examples : List[dict]
- List of tokenized dev examples
- """
- tokenizer = SQuADDataTokenizer(use_spacy)
-
- tic = time.time()
- print('Train examples [{}] transformation started.'.format(len(train_dataset)))
- train_examples = list(tqdm.tqdm(tokenizer.run_async(pool, train_dataset),
- total=len(train_dataset)))
- print('Train examples transformed [{}/{}] in {:.3f} sec'.format(len(train_examples),
- len(train_dataset),
- time.time() - tic))
- tic = time.time()
- print('Dev examples [{}] transformation started.'.format(len(dev_dataset)))
- dev_examples = list(tqdm.tqdm(tokenizer.run_async(pool, dev_dataset),
- total=len(dev_dataset)))
- print('Dev examples transformed [{}/{}] in {:.3f} sec'.format(len(dev_examples),
- len(dev_dataset),
- time.time() - tic))
- return train_examples, dev_examples
-
- @staticmethod
- def _featurize_data(train_examples, dev_examples, train_featurizer, dev_featuarizer):
- """Create features from incoming datasets by replacing tokens with indices.
-
- Parameters
- ----------
- train_examples : List[dict]
- Tokenized train examples
- dev_examples : List[dict]
- Tokenized dev examples
- train_featurizer : SQuADDataFeaturizer
- Parametrized featurizer for training examples
- dev_featuarizer : SQuADDataFeaturizer
- Parametrized featurizer for dev examples
-
- Returns
- -------
- train_ready : List[Tuple]
- Processed train examples. Each tuple consists of question_id, record_index,
- context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
- dev_ready : List[Tuple]
- Processed dev examples. Each tuple consists of question_id, record_index,
- context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
-
- """
- tic = time.time()
- print('Train examples [{}] featurization started.'.format(len(train_examples)))
- train_ready = [train_featurizer.build_features(example)
- for example in tqdm.tqdm(train_examples, total=len(train_examples))]
- print('Train examples featurized [{}] in {:.3f} sec'.format(len(train_examples),
- time.time() - tic))
- tic = time.time()
- print('Dev examples [{}] featurization started.'.format(len(dev_examples)))
- dev_ready = [dev_featuarizer.build_features(example)
- for example in tqdm.tqdm(dev_examples, total=len(dev_examples))]
- print('Dev examples featurized [{}] in {:.3f} sec'.format(len(dev_examples),
- time.time() - tic))
- return train_ready, dev_ready
-
- @staticmethod
- def _get_vocabs(train_examples, dev_examples, emb_file_name, is_cased_embedding,
- shrink_word_vocab, pool):
- """Create both word-level and character-level vocabularies. Vocabularies are built using
- data from both train and dev datasets.
-
- Parameters
- ----------
- train_examples : List[dict]
- Tokenized training examples
- dev_examples : List[dict]
- Tokenized dev examples
- emb_file_name : str
- Glove embedding file name
- is_cased_embedding : bool
- When True, provided embedding file is cased, uncased otherwise
- shrink_word_vocab : bool
- When True, only tokens that have embeddings in the embedding file are remained in the
- word_vocab. Otherwise tokens with no embedding also stay
- pool : Pool
- Multiprocessing pool to use
-
- Returns
- -------
- word_vocab : Vocab
- Word-level vocabulary
- char_vocab : Vocab
- Char-level vocabulary
- """
- tic = time.time()
- print('Word counters receiving started.')
-
- word_mapper = SQuADAsyncVocabMapper()
- word_reducer = SQuADAsyncVocabReducer()
- word_mapped = list(
- tqdm.tqdm(word_mapper.run_async(itertools.chain(train_examples, dev_examples), pool),
- total=len(train_examples) + len(dev_examples)))
- word_partitioned = tqdm.tqdm(SQuADDataPipeline._partition(itertools.chain(*word_mapped)),
- total=len(word_mapped))
- word_counts = list(tqdm.tqdm(word_reducer.run_async(word_partitioned, pool),
- total=len(word_partitioned)))
- print('Word counters received in {:.3f} sec'.format(time.time() - tic))
-
- tic = time.time()
- print('Char counters receiving started.')
- char_mapper = SQuADAsyncVocabMapper(iterate_over_example=True)
- char_reducer = SQuADAsyncVocabReducer()
- char_mapped = list(
- tqdm.tqdm(char_mapper.run_async(itertools.chain(train_examples, dev_examples), pool),
- total=len(train_examples) + len(dev_examples)))
- char_partitioned = SQuADDataPipeline._partition(itertools.chain(*char_mapped))
- char_counts = list(tqdm.tqdm(char_reducer.run_async(char_partitioned, pool),
- total=len(char_partitioned)))
- print('Char counters received in {:.3f} sec'.format(time.time() - tic))
-
- embedding = nlp.embedding.create('glove', source=emb_file_name)
-
- if is_cased_embedding:
- word_counts = itertools.chain(*[[(item[0], item[1]),
- (item[0].lower(), item[1]),
- (item[0].capitalize(), item[1]),
- (item[0].upper(), item[1])] for item in word_counts])
- else:
- word_counts = [(item[0].lower(), item[1]) for item in word_counts]
-
- word_vocab = Vocab({item[0]: item[1] for item in word_counts if
- not shrink_word_vocab or item[0] in embedding.token_to_idx},
- bos_token=None, eos_token=None)
- word_vocab.set_embedding(embedding)
- char_vocab = Vocab({item[0]: item[1] for item in char_counts},
- bos_token=None, eos_token=None)
-
- return word_vocab, char_vocab
-
- def _has_processed_data(self):
- """Check if the data was processed and stored already
-
- Returns
- -------
- ret: Boolean
- Is processed data already exists
- """
- return \
- os.path.exists(
- os.path.join(self._data_root_path, self._processed_train_data_file_name)) and \
- os.path.exists(
- os.path.join(self._data_root_path, self._processed_dev_data_file_name)) and \
- os.path.exists(
- os.path.join(self._data_root_path, self._word_vocab_file_name)) and \
- os.path.exists(
- os.path.join(self._data_root_path, self._char_vocab_file_name))
-
- def _load_processed_data(self):
- """ Load processed data from the disk
- Returns
- -------
- train_examples : List[Tuple]
- Processed train examples. Each tuple consists of question_id, record_index,
- context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
- dev_examples : List[Tuple]
- Processed dev examples. Each tuple consists of question_id, record_index,
- context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
- word_vocab : Vocab
- Word-level vocabulary
- char_vocab : Vocab
- Char-level vocabulary
- """
- with open(os.path.join(self._data_root_path, self._processed_train_data_file_name),
- 'r') as f:
- train_examples = json.load(f)
-
- with open(os.path.join(self._data_root_path, self._processed_dev_data_file_name), 'r') as f:
- dev_examples = json.load(f)
-
- with open(os.path.join(self._data_root_path, self._word_vocab_file_name), 'r') as f:
- word_vocab = Vocab.from_json(json.load(f))
-
- with open(os.path.join(self._data_root_path, self._char_vocab_file_name), 'r') as f:
- char_vocab = Vocab.from_json(json.load(f))
-
- return train_examples, dev_examples, word_vocab, char_vocab
-
- def _save_processed_data(self, train_examples, dev_examples, word_vocab, char_vocab):
- """Save processed data to disk
-
- Parameters
- ----------
- train_examples : List[Tuple]
- Processed train examples. Each tuple consists of question_id, record_index,
- context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
- dev_examples : List[Tuple]
- Processed dev examples. Each tuple consists of question_id, record_index,
- context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
- word_vocab : Vocab
- Word-level vocabulary
- char_vocab : Vocab
- Char-level vocabulary
- """
- with open(os.path.join(self._data_root_path, self._processed_train_data_file_name),
- 'w') as f:
- json.dump(train_examples, f)
-
- with open(os.path.join(self._data_root_path, self._processed_dev_data_file_name), 'w') as f:
- json.dump(dev_examples, f)
-
- with open(os.path.join(self._data_root_path, self._word_vocab_file_name), 'w') as f:
- f.write(word_vocab.to_json())
-
- with open(os.path.join(self._data_root_path, self._char_vocab_file_name), 'w') as f:
- f.write(char_vocab.to_json())
-
- @staticmethod
- def _partition(mapped_values):
- """Groups items with same keys into a single partition
-
- Parameters
- ----------
- mapped_values : List[Tuple]
- List of mapped (key, value) tuples
-
- Returns
- -------
- items: List[Tuple]
- List of partitions, where each partition is (key, List[value])
- """
- partitioned_data = collections.defaultdict(list)
-
- for key, value in mapped_values:
- partitioned_data[key].append(value)
-
- return partitioned_data.items()
-
-
-class SQuADDataTokenizer:
- """SQuAD data tokenizer, that encapsulate the splitting logic of each entry of SQuAD dataset"""
- try:
- _spacy_tokenizer = nlp.data.SpacyTokenizer()
- except (ImportError, AttributeError) as e:
- _spacy_error = e
- def _spacy_tokenizer(*args, **kwargs): # pylint: disable=no-method-argument
- raise SQuADDataTokenizer._spacy_error
-
- def __init__(self, use_spacy=True):
- """Init new SQuADDataTokenizer object
- Parameters
- ----------
- use_spacy : bool, default True
- Use Spacy as base tokenizer. Otherwise uses NLTK with some cleansing
- """
- self._use_spacy = use_spacy
-
- def run_async(self, pool, dataset):
- return pool.imap(self, dataset)
-
- def __call__(self, example):
- return self.tokenize_one_example(example)
-
- def tokenize_one_example(self, example):
- """Tokenize a single example
-
- Parameters
- ----------
- example : Tuple
- A tuple of SQuAD dataset in format (record_index, question_id, question, context,
- answer_list, answer_start)
-
- Returns
- -------
- ret : dict
- Tokenized example with the following keys: context_tokens, context_chars, ques_tokens,
- ques_chars, y1s, y2s, id, context, spans, record_idx
- """
- index, q_id, question, context, answer_list, answer_start = example
-
- context = context.replace('\'\'', '\" ').replace(r'``', '\" ')
- context_tokens = SQuADDataTokenizer._word_tokenize_spacy(context) if self._use_spacy else \
- SQuADDataTokenizer._word_tokenize_nltk(context)
- context_chars = [list(token) for token in context_tokens]
- spans = SQuADDataTokenizer._get_token_spans(context, context_tokens)
-
- ques = question.replace('\'\'', '\" ').replace('``', '\" ')
- ques_tokens = SQuADDataTokenizer._word_tokenize_spacy(ques) if self._use_spacy else \
- SQuADDataTokenizer._word_tokenize_nltk(ques)
- ques_chars = [list(token) for token in ques_tokens]
-
- y1s, y2s = [], []
- answer_texts = []
-
- for answer_text, answer_start in zip(answer_list, answer_start):
- answer_end = answer_start + len(answer_text)
- answer_texts.append(answer_text)
- answer_span = []
- for idx, span in enumerate(spans):
- if not (answer_end <= span[0] or answer_start >= span[1]):
- answer_span.append(idx)
- y1, y2 = answer_span[0], answer_span[-1]
- y1s.append(y1)
- y2s.append(y2)
-
- result = {'context_tokens': context_tokens, 'context_chars': context_chars,
- 'ques_tokens': ques_tokens, 'ques_chars': ques_chars, 'y1s': y1s,
- 'y2s': y2s, 'id': q_id, 'context': context, 'spans': spans, 'record_idx': index}
- return result
-
- @staticmethod
- def _word_tokenize_spacy(sent):
- """Default tokenization method that uses Spacy. Called only if not overridden by providing
- base_tokenizer to SQuADDataTokenizer.__init__
-
- Parameters
- ----------
- sent : str
- A text to tokenize
-
- Returns
- -------
- tokens : List[str]
- List of tokens
- """
- tokens = SQuADDataTokenizer._spacy_tokenizer(sent)
- return tokens
-
- @staticmethod
- def _word_tokenize_nltk(sent):
- """Tokenization method that uses NLTK.
-
- Parameters
- ----------
- sent : str
- A text to tokenize
-
- Returns
- -------
- tokens : List[str]
- List of tokens
- """
- tokens = []
- splitters = ('-', '\u2212', '\u2014', '\u2013', '/', '~', '"', '\'', '\u201C',
- '\u2019', '\u201D', '\u2018', '\u00B0')
-
- sample = sent.replace('\n', ' ').replace(u'\u000A', '').replace(u'\u00A0', '')
- temp_tokens = [token.replace('\'\'', '"').replace('``', '"') for token in
- nltk.word_tokenize(sample)]
-
- for token in temp_tokens:
- tokens.extend(re.split('([{}])'.format(''.join(splitters)), token))
-
- tokens = [token for token in tokens if len(token) > 0]
- return tokens
-
- @staticmethod
- def _get_token_spans(text, tokens):
- """Create a list of tuples that contains tokens character inidices. By using this output
- it is possible to find character-based indices of token start and end
-
- Parameters
- ----------
- text : str
- Original text
- tokens : List[str]
- List of tokens of the original text
-
- Returns
- -------
- ret: List[Tuple]
- List of tuple, where each tuple contains starting character index of the token in the
- text and end character index of the token in the text
- """
- current = 0
- spans = []
- for token in tokens:
- current = text.find(token, current)
- if current < 0:
- print('Token {} cannot be found'.format(token))
- raise Exception()
- spans.append((current, current + len(token)))
- current += len(token)
- return spans
-
-
-class SQuADDataFilter:
- """Filter an example based on the specified conditions"""
-
- def __init__(self, para_limit, ques_limit, ans_limit):
- """Init SQuADDataFilter object
-
- Parameters
- ----------
- para_limit : int
- Maximum allowed length of a paragraph
- ques_limit : int
- Maximum allowed length of a question
- ans_limit : int
- Maximum allowed length of an answer
- """
- self._para_limit = para_limit
- self._ques_limit = ques_limit
- self._ans_limit = ans_limit
-
- def filter(self, example):
- """Returns if the example should be filtered out or not
-
- Parameters
- ----------
- example : dict
- A dataset examples with context_tokens, ques_tokens, y1s and y2s keys
-
- Returns
- -------
- ret : Boolean
- True if an example should remain in the dataset, and False if it should be excluded from
- the dataset
- """
- return len(example['context_tokens']) <= self._para_limit and \
- len(example['ques_tokens']) <= self._ques_limit and \
- (example['y2s'][0] - example['y1s'][0]) <= self._ans_limit
-
-
-class SQuADAsyncVocabMapper:
- """A multiprocessing implementation of a Mapper for tokens counting"""
-
- def __init__(self, iterate_over_example=False):
- """Init MapReduce object
-
- Parameters
- ----------
- iterate_over_example : bool, default False
- Should use examples as is, or iterate over its content
- """
- self._iterate_over_example = iterate_over_example
-
- def run_async(self, examples, pool):
- """Run async processing over examples
-
- Parameters
- ----------
- examples : List[dict]
- List of dictionaries with context_tokens and ques_tokens keys
- pool : Pool
- Multiprocessing pool to use
-
- Returns
- -------
- ret : List[Tuple]
- List of tuples of tokens and counts: (str, int)
- """
- return pool.imap(self, examples)
-
- def __call__(self, example):
- """Maps examples into distinct tokens
-
- Parameters
- ----------
- example : dict
- Example to process with context_tokens and ques_tokens keys
-
- Returns
- -------
- mapped_values : List[Tuple]
- Result of mapping process. Each tuple of (token, count) format
- """
- para_counter = data.count_tokens(example['context_tokens'] if not self._iterate_over_example
- else [c for tkn in example['context_tokens'] for c in tkn])
- ques_counter = data.count_tokens(example['ques_tokens'] if not self._iterate_over_example
- else [c for tkn in example['ques_tokens'] for c in tkn])
- counter = para_counter + ques_counter
- return list(counter.items())
-
-
-class SQuADAsyncVocabReducer:
- """A multiprocessing implementation of a Reducing for tokens counting"""
-
- def run_async(self, items, pool):
- """Run async processing over examples
-
- Parameters
- ----------
- items : List[Tuple]
- List of tuples of (token, count) structure
- pool : Pool
- Multiprocessing pool to use
-
- Returns
- -------
- ret : List[Tuple]
- List of tuples of tokens and counts: (str, int)
- """
- return pool.imap(self, items)
-
- def __call__(self, item):
- """Sums up number of times a token was used
-
- Parameters
- ----------
- item : Tuple
- A tuple of (token, counts) format
-
- Returns
- -------
- ret : Tuple
- A tuple of (token, sum_of_counts)
-
- """
- token, counts = item
- return token, sum(counts)
-
-
-class SQuADDataFeaturizer:
- """Class that converts tokenized examples into featurized"""
-
- def __init__(self, word_vocab, char_vocab, para_limit, ques_limit, char_limit,
- is_cased_embedding):
- """Init SQuADDataFeaturizer object
-
- Parameters
- ----------
- word_vocab : Vocab
- Word-level vocabulary
- char_vocab : Vocab
- Char-level vocabulary
- para_limit : int
- Maximum characters in a paragraph
- ques_limit : int
- Maximum characters in a question
- char_limit : int
- Maximum characters in a token
- is_cased_embedding: bool
- Is underlying embedding is cased or uncased
- """
- self._para_limit = para_limit
- self._ques_limit = ques_limit
- self._char_limit = char_limit
-
- self._word_vocab = word_vocab
- self._char_vocab = char_vocab
-
- self._is_cased_embedding = is_cased_embedding
-
- def _get_words_emb(self, words):
- """Get embedding for the words
-
- Parameters
- ----------
- words : list[str]
- Words to embed
-
- Returns
- -------
- ret : np.array
- Array of embeddings for words
- """
-
- if not self._is_cased_embedding:
- return self._word_vocab[[word.lower() for word in words]]
-
- result = np.full([len(words)], fill_value=0, dtype=np.float32)
- word_emb_matrix = np.full([len(words), 4], fill_value=0, dtype=np.float32)
-
- for i, w in enumerate(words):
- word_emb_matrix[i, :] = self._word_vocab[[w, w.lower(), w.capitalize(), w.upper()]]
-
- mask = word_emb_matrix != 0
- first_non_zero_embeddings_indices = np.where(mask.any(axis=1), mask.argmax(axis=1), -1)
-
- for i, index in enumerate(first_non_zero_embeddings_indices):
- result[i] = word_emb_matrix[i, index]
-
- return result
-
- def build_features(self, example):
- """Generate features for a given example
-
- Parameters
- ----------
- example : dict
- A tokenized example of a dataset
-
- Returns
- -------
- ret : Tuple
- An example with tokens replaced with indices of the following format: question_id,
- record_index, context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
- """
- context_idxs = np.full([self._para_limit],
- fill_value=self._word_vocab[self._word_vocab.padding_token],
- dtype=np.float32)
-
- ctx_chars_idxs = np.full([self._para_limit, self._char_limit],
- fill_value=self._char_vocab[self._char_vocab.padding_token],
- dtype=np.float32)
-
- ques_idxs = np.full([self._ques_limit],
- fill_value=self._word_vocab[self._word_vocab.padding_token],
- dtype=np.float32)
-
- ques_char_idxs = np.full([self._ques_limit, self._char_limit],
- fill_value=self._char_vocab[self._char_vocab.padding_token],
- dtype=np.float32)
-
- context_len = min(len(example['context_tokens']), self._para_limit)
- context_idxs[:context_len] = self._get_words_emb(example['context_tokens'][:context_len])
-
- ques_len = min(len(example['ques_tokens']), self._ques_limit)
- ques_idxs[:ques_len] = self._get_words_emb(example['ques_tokens'][:ques_len])
-
- for i in range(0, context_len):
- char_len = min(len(example['context_chars'][i]), self._char_limit)
- ctx_chars_idxs[i, :char_len] = self._char_vocab[example['context_chars'][i][:char_len]]
-
- for i in range(0, ques_len):
- char_len = min(len(example['ques_chars'][i]), self._char_limit)
- ques_char_idxs[i, :char_len] = self._char_vocab[example['ques_tokens'][i][:char_len]]
-
- start, end = example['y1s'][-1], example['y2s'][-1]
-
- record = (example['id'],
- example['record_idx'],
- context_idxs,
- ques_idxs,
- ctx_chars_idxs,
- ques_char_idxs,
- start,
- end,
- example['context'],
- example['spans'])
-
- return record
-
-
-class SQuADQADataset(Dataset):
- """Dataset that wraps the featurized examples with standard Gluon API Dataset format. It allows
- to fetch a record by question id for the evaluation"""
-
- def __init__(self, records):
- super(SQuADQADataset, self).__init__()
- self._data = records
- self._record_idx_to_record = {}
-
- for record in records:
- self._record_idx_to_record[record[1]] = {'q_id': record[0], 'rec': record}
-
- def __getitem__(self, idx):
- """Get example by index in the original list
-
- Parameters
- ----------
- idx : int
-
- Returns
- -------
- ret : Tuple of question_id, record_index, context_tokens_indices, question_tokens_indices,
- context_chars_indices, question_char_indices, start_token_index_of_the_answer,
- end_token_index_of_the_answer, context, context_tokens_spans
- """
- return self._data[idx]
-
- def __len__(self):
- """Get the number of the examples in the dataset
-
- Returns
- -------
- ret : int
- Number of examples of the dataset
- """
- return len(self._data)
-
- def get_q_id_by_rec_idx(self, rec_idx):
- """Returns a question id associated with provided record index from original SQuAD dataset
-
- Parameters
- ----------
- rec_idx : int
- Record index in SQuAD dataset
-
- Returns
- -------
- question_id : str
- """
- return self._record_idx_to_record[rec_idx]['q_id']
-
- def get_record_by_idx(self, rec_idx):
- """Returns a record associated with provided record index from original SQuAD dataset
-
- Parameters
- ----------
- rec_idx : int
-
- Returns
- -------
- ret : Tuple of question_id, record_index, context_tokens_indices, question_tokens_indices,
- context_chars_indices, question_char_indices, start_token_index_of_the_answer,
- end_token_index_of_the_answer, context, context_tokens_spans
- """
- return self._record_idx_to_record[rec_idx]['rec']
-
-
-class SQuADDataLoaderTransformer:
- """Thin wrapper on SQuADQADataset that removed non-numeric values from the record. The output of
- that transformer can be provided to a DataLoader"""
-
- def __call__(self, q_id, record_idx, ctx_idxs, ques_idxs, ctx_chars_idxs, ques_char_idxs,
- start, end, context, spans):
- """Return the same record with non-numeric values removed from the output
-
- Parameters
- ----------
- q_id : str
- Question Id
- record_idx : int
- Record index
- ctx_idxs : NDArray
- Indices of context tokens
- ques_idxs : NDArray
- Indices of question tokens
- ctx_chars_idxs : NDArray
- Indices of context characters
- ques_char_idxs : NDArray
- Indices of question characters
- start : int
- Start of the answer
- end : int
- End of the answer
- context : str
- Original context string
- spans : List[Tuple]
- List of character indices of each token of the context.
-
- Returns
- -------
- record_idx : int
- Record index
- ctx_idxs : NDArray
- Indices of context tokens
- ques_idxs : NDArray
- Indices of question tokens
- ctx_chars_idxs : NDArray
- Indices of context characters
- ques_char_idxs : NDArray
- Indices of question characters
- start : int
- Start of the answer
- end : int
- End of the answer
- """
- return record_idx, ctx_idxs, ques_idxs, ctx_chars_idxs, ques_char_idxs, start, end
diff --git a/scripts/question_answering/eval_utils.py b/scripts/question_answering/eval_utils.py
new file mode 100644
index 0000000000..e28aecb7af
--- /dev/null
+++ b/scripts/question_answering/eval_utils.py
@@ -0,0 +1,267 @@
+"""Modification version of official evaluation script for SQuAD version 2.0.
+(https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
+
+"""
+import collections
+import json
+import copy
+import re
+import string
+
+
+def make_qid_to_has_ans(dataset):
+ qid_to_has_ans = {}
+ for article in dataset:
+ for p in article['paragraphs']:
+ for qa in p['qas']:
+ qid_to_has_ans[qa['id']] = bool(qa['answers'])
+ return qid_to_has_ans
+
+
+def normalize_answer(s):
+ """Lower text and remove punctuation, articles and extra whitespace."""
+
+ def remove_articles(text):
+ regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
+ return re.sub(regex, ' ', text)
+
+ def white_space_fix(text):
+ return ' '.join(text.split())
+
+ def remove_punc(text):
+ exclude = set(string.punctuation)
+ return ''.join(ch for ch in text if ch not in exclude)
+
+ def lower(text):
+ return text.lower()
+
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def get_tokens(s):
+ if not s:
+ return []
+ return normalize_answer(s).split()
+
+
+def compute_exact(a_gold, a_pred):
+ return int(normalize_answer(a_gold) == normalize_answer(a_pred))
+
+
+def compute_f1(a_gold, a_pred):
+ """
+    Compute the token-level F1 score, in which the common tokens are treated
+    as true positives. Precision and recall are the fractions of common tokens
+    relative to the prediction and the ground truth, respectively.
+ """
+ gold_toks = get_tokens(a_gold)
+ pred_toks = get_tokens(a_pred)
+ common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
+ num_same = sum(common.values())
+ if len(gold_toks) == 0 or len(pred_toks) == 0:
+ # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+ return int(gold_toks == pred_toks)
+ if num_same == 0:
+ return 0
+ precision = 1.0 * num_same / len(pred_toks)
+ recall = 1.0 * num_same / len(gold_toks)
+ f1 = (2 * precision * recall) / (precision + recall)
+ return f1
+
+
+def get_raw_scores(dataset, preds):
+ exact_scores = {}
+ f1_scores = {}
+ for article in dataset:
+ for p in article['paragraphs']:
+ for qa in p['qas']:
+ qid = qa['id']
+ gold_answers = [a['text'] for a in qa['answers']
+ if normalize_answer(a['text'])]
+ if not gold_answers:
+                    # For unanswerable questions, the only correct answer is the empty string
+ gold_answers = ['']
+ if qid not in preds:
+ print('Missing prediction for %s' % qid)
+ continue
+ a_pred = preds[qid]
+ # Take max over all gold answers
+ exact_scores[qid] = max(compute_exact(a, a_pred)
+ for a in gold_answers)
+ f1_scores[qid] = max(compute_f1(a, a_pred)
+ for a in gold_answers)
+ return exact_scores, f1_scores
+
+
+def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
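+    # Illustrative behaviour: with na_prob_thresh=0.5, a question whose na_prob is 0.8 is
+    # treated as predicted-unanswerable, so its score becomes 1.0 if the question really
+    # has no answer (qid_to_has_ans[qid] is False) and 0.0 otherwise.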
+ new_scores = {}
+ for qid, s in scores.items():
+        # Treat questions whose no-answer probability exceeds the threshold as unanswerable
+ pred_na = na_probs[qid] > na_prob_thresh
+ if pred_na:
+            # The raw score is replaced by 1 if the question is indeed
+            # unanswerable, else by 0
+ new_scores[qid] = float(not qid_to_has_ans[qid])
+ else:
+ new_scores[qid] = s
+ return new_scores
+
+
+def make_eval_dict(exact_scores, f1_scores, qid_list=None):
+ if not qid_list:
+ total = len(exact_scores)
+ return collections.OrderedDict([
+ ('exact', 100.0 * sum(exact_scores.values()) / total),
+ ('f1', 100.0 * sum(f1_scores.values()) / total),
+ ('total', total),
+ ])
+ else:
+ total = len(qid_list)
+ return collections.OrderedDict([
+ ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
+ ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
+ ('total', total),
+ ])
+
+
+def merge_eval(main_eval, new_eval, prefix):
+ for k in new_eval:
+ main_eval['%s_%s' % (prefix, k)] = new_eval[k]
+
+
+def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
+ """
+ Find the best threshold of the raw scores.
+
+    The initial score is set to the number of unanswerable questions,
+    assuming that each unanswerable question is correctly predicted.
+    The questions are then traversed in ascending order of their no-answer
+    probability; the running score is updated by the difference ('diff') from
+    that assumption, and the threshold with the best running score is kept.
+ """
+ num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+ cur_score = num_no_ans
+ best_score = cur_score
+ best_thresh = 0.0
+    # Sort question ids by ascending na_prob, so that the questions most likely
+    # to be answerable are read first.
+ qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+ for i, qid in enumerate(qid_list):
+ if qid not in scores:
+ continue
+ if qid_to_has_ans[qid]:
+ # For the answerable question
+ diff = scores[qid]
+ else:
+ # For the unanswerable question
+ if preds[qid]:
+ # Falsely predict the answerability
+ diff = -1
+ else:
+                # Correctly predict the answerability. This is only true if the
+                # prediction is blank, which is not the case before revision
+ diff = 0
+ cur_score += diff
+ if cur_score > best_score:
+ # adjust the best thresh over current thresh (na_probs[qid])
+ best_score = cur_score
+ best_thresh = na_probs[qid]
+ return 100.0 * best_score / len(scores), best_thresh
+
+
+def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+ best_exact, exact_thresh = find_best_thresh(
+ preds, exact_raw, na_probs, qid_to_has_ans)
+ best_f1, f1_thresh = find_best_thresh(
+ preds, f1_raw, na_probs, qid_to_has_ans)
+ main_eval['best_exact'] = best_exact
+ main_eval['best_exact_thresh'] = exact_thresh
+ main_eval['best_f1'] = best_f1
+ main_eval['best_f1_thresh'] = f1_thresh
+
+
+def revise_unanswerable(preds, na_probs, na_prob_thresh):
+ """
+    Revise the prediction results, returning a null string for every unanswerable
+    question whose unanswerable probability is above the threshold.
+
+ Parameters
+ ----------
+ preds: dict
+ A dictionary of full prediction of spans
+ na_probs: dict
+ A dictionary of unanswerable probabilities
+ na_prob_thresh: float
+ threshold of the unanswerable probability
+
+ Returns
+ -------
+ revised: dict
+ A dictionary of revised prediction
+ """
+ revised = copy.deepcopy(preds)
+ for q_id in na_probs.keys():
+ if na_probs[q_id] > na_prob_thresh:
+ revised[q_id] = ""
+ return revised
+
+
+def squad_eval(data_file, preds, na_probs, na_prob_thresh=0.0, revise=False):
+ """
+
+ Parameters
+ ----------
+ data_file
+ dataset(list) or data_file(str)
+ preds
+ predictions dictionary
+ na_probs
+ probabilities dictionary of unanswerable
+ na_prob_thresh
+ threshold of unanswerable
+ revise
+ Wether to get the final predictions with impossible answers replaced
+ with null string ''
+ Returns
+ -------
+ out_eval
+ A dictionary of output results
+ (preds_out)
+ A dictionary of final predictions
+ """
+ if isinstance(data_file, str):
+ with open(data_file) as f:
+ dataset_json = json.load(f)
+ dataset = dataset_json['data']
+ elif isinstance(data_file, list):
+ dataset = data_file
+ if na_probs is None:
+ na_probs = {k: 0.0 for k in preds}
+ # not necessary to revise results of SQuAD 1.1
+ revise = False
+ qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False
+ has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
+ no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
+ exact_raw, f1_raw = get_raw_scores(dataset, preds)
+ exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
+ na_prob_thresh)
+ f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
+ na_prob_thresh)
+ out_eval = make_eval_dict(exact_thresh, f1_thresh)
+ if has_ans_qids:
+ has_ans_eval = make_eval_dict(
+ exact_thresh, f1_thresh, qid_list=has_ans_qids)
+ merge_eval(out_eval, has_ans_eval, 'HasAns')
+ if no_ans_qids:
+ no_ans_eval = make_eval_dict(
+ exact_thresh, f1_thresh, qid_list=no_ans_qids)
+ merge_eval(out_eval, no_ans_eval, 'NoAns')
+ find_all_best_thresh(out_eval, preds, exact_raw,
+ f1_raw, na_probs, qid_to_has_ans)
+
+ if revise:
+ thresh = (out_eval['best_exact_thresh'] +
+ out_eval['best_f1_thresh']) * 0.5
+ preds_out = revise_unanswerable(preds, na_probs, thresh)
+ return out_eval, preds_out
+ else:
+ return out_eval, preds
diff --git a/scripts/question_answering/models.py b/scripts/question_answering/models.py
new file mode 100644
index 0000000000..58b156cbf3
--- /dev/null
+++ b/scripts/question_answering/models.py
@@ -0,0 +1,360 @@
+import mxnet as mx
+from mxnet.gluon import nn, HybridBlock
+from mxnet.util import use_np
+from gluonnlp.layers import get_activation
+from gluonnlp.op import select_vectors_by_position
+from gluonnlp.attention_cell import masked_logsoftmax, masked_softmax
+
+
+@use_np
+class ModelForQABasic(HybridBlock):
+ """The basic pretrained model for QA. It is used in the original BERT paper for SQuAD 1.1.
+
+ Here, we directly use the backbone network to extract the contextual embeddings and use
+ another dense layer to map the contextual embeddings to the start scores and end scores.
+
+    use_segmentation is used to mark whether we segment the input sentence. In RoBERTa and XLMR,
+    this flag is set to False, and the QA model then no longer accepts `token_types` as valid input.
+
+ - use_segmentation=True:
+ tokens : Question Context
+        token_types: 0 0 0 1 1
+
+ - use_segmentation=False:
+ tokens : Question Context
+        token_types: None
+ """
+ def __init__(self, backbone, weight_initializer=None, bias_initializer=None,
+ use_segmentation=True):
+ super().__init__()
+
+ self.backbone = backbone
+ self.use_segmentation = use_segmentation
+ self.qa_outputs = nn.Dense(units=2, flatten=False,
+ weight_initializer=weight_initializer,
+ bias_initializer=bias_initializer)
+
+ def hybrid_forward(self, F, tokens, token_types, valid_length, p_mask):
+ """
+
+ Parameters
+ ----------
+ F
+ tokens
+ Shape (batch_size, seq_length)
+ The merged input tokens
+ token_types
+ Shape (batch_size, seq_length)
+ Token types for the sequences, used to indicate whether the word belongs to the
+ first sentence or the second one.
+ valid_length
+ Shape (batch_size,)
+ Valid length of the sequence. This is used to mask the padded tokens.
+ p_mask
+ The mask that is associated with the tokens.
+
+ Returns
+ -------
+ start_logits
+ Shape (batch_size, sequence_length)
+ The log-softmax scores that the position is the start position.
+ end_logits
+ Shape (batch_size, sequence_length)
+ The log-softmax scores that the position is the end position.
+ """
+ # Get contextual embedding with the shape (batch_size, sequence_length, C)
+ if self.use_segmentation:
+ contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+ else:
+ contextual_embeddings = self.backbone(tokens, valid_length)
+ scores = self.qa_outputs(contextual_embeddings)
+ start_scores = scores[:, :, 0]
+ end_scores = scores[:, :, 1]
+ start_logits = masked_logsoftmax(F, start_scores, mask=p_mask, axis=-1)
+ end_logits = masked_logsoftmax(F, end_scores, mask=p_mask, axis=-1)
+ return start_logits, end_logits
+
+ def inference(self, tokens, token_types, valid_length, p_mask,
+ start_top_n: int = 5, end_top_n: int = 5):
+ """Get the inference result with beam search
+
+ Parameters
+ ----------
+ tokens
+ The input tokens. Shape (batch_size, sequence_length)
+ token_types
+ The input token types. Shape (batch_size, sequence_length)
+ valid_length
+ The valid length of the tokens. Shape (batch_size,)
+ p_mask
+ The mask which indicates that some tokens won't be used in the calculation.
+ Shape (batch_size, sequence_length)
+ start_top_n
+ The number of candidates to select for the start position.
+ end_top_n
+ The number of candidates to select for the end position.
+
+ Returns
+ -------
+ start_top_logits
+ The top start logits
+ Shape (batch_size, start_top_n)
+ start_top_index
+ Index of the top start logits
+ Shape (batch_size, start_top_n)
+ end_top_logits
+ The top end logits.
+ Shape (batch_size, end_top_n)
+ end_top_index
+ Index of the top end logits
+ Shape (batch_size, end_top_n)
+ """
+ # Shape (batch_size, sequence_length, C)
+ if self.use_segmentation:
+ contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+ else:
+ contextual_embeddings = self.backbone(tokens, valid_length)
+ scores = self.qa_outputs(contextual_embeddings)
+ start_scores = scores[:, :, 0]
+ end_scores = scores[:, :, 1]
+ start_logits = masked_logsoftmax(mx.nd, start_scores, mask=p_mask, axis=-1)
+ end_logits = masked_logsoftmax(mx.nd, end_scores, mask=p_mask, axis=-1)
+ # The shape of start_top_index will be (..., start_top_n)
+ start_top_logits, start_top_index = mx.npx.topk(start_logits, k=start_top_n, axis=-1,
+ ret_typ='both')
+        # end_top_logits and end_top_index similarly have shape (batch_size, end_top_n);
+        # in this basic model the end positions are predicted independently of the start positions.
+ end_top_logits, end_top_index = mx.npx.topk(end_logits, k=end_top_n, axis=-1,
+ ret_typ='both')
+ return start_top_logits, start_top_index, end_top_logits, end_top_index
+
+
+@use_np
+class ModelForQAConditionalV1(HybridBlock):
+ """Here, we use three networks to predict the start scores, end scores and answerable scores.
+
+ We formulate p(start, end, answerable | contextual_embedding) as the product of the
+ following three terms:
+
+ - p(start | contextual_embedding)
+ - p(end | start, contextual_embedding)
+ - p(answerable | contextual_embedding)
+
+    In the inference phase, we can use beam search to decode the most likely answer span.
+
+    use_segmentation is used to mark whether we segment the input sentence. In RoBERTa and XLMR,
+    this flag is set to False, and the QA model then no longer accepts `token_types` as valid input.
+
+ - use_segmentation=True:
+ tokens : Question Context
+        token_types: 0 0 0 1 1
+
+ - use_segmentation=False:
+ tokens : Question Context
+        token_types: None
+ """
+ def __init__(self, backbone, units=768, layer_norm_eps=1E-12, dropout_prob=0.1,
+ activation='tanh', weight_initializer=None, bias_initializer=None,
+ use_segmentation=True):
+ super().__init__()
+ self.backbone = backbone
+ self.use_segmentation = use_segmentation
+ self.start_scores = nn.Dense(1, flatten=False,
+ weight_initializer=weight_initializer,
+ bias_initializer=bias_initializer)
+ self.end_scores = nn.HybridSequential()
+ self.end_scores.add(nn.Dense(units, flatten=False,
+ weight_initializer=weight_initializer,
+ bias_initializer=bias_initializer))
+ self.end_scores.add(get_activation(activation))
+ self.end_scores.add(nn.LayerNorm(epsilon=layer_norm_eps))
+ self.end_scores.add(nn.Dense(1, flatten=False,
+ weight_initializer=weight_initializer,
+ bias_initializer=bias_initializer))
+ self.answerable_scores = nn.HybridSequential()
+ self.answerable_scores.add(nn.Dense(units, flatten=False,
+ weight_initializer=weight_initializer,
+ bias_initializer=bias_initializer))
+ self.answerable_scores.add(get_activation(activation))
+ self.answerable_scores.add(nn.Dropout(dropout_prob))
+ self.answerable_scores.add(nn.Dense(2, flatten=False,
+ weight_initializer=weight_initializer,
+ bias_initializer=bias_initializer))
+
+ def get_start_logits(self, F, contextual_embedding, p_mask):
+ """
+
+ Parameters
+ ----------
+ F
+ contextual_embedding
+ Shape (batch_size, sequence_length, C)
+
+ Returns
+ -------
+ start_logits
+ Shape (batch_size, sequence_length)
+ """
+ start_scores = F.np.squeeze(self.start_scores(contextual_embedding), -1)
+ start_logits = masked_logsoftmax(F, start_scores, mask=p_mask, axis=-1)
+ return start_logits
+
+ def get_end_logits(self, F, contextual_embedding, start_positions, p_mask):
+ """
+
+ Parameters
+ ----------
+ F
+ contextual_embedding
+ Shape (batch_size, sequence_length, C)
+ start_positions
+ Shape (batch_size, N)
+ We process multiple candidates simultaneously
+ p_mask
+ Shape (batch_size, sequence_length)
+
+ Returns
+ -------
+ end_logits
+ Shape (batch_size, N, sequence_length)
+ """
+ # Select the features at the start_positions
+ # start_feature will have shape (batch_size, N, C)
+ start_features = select_vectors_by_position(F, contextual_embedding, start_positions)
+ # Concatenate the start_feature and the contextual_embedding
+ contextual_embedding = F.np.expand_dims(contextual_embedding, axis=1) # (B, 1, T, C)
+ start_features = F.np.expand_dims(start_features, axis=2) # (B, N, 1, C)
+ concat_features = F.np.concatenate([F.npx.broadcast_like(start_features,
+ contextual_embedding, 2, 2),
+ F.npx.broadcast_like(contextual_embedding,
+ start_features, 1, 1)],
+ axis=-1) # (B, N, T, 2C)
+ end_scores = self.end_scores(concat_features)
+ end_scores = F.np.squeeze(end_scores, -1)
+ end_logits = masked_logsoftmax(F, end_scores, mask=F.np.expand_dims(p_mask, axis=1),
+ axis=-1)
+ return end_logits
+
+ def get_answerable_logits(self, F, contextual_embedding, p_mask):
+ """Get the answerable logits.
+
+ Parameters
+ ----------
+ F
+ contextual_embedding
+ Shape (batch_size, sequence_length, C)
+ p_mask
+ Shape (batch_size, sequence_length)
+ Mask the sequence.
+ 0 --> Denote that the element is masked,
+ 1 --> Denote that the element is not masked
+
+ Returns
+ -------
+ answerable_logits
+ Shape (batch_size, 2)
+ """
+ # Shape (batch_size, sequence_length)
+ start_scores = F.np.squeeze(self.start_scores(contextual_embedding), -1)
+ start_score_weights = masked_softmax(F, start_scores, p_mask, axis=-1)
+ start_agg_feature = F.npx.batch_dot(F.np.expand_dims(start_score_weights, axis=1),
+ contextual_embedding)
+ start_agg_feature = F.np.squeeze(start_agg_feature, 1)
+ cls_feature = contextual_embedding[:, 0, :]
+ answerable_scores = self.answerable_scores(F.np.concatenate([start_agg_feature,
+ cls_feature], axis=-1))
+ answerable_logits = F.npx.log_softmax(answerable_scores, axis=-1)
+ return answerable_logits
+
+ def hybrid_forward(self, F, tokens, token_types, valid_length, p_mask, start_position):
+ """
+
+ Parameters
+ ----------
+ F
+ tokens
+ Shape (batch_size, sequence_length)
+ token_types
+ Shape (batch_size, sequence_length)
+ valid_length
+ Shape (batch_size,)
+ p_mask
+ Shape (batch_size, sequence_length)
+ start_position
+ Shape (batch_size,)
+
+ Returns
+ -------
+ start_logits
+ Shape (batch_size, sequence_length)
+ end_logits
+ Shape (batch_size, sequence_length)
+ answerable_logits
+ """
+ if self.use_segmentation:
+ contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+ else:
+ contextual_embeddings = self.backbone(tokens, valid_length)
+ start_logits = self.get_start_logits(F, contextual_embeddings, p_mask)
+ end_logits = self.get_end_logits(F, contextual_embeddings,
+ F.np.expand_dims(start_position, axis=1),
+ p_mask)
+ end_logits = F.np.squeeze(end_logits, axis=1)
+ answerable_logits = self.get_answerable_logits(F, contextual_embeddings, p_mask)
+ return start_logits, end_logits, answerable_logits
+
+ def inference(self, tokens, token_types, valid_length, p_mask,
+ start_top_n: int = 5, end_top_n: int = 5):
+ """Get the inference result with beam search
+
+ Parameters
+ ----------
+ tokens
+ The input tokens. Shape (batch_size, sequence_length)
+ token_types
+ The input token types. Shape (batch_size, sequence_length)
+ valid_length
+ The valid length of the tokens. Shape (batch_size,)
+ p_mask
+ The mask which indicates that some tokens won't be used in the calculation.
+ Shape (batch_size, sequence_length)
+ start_top_n
+ The number of candidates to select for the start position.
+ end_top_n
+ The number of candidates to select for the end position.
+
+ Returns
+ -------
+ start_top_logits
+ The top start logits
+ Shape (batch_size, start_top_n)
+ start_top_index
+ Index of the top start logits
+ Shape (batch_size, start_top_n)
+ end_top_logits
+ The top end logits.
+ Shape (batch_size, start_top_n, end_top_n)
+ end_top_index
+ Index of the top end logits
+ Shape (batch_size, start_top_n, end_top_n)
+ answerable_logits
+ The answerable logits. Here 0 --> answerable and 1 --> not answerable.
+            Shape (batch_size, 2)
+ """
+ # Shape (batch_size, sequence_length, C)
+ if self.use_segmentation:
+ contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+ else:
+ contextual_embeddings = self.backbone(tokens, valid_length)
+ start_logits = self.get_start_logits(mx.nd, contextual_embeddings, p_mask)
+ # The shape of start_top_index will be (..., start_top_n)
+ start_top_logits, start_top_index = mx.npx.topk(start_logits, k=start_top_n, axis=-1,
+ ret_typ='both')
+ end_logits = self.get_end_logits(mx.nd, contextual_embeddings, start_top_index, p_mask)
+ # Note that end_top_index and end_top_log_probs have shape (bsz, start_n_top, end_n_top)
+ # So that for each start position, there are end_n_top end positions on the third dim.
+ end_top_logits, end_top_index = mx.npx.topk(end_logits, k=end_top_n, axis=-1,
+ ret_typ='both')
+ answerable_logits = self.get_answerable_logits(mx.nd, contextual_embeddings, p_mask)
+ return start_top_logits, start_top_index, end_top_logits, end_top_index, \
+ answerable_logits
diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py
new file mode 100644
index 0000000000..e4eaf3f629
--- /dev/null
+++ b/scripts/question_answering/run_squad.py
@@ -0,0 +1,963 @@
+"""
+Question Answering with Pretrained Language Model
+"""
+# pylint:disable=redefined-outer-name,logging-format-interpolation
+
+import os
+import json
+import time
+import logging
+import argparse
+import functools
+import collections
+from multiprocessing import Pool, cpu_count
+
+import mxnet as mx
+import numpy as np
+from mxnet.lr_scheduler import PolyScheduler
+
+import gluonnlp.data.batchify as bf
+from models import ModelForQABasic, ModelForQAConditionalV1
+from eval_utils import squad_eval
+from squad_utils import SquadFeature, get_squad_examples, convert_squad_example_to_feature
+from gluonnlp.models import get_backbone
+from gluonnlp.utils.misc import repeat, grouper, set_seed, init_comm, \
+ logging_config, count_parameters, parse_ctx
+from gluonnlp.initializer import TruncNorm
+from gluonnlp.data.sampler import SplitSampler
+from gluonnlp.utils.parameter import grad_global_norm, clip_grad_global_norm
+
+try:
+ import horovod.mxnet as hvd
+except ImportError:
+ pass
+
+mx.npx.set_np()
+
+CACHE_PATH = os.path.realpath(os.path.join(os.path.realpath(__file__), '..', 'cached'))
+if not os.path.exists(CACHE_PATH):
+ os.makedirs(CACHE_PATH, exist_ok=True)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Question Answering example. '
+ 'We fine-tune the pretrained model on SQuAD dataset.')
+ parser.add_argument('--model_name', type=str, default='google_albert_base_v2',
+ help='Name of the pretrained model.')
+ parser.add_argument('--do_train', action='store_true',
+ help='Whether to train the model')
+ parser.add_argument('--do_eval', action='store_true',
+ help='Whether to evaluate the model')
+ parser.add_argument('--data_dir', type=str, default='squad')
+ parser.add_argument('--version', default='2.0', choices=['1.1', '2.0'],
+ help='Version of the SQuAD dataset.')
+ parser.add_argument('--output_dir', type=str, default='squad_out',
+ help='The output directory where the model params will be written.'
+ ' default is squad_out')
+ # Communication
+ parser.add_argument('--comm_backend', type=str, default='device',
+ choices=['horovod', 'dist_sync_device', 'device'],
+ help='Communication backend.')
+ parser.add_argument('--gpus', type=str, default='0',
+ help='list of gpus to run, e.g. 0 or 0,2,5. -1 means using cpu.')
+ # Training hyperparameters
+ parser.add_argument('--seed', type=int, default=100, help='Random seed')
+ parser.add_argument('--log_interval', type=int, default=50,
+ help='The logging interval for training')
+ parser.add_argument('--eval_log_interval', type=int, default=10,
+ help='The logging interval for evaluation')
+ parser.add_argument('--save_interval', type=int, default=None,
+                        help='The number of steps between saving model parameters. '
+                        'Default is every epoch.')
+ parser.add_argument('--epochs', type=float, default=3.0,
+ help='Number of epochs, default is 3')
+ parser.add_argument('--num_train_steps', type=int, default=None,
+ help='The number of training steps. Note that epochs will be ignored '
+ 'if training steps are set')
+ parser.add_argument('--batch_size', type=int, default=8,
+                        help='Batch size. Number of examples per gpu in a minibatch. default is 8')
+ parser.add_argument('--eval_batch_size', type=int, default=16,
+ help='Evaluate batch size. Number of examples per gpu in a minibatch for '
+ 'evaluation.')
+ parser.add_argument('--max_grad_norm', type=float, default=1.0,
+ help='Max gradient norm.')
+ parser.add_argument('--optimizer', type=str, default='adamw',
+ help='optimization algorithm. default is adamw')
+ parser.add_argument('--adam_epsilon', type=float, default=1e-6,
+ help='epsilon of AdamW optimizer')
+ parser.add_argument('--adam_betas', default='(0.9, 0.999)', metavar='B',
+ help='betas for Adam optimizer')
+ parser.add_argument('--num_accumulated', type=int, default=1,
+ help='The number of batches for gradients accumulation to '
+ 'simulate large batch size.')
+ parser.add_argument('--lr', type=float, default=2e-5,
+ help='Initial learning rate. default is 2e-5')
+ parser.add_argument('--warmup_ratio', type=float, default=0.1,
+ help='Ratio of warmup steps in the learning rate scheduler.')
+ parser.add_argument('--warmup_steps', type=int, default=None,
+ help='warmup steps. Note that either warmup_steps or warmup_ratio is set.')
+ parser.add_argument('--wd', type=float, default=0.01, help='weight decay')
+ parser.add_argument('--layerwise_decay', type=float, default=-1, help='Layer-wise lr decay')
+ parser.add_argument('--untunable_depth', type=float, default=-1,
+ help='Depth of untunable parameters')
+ parser.add_argument('--classifier_dropout', type=float, default=0.1,
+ help='dropout of classifier')
+ # Data pre/post processing
+ parser.add_argument('--max_seq_length', type=int, default=512,
+                        help='The maximum total input sequence length after tokenization. '
+ 'Sequences longer than this will be truncated, and sequences shorter '
+ 'than this will be padded. default is 512')
+ parser.add_argument('--doc_stride', type=int, default=128,
+ help='When splitting up a long document into chunks, how much stride to '
+ 'take between chunks. default is 128')
+ parser.add_argument('--max_query_length', type=int, default=64,
+ help='The maximum number of tokens for the query. Questions longer than '
+ 'this will be truncated to this length. default is 64')
+ parser.add_argument('--round_to', type=int, default=None,
+                        help='The length of padded sequences will be rounded up to be a multiple'
+                        ' of this argument. When round_to is set to 8, training throughput '
+ 'may increase for mixed precision training on GPUs with TensorCores.')
+ parser.add_argument('--overwrite_cache', action='store_true',
+ help='Whether to overwrite the feature cache.')
+ # Evaluation hyperparameters
+ parser.add_argument('--start_top_n', type=int, default=5,
+ help='Number of start-position candidates')
+ parser.add_argument('--end_top_n', type=int, default=5,
+ help='Number of end-position candidates corresponding '
+ 'to a start position')
+ parser.add_argument('--n_best_size', type=int, default=20, help='Top N results written to file')
+ parser.add_argument('--max_answer_length', type=int, default=30,
+ help='The maximum length of an answer that can be generated. This is '
+ 'needed because the start and end predictions are not conditioned '
+ 'on one another. default is 30')
+ parser.add_argument('--param_checkpoint', type=str, default=None,
+ help='The parameter checkpoint for evaluating the model')
+ parser.add_argument('--backbone_path', type=str, default=None,
+ help='The parameter checkpoint of backbone model')
+ parser.add_argument('--all_evaluate', action='store_true',
+ help='Whether to evaluate all intermediate checkpoints '
+ 'instead of only last one')
+ parser.add_argument('--max_saved_ckpt', type=int, default=5,
+ help='The maximum number of saved checkpoints')
+ parser.add_argument('--eval_dtype', type=str, default='float32',
+ help='Data type used for evaluation. Either float32 or float16')
+ args = parser.parse_args()
+ return args
+
+
+class SquadDatasetProcessor:
+
+ def __init__(self, tokenizer, doc_stride, max_seq_length, max_query_length):
+ """
+
+ Parameters
+ ----------
+ tokenizer
+ The tokenizer
+ doc_stride
+ The stride to chunk the document
+ max_seq_length
+ Maximum length of the merged data
+ max_query_length
+ Maximum query length
+ """
+ self._tokenizer = tokenizer
+ self._doc_stride = doc_stride
+ self._max_seq_length = max_seq_length
+ self._max_query_length = max_query_length
+
+ vocab = tokenizer.vocab
+ self.pad_id = vocab.pad_id
+        # For RoBERTa-style models, use the BOS token as [CLS] and the EOS token as [SEP]
+ self.cls_id = vocab.bos_id if 'cls_token' not in vocab.special_token_keys else vocab.cls_id
+ self.sep_id = vocab.eos_id if 'sep_token' not in vocab.special_token_keys else vocab.sep_id
+
+ # TODO(sxjscience) Consider to combine the NamedTuple and batchify functionality.
+ self.ChunkFeature = collections.namedtuple('ChunkFeature',
+ ['qas_id',
+ 'data',
+ 'valid_length',
+ 'segment_ids',
+ 'masks',
+ 'is_impossible',
+ 'gt_start',
+ 'gt_end',
+ 'context_offset',
+ 'chunk_start',
+ 'chunk_length'])
+ self.BatchifyFunction = bf.NamedTuple(self.ChunkFeature,
+ {'qas_id': bf.List(),
+ 'data': bf.Pad(val=self.pad_id),
+ 'valid_length': bf.Stack(),
+ 'segment_ids': bf.Pad(),
+ 'masks': bf.Pad(val=1),
+ 'is_impossible': bf.Stack(),
+ 'gt_start': bf.Stack(),
+ 'gt_end': bf.Stack(),
+ 'context_offset': bf.Stack(),
+ 'chunk_start': bf.Stack(),
+ 'chunk_length': bf.Stack()})
+
+ def process_sample(self, feature: SquadFeature):
+ """Process the data to the following format.
+
+ Note that we mask all the special tokens except the CLS token. The reason for not masking
+ the CLS token is that if the question is not answerable, we will set the start and end to
+ be 0.
+
+
+ Merged: Question Context
+ Segment IDs: 0 0 0 1 1
+ Mask: 0 1 1 0 1
+
+        In the masks produced here, 1 means that the position is masked, i.e., it can never hold
+        the answer. The training/evaluation loops flip this via ``1 - masks`` because the
+        network itself uses 1 --> not masked, 0 --> masked.
+
+ Parameters
+ ----------
+ feature
+ Tokenized SQuAD feature
+
+ Returns
+ -------
+ ret
+ Divide the feature into multiple chunks and extract the feature which contains
+ the following:
+ - data
+ The data that concatenates the query and the context + special tokens
+ - valid_length
+ The valid_length of the data
+ - segment_ids
+ We assign the query part as segment 0 and the context part as segment 1.
+            - masks
+                We mask the query tokens and all the special tokens except [CLS].
+                1 --> masked, 0 --> not masked.
+ - is_impossible
+ Whether the provided context is impossible to answer or not.
+ - gt_start
+ The ground-truth start location of the span
+ - gt_end
+ The ground-truth end location of the span
+ - chunk_start
+ The start of the chunk
+ - chunk_length
+ The length of the chunk
+ """
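+        # Each chunk is laid out as [CLS] + truncated query + [SEP] + context chunk + [SEP],
+        # so the context part starts at offset len(truncated_query_ids) + 2.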
+ ret = []
+ truncated_query_ids = feature.query_token_ids[:self._max_query_length]
+ chunks = feature.get_chunks(
+ doc_stride=self._doc_stride,
+ max_chunk_length=self._max_seq_length - len(truncated_query_ids) - 3)
+ for chunk in chunks:
+ data = np.array([self.cls_id] + truncated_query_ids + [self.sep_id] +
+ feature.context_token_ids[chunk.start:(chunk.start + chunk.length)] +
+ [self.sep_id], dtype=np.int32)
+ valid_length = len(data)
+ segment_ids = np.array([0] + [0] * len(truncated_query_ids) +
+ [0] + [1] * chunk.length + [1], dtype=np.int32)
+ masks = np.array([0] + [1] * len(truncated_query_ids) + [1] + [0] * chunk.length + [1],
+ dtype=np.int32)
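+            # masks == 1 marks positions that can never be the answer (the query tokens and
+            # both [SEP] tokens); [CLS] and the context tokens are left as 0.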
+ context_offset = len(truncated_query_ids) + 2
+ if chunk.gt_start_pos is None and chunk.gt_end_pos is None:
+ start_pos = 0
+ end_pos = 0
+ else:
+ # Here, we increase the start and end because we put query before context
+ start_pos = chunk.gt_start_pos + context_offset
+ end_pos = chunk.gt_end_pos + context_offset
+ chunk_feature = self.ChunkFeature(qas_id=feature.qas_id,
+ data=data,
+ valid_length=valid_length,
+ segment_ids=segment_ids,
+ masks=masks,
+ is_impossible=chunk.is_impossible,
+ gt_start=start_pos,
+ gt_end=end_pos,
+ context_offset=context_offset,
+ chunk_start=chunk.start,
+ chunk_length=chunk.length)
+ ret.append(chunk_feature)
+ return ret
+
+ def get_train(self, features, skip_unreliable=True):
+ """Get the training dataset
+
+ Parameters
+ ----------
+ features
+ skip_unreliable
+ Whether to skip the unreliable spans in the training set
+
+ Returns
+ -------
+ train_dataset
+ num_token_answer_mismatch
+ num_unreliable
+ """
+ train_dataset = []
+ num_token_answer_mismatch = 0
+ num_unreliable = 0
+ for feature in features:
+ if feature.token_answer_mismatch:
+ num_token_answer_mismatch += 1
+ if feature.unreliable_span:
+ num_unreliable += 1
+ if skip_unreliable and feature.unreliable_span:
+ # Skip when not reliable
+ continue
+ # Process the feature
+ chunk_features = self.process_sample(feature)
+ train_dataset.extend(chunk_features)
+ return train_dataset, num_token_answer_mismatch, num_unreliable
+
+
+def get_squad_features(args, tokenizer, segment):
+ """
+    Get the processed data features of SquadExample objects
+
+ Parameters
+ ----------
+ args : argparse.Namespace
+ tokenizer:
+ Tokenizer instance
+ segment: str
+ train or dev
+
+ Returns
+ -------
+ data_features
+ The list of processed data features
+ """
+ data_cache_path = os.path.join(CACHE_PATH,
+ '{}_{}_squad_{}.ndjson'.format(
+ segment, args.model_name, args.version))
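+    # Tokenized features are cached as newline-delimited JSON so that later runs with the same
+    # model and SQuAD version can skip the multiprocess tokenization, unless --overwrite_cache is set.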
+ is_training = (segment == 'train')
+ if os.path.exists(data_cache_path) and not args.overwrite_cache:
+ data_features = []
+ with open(data_cache_path, 'r') as f:
+ for line in f:
+ data_features.append(SquadFeature.from_json(line))
+        logging.info('Found cached data features, loaded from {}'.format(data_cache_path))
+ else:
+ data_examples = get_squad_examples(args.data_dir, segment=segment, version=args.version)
+ start = time.time()
+ num_process = min(cpu_count(), 8)
+ logging.info('Tokenize Data:')
+ with Pool(num_process) as pool:
+ data_features = pool.map(functools.partial(convert_squad_example_to_feature,
+ tokenizer=tokenizer,
+ is_training=is_training), data_examples)
+ logging.info('Done! Time spent:{:.2f} seconds'.format(time.time() - start))
+ with open(data_cache_path, 'w') as f:
+ for feature in data_features:
+ f.write(feature.to_json() + '\n')
+
+ return data_features
+
+
+def get_network(model_name,
+ ctx_l,
+ dropout=0.1,
+ checkpoint_path=None,
+ backbone_path=None,
+ dtype='float32'):
+ """
+ Get the network that fine-tune the Question Answering Task
+
+ Parameters
+ ----------
+ model_name : str
+ The model name of the backbone model
+ ctx_l :
+ Context list of training device like [mx.gpu(0), mx.gpu(1)]
+ dropout : float
+ Dropout probability of the task specified layer
+    checkpoint_path: str
+        Path to a fine-tuned checkpoint
+    backbone_path: str
+        Path to the backbone model to be loaded in qa_net
+    dtype: str
+        Data type used to load the backbone parameters
+
+ Returns
+ -------
+ cfg
+ tokenizer
+ qa_net
+ use_segmentation
+ """
+ # Create the network
+ use_segmentation = 'roberta' not in model_name and 'xlmr' not in model_name
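+    # RoBERTa/XLM-R style models do not use token-type (segment) embeddings,
+    # so segment ids are skipped for them.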
+ Model, cfg, tokenizer, download_params_path, _ = \
+ get_backbone(model_name, load_backbone=not backbone_path)
+ backbone = Model.from_cfg(cfg, use_pooler=False, dtype=dtype)
+ # Load local backbone parameters if backbone_path provided.
+ # Otherwise, download backbone parameters from gluon zoo.
+
+ backbone_params_path = backbone_path if backbone_path else download_params_path
+ if checkpoint_path is None:
+ backbone.load_parameters(backbone_params_path, ignore_extra=True,
+ ctx=ctx_l, cast_dtype=True)
+ num_params, num_fixed_params = count_parameters(backbone.collect_params())
+ logging.info(
+            'Loading Backbone Model from {}, with total/fixed parameters={}/{}'.format(
+ backbone_params_path, num_params, num_fixed_params))
+ qa_net = ModelForQAConditionalV1(backbone=backbone,
+ dropout_prob=dropout,
+ use_segmentation=use_segmentation,
+ weight_initializer=TruncNorm(stdev=0.02))
+ if checkpoint_path is None:
+        # Ignore the UserWarning raised during initialization;
+        # there is no need to re-initialize the parameters of the backbone.
+ qa_net.initialize(ctx=ctx_l)
+ else:
+ qa_net.load_parameters(checkpoint_path, ctx=ctx_l, cast_dtype=True)
+ qa_net.hybridize()
+
+ return cfg, tokenizer, qa_net, use_segmentation
+
+
+def train(args):
+ store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm(
+ args.comm_backend, args.gpus)
+ cfg, tokenizer, qa_net, use_segmentation = \
+ get_network(args.model_name, ctx_l,
+ args.classifier_dropout,
+ args.param_checkpoint,
+ args.backbone_path)
+
+ logging.info('Prepare training data')
+ train_features = get_squad_features(args, tokenizer, segment='train')
+ dataset_processor = SquadDatasetProcessor(tokenizer=tokenizer,
+ doc_stride=args.doc_stride,
+ max_seq_length=args.max_seq_length,
+ max_query_length=args.max_query_length)
+ logging.info('Processing the Training data:')
+ train_dataset, num_answer_mismatch, num_unreliable \
+ = dataset_processor.get_train(train_features, skip_unreliable=True)
+ logging.info('Done! #Unreliable Span={} / #Mismatched Answer={} / #Total={}'
+ .format(num_unreliable, num_answer_mismatch, len(train_features)))
+
+ # Get dataset statistics
+ num_impossible = 0
+ for sample in train_dataset:
+ num_impossible += sample.is_impossible
+ logging.info('Before Chunking, #Train/Is Impossible = {}/{}'
+ .format(len(train_features),
+ sum([ele.is_impossible for ele in train_features])))
+ logging.info('After Chunking, #Train Sample/Is Impossible = {}/{}'
+ .format(len(train_dataset), num_impossible))
+ sampler = SplitSampler(len(train_dataset), num_parts=num_workers,
+ part_index=rank, even_size=True)
+ train_dataloader = mx.gluon.data.DataLoader(
+ train_dataset,
+ batchify_fn=dataset_processor.BatchifyFunction,
+ batch_size=args.batch_size,
+ num_workers=0,
+ sampler=sampler)
+ if 'electra' in args.model_name:
+        # Freeze parameters. Note this does not work for the ALBERT model,
+        # since parameters in all layers are shared.
+ if args.untunable_depth > 0:
+ qa_net.backbone.frozen_params(args.untunable_depth)
+ if args.layerwise_decay > 0:
+ qa_net.backbone.apply_layerwise_decay(args.layerwise_decay)
+
+ logging.info('Creating distributed trainer...')
+ # Collect differentiable parameters
+ param_dict = qa_net.collect_params()
+ # Do not apply weight decay to all the LayerNorm and bias
+ for _, v in qa_net.collect_params('.*beta|.*gamma|.*bias').items():
+ v.wd_mult = 0.0
+ params = [p for p in param_dict.values() if p.grad_req != 'null']
+ # Set grad_req if gradient accumulation is required
+ if args.num_accumulated > 1:
+ logging.info('Using gradient accumulation. Effective global batch size = {}'
+ .format(args.num_accumulated * args.batch_size * len(ctx_l) * num_workers))
+ for p in params:
+ p.grad_req = 'add'
+ # backend specific implementation
+ if args.comm_backend == 'horovod':
+ # Horovod: fetch and broadcast parameters
+ hvd.broadcast_parameters(param_dict, root_rank=0)
+
+ epoch_size = (len(train_dataloader) + len(ctx_l) - 1) // len(ctx_l)
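+    # The dataloader yields per-device batches, so dividing by the number of devices
+    # (rounding up) gives the number of batch groups per epoch.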
+ if args.num_train_steps is not None:
+ num_train_steps = args.num_train_steps
+ else:
+ num_train_steps = int(args.epochs * epoch_size / args.num_accumulated)
+ if args.warmup_steps is not None:
+ warmup_steps = args.warmup_steps
+ else:
+ warmup_steps = int(num_train_steps * args.warmup_ratio)
+ assert warmup_steps is not None, 'Must specify either warmup_steps or warmup_ratio'
+ log_interval = args.log_interval
+ save_interval = args.save_interval if args.save_interval is not None\
+ else epoch_size // args.num_accumulated
+ logging.info('#Total Training Steps={}, Warmup={}, Save Interval={}'
+ .format(num_train_steps, warmup_steps, save_interval))
+
+ # set up optimization
+ lr_scheduler = PolyScheduler(max_update=num_train_steps,
+ base_lr=args.lr,
+ warmup_begin_lr=0,
+ pwr=1,
+ final_lr=0,
+ warmup_steps=warmup_steps,
+ warmup_mode='linear')
+ optimizer_params = {'learning_rate': args.lr,
+ 'wd': args.wd,
+ 'lr_scheduler': lr_scheduler,
+ }
+ adam_betas = eval(args.adam_betas)
+ if args.optimizer == 'adamw':
+ optimizer_params.update({'beta1': adam_betas[0],
+ 'beta2': adam_betas[1],
+ 'epsilon': args.adam_epsilon,
+ 'correct_bias': False,
+ })
+ elif args.optimizer == 'adam':
+ optimizer_params.update({'beta1': adam_betas[0],
+ 'beta2': adam_betas[1],
+ 'epsilon': args.adam_epsilon,
+ })
+ if args.comm_backend == 'horovod':
+ trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optimizer_params)
+ else:
+ trainer = mx.gluon.Trainer(param_dict, args.optimizer, optimizer_params,
+ update_on_kvstore=False)
+
+ num_samples_per_update = 0
+ loss_denom = float(len(ctx_l) * args.num_accumulated)
+
+ log_span_loss = 0
+ log_answerable_loss = 0
+ log_total_loss = 0
+ log_sample_num = 0
+ if args.num_accumulated != 1:
+ # set grad to zero for gradient accumulation
+ qa_net.zero_grad()
+ global_tic = time.time()
+ tic = time.time()
+ for step_num, batch_data in enumerate(
+ grouper(repeat(train_dataloader), len(ctx_l) * args.num_accumulated)):
+ for sample_l in grouper(batch_data, len(ctx_l)):
+ loss_l = []
+ span_loss_l = []
+ answerable_loss_l = []
+ for sample, ctx in zip(sample_l, ctx_l):
+ if sample is None:
+ continue
+ # Copy the data to device
+ tokens = sample.data.as_in_ctx(ctx)
+ log_sample_num += len(tokens)
+ num_samples_per_update += len(tokens)
+ segment_ids = sample.segment_ids.as_in_ctx(ctx) if use_segmentation else None
+ valid_length = sample.valid_length.as_in_ctx(ctx)
+ p_mask = sample.masks.as_in_ctx(ctx)
+ gt_start = sample.gt_start.as_in_ctx(ctx).astype(np.int32)
+ gt_end = sample.gt_end.as_in_ctx(ctx).astype(np.int32)
+ is_impossible = sample.is_impossible.as_in_ctx(ctx).astype(np.int32)
+ batch_idx = mx.np.arange(tokens.shape[0], dtype=np.int32, ctx=ctx)
+ p_mask = 1 - p_mask # In the network, we use 1 --> no_mask, 0 --> mask
+ with mx.autograd.record():
+ start_logits, end_logits, answerable_logits \
+ = qa_net(tokens, segment_ids, valid_length, p_mask, gt_start)
+ sel_start_logits = start_logits[batch_idx, gt_start]
+ sel_end_logits = end_logits[batch_idx, gt_end]
+ sel_answerable_logits = answerable_logits[batch_idx, is_impossible]
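+                    # Negative log-likelihood of the gold start/end positions and of the
+                    # answerability label; the start/end terms are averaged (0.5 each) and the
+                    # answerability term is down-weighted by the same 0.5 factor.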
+ span_loss = - 0.5 * (sel_start_logits + sel_end_logits).sum()
+ answerable_loss = -0.5 * sel_answerable_logits.sum()
+ loss = (span_loss + answerable_loss) / loss_denom
+ loss_l.append(loss)
+ span_loss_l.append(span_loss)
+ answerable_loss_l.append(answerable_loss)
+
+ for loss in loss_l:
+ loss.backward()
+ # All Reduce the Step Loss
+ log_span_loss += sum([ele.as_in_ctx(ctx_l[0]) for ele in span_loss_l]).asnumpy()
+ log_total_loss += sum([ele.as_in_ctx(ctx_l[0])
+ for ele in loss_l]).asnumpy() * loss_denom
+ log_answerable_loss += sum([ele.as_in_ctx(ctx_l[0])
+ for ele in answerable_loss_l]).asnumpy()
+ # update
+ trainer.allreduce_grads()
+
+ if args.max_grad_norm > 0:
+                # Here, the accumulated gradients are
+                #   \sum_{n=1}^N g_n / loss_denom
+                # Thus, in order to clip the average gradient
+                #   \frac{1}{N} \sum_{n=1}^N g_n --> clip to args.max_grad_norm
+                # we need to change the threshold to
+                #   \sum_{n=1}^N g_n / loss_denom --> clip to args.max_grad_norm * N / loss_denom
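+                # For example, with N = num_samples_per_update = 32 and loss_denom = 8,
+                # the effective clipping threshold becomes args.max_grad_norm * 32 / 8.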
+ total_norm, ratio, is_finite = clip_grad_global_norm(
+ params, args.max_grad_norm * num_samples_per_update / loss_denom)
+ else:
+ total_norm = grad_global_norm(params)
+
+ total_norm = total_norm / (num_samples_per_update / loss_denom)
+ trainer.update(num_samples_per_update / loss_denom)
+ if args.num_accumulated != 1:
+ # set grad to zero for gradient accumulation
+ qa_net.zero_grad()
+
+ # saving
+            if ((local_rank == 0 and (step_num + 1) % save_interval == 0)
+                    or (step_num + 1) >= num_train_steps):
+ version_prefix = 'squad' + args.version
+ ckpt_name = '{}_{}_{}.params'.format(args.model_name,
+ version_prefix,
+ (step_num + 1))
+ params_saved = os.path.join(args.output_dir, ckpt_name)
+ qa_net.save_parameters(params_saved)
+ ckpt_candidates = [
+ f for f in os.listdir(
+ args.output_dir) if f.endswith('.params')]
+ # keep last `max_saved_ckpt` checkpoints
+ if len(ckpt_candidates) > args.max_saved_ckpt:
+ ckpt_candidates.sort(key=lambda ele: (len(ele), ele))
+ os.remove(os.path.join(args.output_dir, ckpt_candidates[0]))
+ logging.info('Params saved in: {}'.format(params_saved))
+
+ # logging
+ if local_rank == 0 and (step_num + 1) % log_interval == 0:
+ log_span_loss /= log_sample_num
+ log_answerable_loss /= log_sample_num
+ log_total_loss /= log_sample_num
+ toc = time.time()
+ logging.info(
+ 'Step: {}/{}, Loss span/answer/total={:.4f}/{:.4f}/{:.4f},'
+ ' LR={:.8f}, grad_norm={:.4f}. Time cost={:.2f}, Throughput={:.2f} samples/s'
+ ' ETA={:.2f}h'.format((step_num + 1), num_train_steps, log_span_loss,
+ log_answerable_loss, log_total_loss, trainer.learning_rate,
+ total_norm, toc - tic, log_sample_num / (toc - tic),
+ (num_train_steps - (step_num + 1)) / ((step_num + 1) / (toc - global_tic)) / 3600))
+ tic = time.time()
+ log_span_loss = 0
+ log_answerable_loss = 0
+ log_total_loss = 0
+ log_sample_num = 0
+ num_samples_per_update = 0
+
+ if (step_num + 1) >= num_train_steps:
+ toc = time.time()
+ logging.info(
+ 'Finish training step: {} within {} hours'.format(
+ step_num + 1, (toc - global_tic) / 3600))
+ break
+
+ return params_saved
+
+
+RawResultExtended = collections.namedtuple(
+ 'RawResultExtended',
+ ['qas_id',
+ 'start_top_logits',
+ 'start_top_index',
+ 'end_top_logits',
+ 'end_top_index',
+ 'answerable_logits'])
+
+
+def predict_extended(original_feature,
+ chunked_features,
+ results,
+ n_best_size,
+ max_answer_length=64,
+ start_top_n=5,
+ end_top_n=5):
+ """Get prediction results for SQuAD.
+
+ Start Logits: (B, N_start)
+ End Logits: (B, N_start, N_end)
+
+ Parameters
+ ----------
+    original_feature
+        The original SquadFeature before chunking
+ chunked_features
+ List of ChunkFeatures
+ results
+ List of model predictions for span start and span end.
+ n_best_size
+ Best N results written to file
+ max_answer_length
+ Maximum length of the answer tokens.
+ start_top_n
+ Number of start-position candidates
+ end_top_n
+        Number of end-position candidates
+
+    Returns
+ -------
+ not_answerable_score
+ Model's estimate that the question is not answerable.
+ prediction
+ The final prediction.
+ nbest_json
+ n-best predictions with their probabilities.
+ """
+    not_answerable_score = 1000000  # Initialize the not-answerable score with a large positive value
+ # If one chunk votes for answerable, we will treat the context as answerable,
+ # Thus, the overall not_answerable_score = min(chunk_not_answerable_score)
+ all_start_idx = []
+ all_end_idx = []
+ all_pred_score = []
+ context_length = len(original_feature.context_token_ids)
+ token_max_context_score = np.full((len(chunked_features), context_length),
+ -np.inf,
+ dtype=np.float32)
+ for i, chunked_feature in enumerate(chunked_features):
+ chunk_start = chunked_feature.chunk_start
+ chunk_length = chunked_feature.chunk_length
+ for j in range(chunk_start, chunk_start + chunk_length):
+ # This is a heuristic score
+ # TODO investigate the impact
+ token_max_context_score[i, j] = min(j - chunk_start,
+ chunk_start + chunk_length - 1 - j) \
+ + 0.01 * chunk_length
+ token_max_chunk_id = token_max_context_score.argmax(axis=0)
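+    # For each context token, token_max_chunk_id stores the chunk in which that token is most
+    # central (with a small bonus for longer chunks); a predicted start position is only kept
+    # below if it comes from that token's max-context chunk.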
+
+ for chunk_id, (result, chunk_feature) in enumerate(zip(results, chunked_features)):
+ # We use the log-likelihood as the not answerable score.
+ # Thus, a high score indicates that the answer is not answerable
+ cur_not_answerable_score = float(result.answerable_logits[1])
+ not_answerable_score = min(not_answerable_score, cur_not_answerable_score)
+ # Calculate the start_logits + end_logits as the overall score
+ context_offset = chunk_feature.context_offset
+ chunk_start = chunk_feature.chunk_start
+ chunk_length = chunk_feature.chunk_length
+ for i in range(start_top_n):
+ for j in range(end_top_n):
+ pred_score = result.start_top_logits[i] + result.end_top_logits[i, j]
+ start_index = result.start_top_index[i]
+ end_index = result.end_top_index[i, j]
+ # We could hypothetically create invalid predictions, e.g., predict
+ # that the start of the answer span is in the query tokens or out of
+ # the chunk. We throw out all invalid predictions.
+ if not (context_offset <= start_index < context_offset + chunk_length) or \
+ not (context_offset <= end_index < context_offset + chunk_length) or \
+ end_index < start_index:
+ continue
+ pred_answer_length = end_index - start_index + 1
+ if pred_answer_length > max_answer_length:
+ continue
+ start_idx = int(start_index - context_offset + chunk_start)
+ end_idx = int(end_index - context_offset + chunk_start)
+ if token_max_chunk_id[start_idx] != chunk_id:
+ continue
+ all_start_idx.append(start_idx)
+ all_end_idx.append(end_idx)
+ all_pred_score.append(pred_score)
+ sorted_start_end_score = sorted(zip(all_start_idx, all_end_idx, all_pred_score),
+ key=lambda args: args[-1], reverse=True)
+ nbest = []
+ context_text = original_feature.context_text
+ context_token_offsets = original_feature.context_token_offsets
+ seen_predictions = set()
+ for start_idx, end_idx, pred_score in sorted_start_end_score:
+ if len(seen_predictions) >= n_best_size:
+ break
+ pred_answer = context_text[context_token_offsets[start_idx][0]:
+ context_token_offsets[end_idx][1]]
+ seen_predictions.add(pred_answer)
+ nbest.append((pred_answer, pred_score))
+
+ # In very rare edge cases we could have no valid predictions. So we
+ # just create a nonce prediction in this case to avoid failure.
+ if len(nbest) == 0:
+ nbest.append(('', float('-inf')))
+ all_scores = np.array([ele[1] for ele in nbest], dtype=np.float32)
+ probs = np.exp(all_scores) / np.sum(np.exp(all_scores))
+ nbest_json = []
+ for i, (entry, prob) in enumerate(zip(nbest, probs)):
+ output = collections.OrderedDict()
+ output['text'] = entry[0]
+ output['probability'] = float(prob)
+ nbest_json.append(output)
+
+ assert len(nbest_json) >= 1
+ return not_answerable_score, nbest[0][0], nbest_json
+
+
+def evaluate(args, last=True):
+ store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm(
+ args.comm_backend, args.gpus)
+ # only evaluate once
+ if rank != 0:
+ logging.info('Skipping node {}'.format(rank))
+ return
+ ctx_l = parse_ctx(args.gpus)
+ logging.info(
+        'Starting inference without horovod on the first node on device {}'.format(
+ str(ctx_l)))
+
+ cfg, tokenizer, qa_net, use_segmentation = get_network(
+ args.model_name, ctx_l, args.classifier_dropout)
+
+ logging.info('Prepare dev data')
+ dev_features = get_squad_features(args, tokenizer, segment='dev')
+ dev_data_path = os.path.join(args.data_dir, 'dev-v{}.json'.format(args.version))
+ dataset_processor = SquadDatasetProcessor(tokenizer=tokenizer,
+ doc_stride=args.doc_stride,
+ max_seq_length=args.max_seq_length,
+ max_query_length=args.max_query_length)
+ dev_all_chunk_features = []
+ dev_chunk_feature_ptr = [0]
+ for feature in dev_features:
+ chunk_features = dataset_processor.process_sample(feature)
+ dev_all_chunk_features.extend(chunk_features)
+ dev_chunk_feature_ptr.append(dev_chunk_feature_ptr[-1] + len(chunk_features))
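+    # dev_chunk_feature_ptr[i]:dev_chunk_feature_ptr[i + 1] indexes the chunks that belong to
+    # the i-th dev feature, which lets us regroup per-chunk predictions per original example.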
+
+ def eval_validation(ckpt_name, best_eval):
+ """
+ Model inference during validation or final evaluation.
+ """
+ dev_dataloader = mx.gluon.data.DataLoader(
+ dev_all_chunk_features,
+ batchify_fn=dataset_processor.BatchifyFunction,
+ batch_size=args.eval_batch_size,
+ num_workers=0,
+ shuffle=False)
+
+ log_interval = args.eval_log_interval
+ all_results = []
+ epoch_tic = time.time()
+ tic = time.time()
+ epoch_size = len(dev_features)
+ total_num = 0
+ log_num = 0
+ for batch_idx, dev_batch in enumerate(grouper(dev_dataloader, len(ctx_l))):
+ # Predict for each chunk
+ for sample, ctx in zip(dev_batch, ctx_l):
+ if sample is None:
+ continue
+ # Copy the data to device
+ tokens = sample.data.as_in_ctx(ctx)
+ total_num += len(tokens)
+ log_num += len(tokens)
+ segment_ids = sample.segment_ids.as_in_ctx(ctx) if use_segmentation else None
+ valid_length = sample.valid_length.as_in_ctx(ctx)
+ p_mask = sample.masks.as_in_ctx(ctx)
+ p_mask = 1 - p_mask # In the network, we use 1 --> no_mask, 0 --> mask
+ start_top_logits, start_top_index, end_top_logits, end_top_index, answerable_logits \
+ = qa_net.inference(tokens, segment_ids, valid_length, p_mask,
+ args.start_top_n, args.end_top_n)
+ for i, qas_id in enumerate(sample.qas_id):
+ result = RawResultExtended(qas_id=qas_id,
+ start_top_logits=start_top_logits[i].asnumpy(),
+ start_top_index=start_top_index[i].asnumpy(),
+ end_top_logits=end_top_logits[i].asnumpy(),
+ end_top_index=end_top_index[i].asnumpy(),
+ answerable_logits=answerable_logits[i].asnumpy())
+
+ all_results.append(result)
+
+ # logging
+ if (batch_idx + 1) % log_interval == 0:
+ # Output the loss of per step
+ toc = time.time()
+ logging.info(
+ '[batch {}], Time cost={:.2f},'
+ ' Throughput={:.2f} samples/s, ETA={:.2f}h'.format(
+ batch_idx + 1, toc - tic, log_num / (toc - tic),
+ (epoch_size - total_num) / (total_num / (toc - epoch_tic)) / 3600))
+ tic = time.time()
+ log_num = 0
+
+ epoch_toc = time.time()
+        logging.info('Time cost=%.2f s, Throughput=%.2f samples/s', epoch_toc - epoch_tic,
+ total_num / (epoch_toc - epoch_tic))
+
+ all_predictions = collections.OrderedDict()
+ all_nbest_json = collections.OrderedDict()
+ no_answer_score_json = collections.OrderedDict()
+ for index, (left_index, right_index) in enumerate(zip(dev_chunk_feature_ptr[:-1],
+ dev_chunk_feature_ptr[1:])):
+ chunked_features = dev_all_chunk_features[left_index:right_index]
+ results = all_results[left_index:right_index]
+ original_feature = dev_features[index]
+ qas_ids = set([result.qas_id for result in results] +
+ [feature.qas_id for feature in chunked_features])
+            assert len(qas_ids) == 1, 'Mismatch occurred between features and results'
+ example_qas_id = list(qas_ids)[0]
+ assert example_qas_id == original_feature.qas_id, \
+                'Mismatch occurred between the original feature and chunked features'
+ not_answerable_score, best_pred, nbest_json = predict_extended(
+ original_feature=original_feature,
+ chunked_features=chunked_features,
+ results=results,
+ n_best_size=args.n_best_size,
+ max_answer_length=args.max_answer_length,
+ start_top_n=args.start_top_n,
+ end_top_n=args.end_top_n)
+ no_answer_score_json[example_qas_id] = not_answerable_score
+ all_predictions[example_qas_id] = best_pred
+ all_nbest_json[example_qas_id] = nbest_json
+
+ if args.version == '2.0':
+ exact = 'best_exact'
+ f1 = 'best_f1'
+ na_prob = no_answer_score_json
+ else:
+ exact = 'exact'
+ f1 = 'f1'
+ na_prob = None
+
+ cur_eval, revised_predictions = squad_eval(
+ dev_data_path, all_predictions, na_prob, revise=na_prob is not None)
+ logging.info('The evaluated results are {}'.format(json.dumps(cur_eval)))
+
+ cur_metrics = 0.5 * (cur_eval[exact] + cur_eval[f1])
+ if best_eval:
+ best_metrics = 0.5 * (best_eval[exact] + best_eval[f1])
+ else:
+ best_metrics = 0.
+
+ if cur_metrics > best_metrics:
+ logging.info('The evaluated files are saved in {}'.format(args.output_dir))
+ output_prediction_file = os.path.join(args.output_dir, 'predictions.json')
+ output_nbest_file = os.path.join(args.output_dir, 'nbest_predictions.json')
+ na_prob_file = os.path.join(args.output_dir, 'na_prob.json')
+ revised_prediction_file = os.path.join(args.output_dir, 'revised_predictions.json')
+
+ with open(output_prediction_file, 'w') as of:
+ of.write(json.dumps(all_predictions, indent=4) + '\n')
+ with open(output_nbest_file, 'w') as of:
+ of.write(json.dumps(all_nbest_json, indent=4) + '\n')
+ with open(na_prob_file, 'w') as of:
+ of.write(json.dumps(no_answer_score_json, indent=4) + '\n')
+ with open(revised_prediction_file, 'w') as of:
+ of.write(json.dumps(revised_predictions, indent=4) + '\n')
+
+ best_eval = cur_eval
+ best_eval.update({'best_ckpt': ckpt_name})
+ return best_eval
+
+ if args.param_checkpoint and args.param_checkpoint.endswith('.params'):
+ ckpt_candidates = [args.param_checkpoint]
+ else:
+ ckpt_candidates = [f for f in os.listdir(args.output_dir) if f.endswith('.params')]
+ ckpt_candidates.sort(key=lambda ele: (len(ele), ele))
+ if last:
+ ckpt_candidates = ckpt_candidates[-1:]
+
+ best_eval = {}
+ for ckpt_name in ckpt_candidates:
+        logging.info('Starting to evaluate checkpoint {}'.format(ckpt_name))
+ ckpt_path = os.path.join(args.output_dir, ckpt_name)
+ qa_net.load_parameters(ckpt_path, ctx=ctx_l, cast_dtype=True)
+ best_eval = eval_validation(ckpt_name, best_eval)
+
+ logging.info('The best evaluated results are {}'.format(json.dumps(best_eval)))
+ output_eval_results_file = os.path.join(args.output_dir, 'best_results.json')
+ with open(output_eval_results_file, 'w') as of:
+ of.write(json.dumps(best_eval, indent=4) + '\n')
+ return best_eval
+
+
+if __name__ == '__main__':
+ os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
+ args = parse_args()
+ logging_config(args.output_dir, name='finetune_squad{}'.format(args.version))
+ set_seed(args.seed)
+ if args.do_train:
+ train(args)
+ if args.do_eval:
+ evaluate(args, last=not args.all_evaluate)
diff --git a/scripts/question_answering/squad_utils.py b/scripts/question_answering/squad_utils.py
new file mode 100644
index 0000000000..80a27a9864
--- /dev/null
+++ b/scripts/question_answering/squad_utils.py
@@ -0,0 +1,455 @@
+"""Utility classes and functions for data processing"""
+import json
+import os
+import re
+import string
+import warnings
+from collections import namedtuple
+from typing import Optional, List
+
+import numpy as np
+import mxnet as mx
+from mxnet.gluon.utils import download
+from tqdm import tqdm
+
+from gluonnlp.data.tokenizers import BaseTokenizerWithVocab
+from gluonnlp.utils.preprocessing import match_tokens_with_char_spans
+
+mx.npx.set_np()
+
+# Matches if a string is an integer or a float
+int_float_regex = re.compile(r'^\d+\.?\d*$')
+
+
+def normalize_answer(s):
+ """Lower text and remove punctuation, articles and extra whitespace.
+ This is from the official evaluate-v2.0.py in SQuAD.
+ """
+
+ def remove_articles(text):
+ regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
+ return re.sub(regex, ' ', text)
+
+ def white_space_fix(text):
+ return ' '.join(text.split())
+
+ def remove_punc(text):
+ exclude = set(string.punctuation)
+ return ''.join(ch for ch in text if ch not in exclude)
+
+ def lower(text):
+ return text.lower()
+
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def get_official_squad_eval_script(version='2.0', download_dir=None):
+ url_info = {'2.0': ['evaluate-v2.0.py',
+ 'https://worksheets.codalab.org/rest/bundles/'
+ '0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/',
+ '5a584f1952c88b4088be5b51f2046a2c337aa706']}
+ if version not in url_info:
+ raise ValueError('Version {} is not supported'.format(version))
+ if download_dir is None:
+ download_dir = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+ download_path = os.path.join(download_dir, url_info[version][0])
+ download(url_info[version][1], download_path, sha1_hash=url_info[version][2])
+ return download_path
+
+
+class SquadExample:
+ """A single training/test example for the Squad dataset, as loaded from disk."""
+    def __init__(self, qas_id: str,
+ query_text: str,
+ context_text: str,
+ answer_text: str,
+ start_position: int,
+ end_position: int,
+ title: str,
+ answers: Optional[List[str]] = None,
+ is_impossible: bool = False):
+ """
+
+ Parameters
+ ----------
+ qas_id
+ The example's unique identifier
+ query_text
+ The query string
+ context_text
+ The context string
+ answer_text
+ The answer string
+ start_position
+ The character position of the start of the answer
+ end_position
+ The character position of the end of the answer
+ title
+ The title of the example
+ answers
+ None by default, this is used during evaluation.
+ Holds answers as well as their start positions.
+ is_impossible
+ False by default, set to True if the example has no possible answer.
+ """
+ self.qas_id = qas_id
+ self.query_text = query_text
+ self.context_text = context_text
+ self.answer_text = answer_text
+ self.title = title
+ self.is_impossible = is_impossible
+ self.answers = answers
+ self.start_position = start_position
+ self.end_position = end_position
+
+ def to_json(self):
+ return json.dumps(self.__dict__)
+
+ @classmethod
+ def from_json(cls, s):
+ kwargs = json.loads(s)
+ return cls(**kwargs)
+
+
+DocChunk = namedtuple('DocChunk', ['start', 'length',
+ 'is_impossible',
+ 'gt_start_pos',
+ 'gt_end_pos'])
+
+
+class SquadFeature:
+ def __init__(self, qas_id,
+ query_token_ids,
+ context_text,
+ context_token_ids,
+ context_token_offsets,
+ is_impossible,
+ token_answer_mismatch,
+ unreliable_span,
+ gt_answer_text,
+ gt_start_pos,
+ gt_end_pos):
+ """The Squad Feature
+
+ Parameters
+ ----------
+ qas_id
+ The unique query/answer ID in the squad dataset
+ query_token_ids
+ The tokenized query.
+ context_text
+ The original text of the context
+ context_token_ids
+ The tokenized context.
+ context_token_offsets
+ The offsets of the tokens in the original context string
+ is_impossible
+ Whether the sample is impossible.
+ token_answer_mismatch
+ If this value is True, it means that we cannot reconstruct the ground-truth answer with
+ the tokenized version. Usually, the span-prediction-based approach won't be very
+ accurate and we should rely on the encoder-decoder approach.
+ For example:
+ GT: "japan", Tokenized Version: "japanese"
+                "six"       "sixth"
+ "one" "iPhone"
+ "breed" "breeding"
+ "emotion" "emotional"
+
+ unreliable_span
+ If this value is True, it means that we cannot rely on the gt_start_pos and gt_end_pos.
+ In this scenario, we cannot utilize the span-prediction-based approach.
+ One example is the question about "how many", the answer will spread across the
+ whole document and there is no clear span.
+ gt_answer_text
+ The ground-truth answer text
+ gt_start_pos
+ The start position of the ground-truth span. None indicates that there is no valid
+ ground-truth span.
+ gt_end_pos
+ The end position of the ground-truth span. None indicates that there is no valid
+ ground-truth span.
+ """
+ self.qas_id = qas_id
+ self.query_token_ids = query_token_ids
+ self.context_text = context_text
+ self.context_token_ids = context_token_ids
+ self.context_token_offsets = context_token_offsets
+ self.is_impossible = is_impossible
+ self.token_answer_mismatch = token_answer_mismatch
+ self.unreliable_span = unreliable_span
+ self.gt_answer_text = gt_answer_text
+ self.gt_start_pos = gt_start_pos
+ self.gt_end_pos = gt_end_pos
+
+ def to_json(self):
+ return json.dumps(self.__dict__)
+
+ @classmethod
+ def from_json(cls, s):
+ kwargs = json.loads(s)
+ return cls(**kwargs)
+
+ def __repr__(self):
+ return self.to_json()
+
+ def get_chunks(self, doc_stride, max_chunk_length=None):
+ """Get a sequence of chunks for the squad feature.
+
+ In reality, the document will be too long for the NLP model, and we will split it into
+ multiple chunks.
+
+ For example, consider the following
+ Doc: the man went to the store and bought a gallon of milk
+
+ We may divide it into four chunks:
+
+ Chunk 1: the man went to the
+ Chunk 2: to the store and bought
+ Chunk 3: and bought a gallon of
+ Chunk 4: gallon of milk
+
+        We will use our network to extract features for each chunk and aggregate them
+        afterwards. Since one token may appear in multiple chunks, we vote on the final
+        output using heuristic score functions.
+
+ Parameters
+ ----------
+ doc_stride
+ The stride used when the context is too large and is split across several features.
+ max_chunk_length
+ The maximum size of the chunk
+
+ Returns
+ -------
+ ret
+ List of DocChunk objects
+ """
+ doc_ptr = 0
+ max_chunk_length = max_chunk_length if max_chunk_length is not None else \
+ len(self.context_token_ids)
+ ret = []
+ while doc_ptr < len(self.context_token_ids):
+ chunk_length = min(max_chunk_length, len(self.context_token_ids) - doc_ptr)
+ if self.gt_answer_text is None:
+ chunk_gt_start_pos = None
+ chunk_gt_end_pos = None
+ chunk_is_impossible = True
+ else:
+ if self.gt_start_pos is not None and self.gt_end_pos is not None and\
+ self.gt_start_pos >= doc_ptr and self.gt_end_pos < doc_ptr + chunk_length:
+ # The chunk contains the ground-truth annotation
+ chunk_gt_start_pos = self.gt_start_pos - doc_ptr
+ chunk_gt_end_pos = self.gt_end_pos - doc_ptr
+ chunk_is_impossible = False
+ else:
+ chunk_gt_start_pos = None
+ chunk_gt_end_pos = None
+ chunk_is_impossible = True
+ ret.append(DocChunk(start=doc_ptr,
+ length=chunk_length,
+ is_impossible=chunk_is_impossible,
+ gt_start_pos=chunk_gt_start_pos,
+ gt_end_pos=chunk_gt_end_pos))
+ if doc_ptr + chunk_length == len(self.context_token_ids):
+ break
+ doc_ptr += doc_stride
+ return ret
+
+
+def get_squad_examples_from_json(json_file: str, is_training: bool) -> List[SquadExample]:
+ """
+ Read the whole entry of raw json file and convert it to examples.
+
+ Parameters
+ ----------
+ json_file
+ The path to the json file
+ is_training
+ Whether or not training
+
+ Returns
+ -------
+ ret
+ List of SquadExample objects
+ """
+ with open(json_file, 'r') as f:
+ data = json.load(f)
+ examples = []
+ for entry in tqdm(data['data']):
+ title = entry['title']
+ for paragraph in entry['paragraphs']:
+ context_text = paragraph['context']
+ for qa in paragraph['qas']:
+ qas_id = qa['id']
+ query_text = qa['question']
+ start_position = None
+ end_position = None
+ answer_text = None
+ answers = None
+ if "is_impossible" in qa:
+ is_impossible = qa["is_impossible"]
+ else:
+ is_impossible = False
+
+ if not is_impossible:
+ if is_training:
+ answer = qa["answers"][0]
+ answer_text = answer["text"]
+ start_position = answer["answer_start"]
+ end_position = start_position + len(answer_text)
+ if context_text[start_position:end_position] != answer_text:
+ warnings.warn(
+ 'Mismatch start/end and answer_text, start/end={}/{},'
+ ' answer text={}. qas={}'
+ .format(start_position, end_position, answer_text, qas_id))
+ else:
+ answers = qa["answers"]
+ example = SquadExample(
+ qas_id=qas_id,
+ query_text=query_text,
+ context_text=context_text,
+ answer_text=answer_text,
+ start_position=start_position,
+ end_position=end_position,
+ title=title,
+ is_impossible=is_impossible,
+ answers=answers,
+ )
+ examples.append(example)
+ return examples
+
+
+def get_squad_examples(data_dir, segment='train', version='1.1'):
+ """
+
+ Parameters
+ ----------
+ data_dir
+ The directory of the data
+    segment
+        The data segment, either 'train' or 'dev'
+    version
+        Version of the SQuAD dataset
+
+ Returns
+ -------
+ examples
+        A list of SquadExample objects
+ """
+ if version == '1.1':
+ train_path = os.path.join(data_dir, 'train-v1.1.json')
+ dev_path = os.path.join(data_dir, 'dev-v1.1.json')
+ elif version == '2.0':
+ train_path = os.path.join(data_dir, 'train-v2.0.json')
+ dev_path = os.path.join(data_dir, 'dev-v2.0.json')
+ else:
+ raise NotImplementedError
+
+ if segment == 'train':
+ examples = get_squad_examples_from_json(train_path, is_training=True)
+ elif segment == 'dev':
+ examples = get_squad_examples_from_json(dev_path, is_training=False)
+ else:
+ raise NotImplementedError
+
+ return examples
+
+
+def convert_squad_example_to_feature(example: SquadExample,
+ tokenizer: BaseTokenizerWithVocab,
+ is_training: bool):
+ """
+ Convert a SquadExample object to a SquadFeature object with the designated tokenizer.
+
+    There are actually a few examples that cannot be converted properly with token-level
+    tokenization, because the ground truth is given by the start position and the answer text,
+    and some examples are annotated with wrong labels. Thus, the attributes unreliable_span and
+    token_answer_mismatch are used to indicate these scenarios.
+
+ Parameters
+ ----------
+ example
+ A single squad example
+ tokenizer
+ The trained tokenizer
+ is_training
+        Whether to deal with the training case
+
+    Returns
+ -------
+ feature
+ A SquadFeature
+ """
+ context_text = example.context_text
+ answer_text = example.answer_text
+ query_text = example.query_text
+ context_token_ids, offsets = tokenizer.encode_with_offsets(context_text, int)
+ query_token_ids = tokenizer.encode(query_text, int)
+ gt_answer_text = answer_text
+ gt_span_start_pos, gt_span_end_pos = None, None
+ token_answer_mismatch = False
+ unreliable_span = False
+ np_offsets = np.array(offsets)
+ if is_training and not example.is_impossible:
+ assert example.start_position >= 0 and example.end_position >= 0
+ # We convert the character-level offsets to token-level offsets
+ # Also, if the answer after tokenization + detokenization is not the same as the original
+ # answer, we try to localize the answer text and do a rematch
+ candidates = [(example.start_position, example.end_position)]
+ all_possible_start_pos = {example.start_position}
+ find_all_candidates = False
+ lower_idx, upper_idx = None, None
+ first_lower_idx, first_upper_idx = None, None
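+        # Search strategy: first try the annotated character span; if the detokenized text of
+        # that span does not normalize to the answer, fall back to every occurrence of the
+        # answer text in the context and keep the first candidate whose normalized text matches.
+        # If none matches, revert to the original span and flag token_answer_mismatch.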
+ while len(candidates) > 0:
+ start_position, end_position = candidates.pop()
+ # Match the token offsets
+ token_start_ends = match_tokens_with_char_spans(np_offsets,
+ np.array([[start_position,
+ end_position]]))
+ lower_idx = int(token_start_ends[0][0])
+ upper_idx = int(token_start_ends[0][1])
+ if not find_all_candidates:
+ first_lower_idx = lower_idx
+ first_upper_idx = upper_idx
+ # The new start pos and end_pos are the lower_idx and upper_idx
+ sliced_answer = context_text[offsets[lower_idx][0]:offsets[upper_idx][1]]
+ norm_sliced_answer = normalize_answer(sliced_answer)
+ norm_answer = normalize_answer(answer_text)
+ if norm_sliced_answer != norm_answer:
+ if not find_all_candidates:
+ # Try to find a better start+end of the answer and insert all positions to the
+ # candidates
+ find_all_candidates = True
+ pos = context_text.find(answer_text)
+ while pos != -1:
+ if pos not in all_possible_start_pos:
+ all_possible_start_pos.add(pos)
+ candidates.append((pos, pos + len(answer_text)))
+ pos = context_text.find(answer_text, pos + 1)
+ elif len(candidates) == 0:
+ token_answer_mismatch = True
+ lower_idx = first_lower_idx
+ upper_idx = first_upper_idx
+ if int_float_regex.match(answer_text):
+ # Find an integer/float and the sample won't be reliable.
+ # The span-based approach is not suitable for this scenario and we will
+ # set the unreliable span flag.
+ unreliable_span = True
+ else:
+ break
+
+ gt_span_start_pos = lower_idx
+ gt_span_end_pos = upper_idx
+
+ feature = SquadFeature(qas_id=example.qas_id,
+ query_token_ids=query_token_ids,
+ context_text=context_text,
+ context_token_ids=context_token_ids,
+ context_token_offsets=offsets,
+ is_impossible=example.is_impossible,
+ token_answer_mismatch=token_answer_mismatch,
+ unreliable_span=unreliable_span,
+ gt_answer_text=gt_answer_text,
+ gt_start_pos=gt_span_start_pos,
+ gt_end_pos=gt_span_end_pos)
+ return feature
diff --git a/scripts/question_answering/utils.py b/scripts/question_answering/utils.py
deleted file mode 100644
index 2e55f7e098..0000000000
--- a/scripts/question_answering/utils.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Various utility methods for Question Answering"""
-import math
-
-
-def warm_up_lr(base_lr, iteration, lr_warmup_steps):
- """Returns learning rate based on current iteration.
-
- This function is used to implement learning rate warm up technique.
-
- math::
-
- lr = min(base_lr, base_lr * (log(iteration) / log(lr_warmup_steps)))
-
- Parameters
- ----------
- base_lr : float
- Initial learning rage
- iteration : int
- Current iteration number
- lr_warmup_steps : int
- Learning rate warm up steps
-
- Returns
- -------
- learning_rate : float
- Learning rate
- """
- return min(base_lr, base_lr * (math.log(iteration) / math.log(lr_warmup_steps)))
diff --git a/scripts/sentiment_analysis/__init__.py b/scripts/sentiment_analysis/__init__.py
deleted file mode 100644
index 8d81276b5d..0000000000
--- a/scripts/sentiment_analysis/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Sentiment Analysis example."""
diff --git a/scripts/sentiment_analysis/finetune_lm.py b/scripts/sentiment_analysis/finetune_lm.py
deleted file mode 100644
index 3528663076..0000000000
--- a/scripts/sentiment_analysis/finetune_lm.py
+++ /dev/null
@@ -1,344 +0,0 @@
-"""
-Fine-tune Language Model for Sentiment Analysis
-===============================================
-
-This example shows how to load a language model pre-trained on wikitext-2 in Gluon NLP Toolkit model
-zoo, and reuse the language model encoder for sentiment analysis on IMDB movie reviews dataset.
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import time
-import random
-import glob
-import multiprocessing as mp
-
-import numpy as np
-
-import mxnet as mx
-from mxnet import gluon, autograd
-from mxnet.gluon import HybridBlock
-from mxnet.gluon.data import DataLoader
-
-import gluonnlp as nlp
-
-nlp.utils.check_version('0.7.0')
-
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-
-tokenizer = nlp.data.SpacyTokenizer('en')
-length_clip = nlp.data.ClipSequence(500)
-
-
-parser = argparse.ArgumentParser(description='MXNet Sentiment Analysis Example on IMDB. '
- 'We load a LSTM model that is pre-trained on '
- 'WikiText as our encoder.')
-parser.add_argument('--lm_model', type=str, default='standard_lstm_lm_200',
- help='type of the pre-trained model to load, can be "standard_lstm_200", '
- '"standard_lstm_200", etc.')
-parser.add_argument('--use-mean-pool', type=bool, default=True,
- help='whether to use mean pooling to aggregate the states from '
- 'different timestamps.')
-parser.add_argument('--no_pretrained', action='store_true',
- help='Turn on the option to just use the structure and '
- 'not load the pre-trained weights.')
-parser.add_argument('--lr', type=float, default=2.5E-3,
- help='initial learning rate')
-parser.add_argument('--clip', type=float, default=None, help='gradient clipping')
-parser.add_argument('--bucket_type', type=str, default=None,
- help='Can be "fixed" or "sorted"')
-parser.add_argument('--bucket_num', type=int, default=10,
- help='The bucket_num if bucket_type is "fixed".')
-parser.add_argument('--bucket_ratio', type=float, default=0.0,
- help='The ratio used in the FixedBucketSampler.')
-parser.add_argument('--bucket_mult', type=int, default=100,
- help='The mult used in the SortedBucketSampler.')
-parser.add_argument('--valid_ratio', type=float, default=0.05,
- help='Proportion [0, 1] of training samples to use for validation set.')
-parser.add_argument('--epochs', type=int, default=20,
- help='upper epoch limit')
-parser.add_argument('--batch_size', type=int, default=16, metavar='N',
- help='batch size')
-parser.add_argument('--dropout', type=float, default=0.,
- help='dropout applied to layers (0 = no dropout)')
-parser.add_argument('--log-interval', type=int, default=30, metavar='N',
- help='report interval')
-parser.add_argument('--save-prefix', type=str, default='sa-model',
- help='path to save the final model')
-parser.add_argument('--gpu', type=int, default=None,
-                    help='id of the gpu to use. Leave it unset to use the cpu.')
-args = parser.parse_args()
-print(args)
-
-pretrained = not args.no_pretrained
-if args.gpu is None:
- print('Use cpu')
- context = mx.cpu()
-else:
- print('Use gpu%d' % args.gpu)
- context = mx.gpu(args.gpu)
-
-class AggregationLayer(HybridBlock):
- """A block for different ways of aggregating encoder features"""
- def __init__(self, use_mean_pool=False, prefix=None, params=None):
- super(AggregationLayer, self).__init__(prefix=prefix, params=params)
- self._use_mean_pool = use_mean_pool
-
- def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
- """Forward logic"""
- # Data will have shape (T, N, C)
- if self._use_mean_pool:
- masked_encoded = F.SequenceMask(data,
- sequence_length=valid_length,
- use_sequence_length=True)
- agg_state = F.broadcast_div(F.sum(masked_encoded, axis=0),
- F.expand_dims(valid_length, axis=1))
- else:
- agg_state = F.SequenceLast(data,
- sequence_length=valid_length,
- use_sequence_length=True)
- return agg_state
-
-
-class SentimentNet(HybridBlock):
- """Network for sentiment analysis."""
- def __init__(self, dropout, use_mean_pool=False, prefix=None, params=None):
- super(SentimentNet, self).__init__(prefix=prefix, params=params)
- self._use_mean_pool = use_mean_pool
- with self.name_scope():
- self.embedding = None
- self.encoder = None
- self.agg_layer = AggregationLayer(use_mean_pool=use_mean_pool)
- self.output = gluon.nn.HybridSequential()
- with self.output.name_scope():
- self.output.add(gluon.nn.Dropout(dropout))
- self.output.add(gluon.nn.Dense(1, flatten=False))
-
- def hybrid_forward(self, _, data, valid_length): # pylint: disable=arguments-differ
- encoded = self.encoder(self.embedding(data)) # Shape(T, N, C)
- agg_state = self.agg_layer(encoded, valid_length)
- out = self.output(agg_state)
- return out
-
-net = SentimentNet(dropout=args.dropout, use_mean_pool=args.use_mean_pool)
-with net.name_scope():
- lm_model, vocab = nlp.model.get_model(name=args.lm_model,
- dataset_name='wikitext-2',
- pretrained=pretrained,
- ctx=context,
- dropout=args.dropout)
-
-net.embedding = lm_model.embedding
-net.encoder = lm_model.encoder
-net.hybridize()
-
-
-# Dataset preprocessing
-def preprocess(x):
- data, label = x
- label = int(label > 5)
- data = vocab[length_clip(tokenizer(data))]
- return data, label
-
-def get_length(x):
- return float(len(x[0]))
-
-# Load the dataset
-train_dataset, test_dataset = [nlp.data.IMDB(root='data/imdb', segment=segment)
- for segment in ('train', 'test')]
-train_dataset, valid_dataset = nlp.data.train_valid_split(train_dataset, args.valid_ratio)
-print('Tokenize using spaCy...')
-
-def preprocess_dataset(dataset):
- start = time.time()
-    with mp.Pool(8) as pool:
-        dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
-        lengths = gluon.data.SimpleDataset(pool.map(get_length, dataset))
- end = time.time()
- print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
- return dataset, lengths
-
-# Preprocess the dataset
-train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
-valid_dataset, valid_data_lengths = preprocess_dataset(valid_dataset)
-test_dataset, test_data_lengths = preprocess_dataset(test_dataset)
-
-# Construct the DataLoader. Pad data and stack label
-batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0, pad_val=0, ret_length=True),
- nlp.data.batchify.Stack(dtype='float32'))
-if args.bucket_type is None:
- print('Bucketing strategy is not used!')
- train_dataloader = DataLoader(dataset=train_dataset,
- batch_size=args.batch_size,
- shuffle=True,
- batchify_fn=batchify_fn)
-else:
- if args.bucket_type == 'fixed':
- print('Use FixedBucketSampler')
- batch_sampler = nlp.data.FixedBucketSampler(train_data_lengths,
- batch_size=args.batch_size,
- num_buckets=args.bucket_num,
- ratio=args.bucket_ratio,
- shuffle=True)
- print(batch_sampler.stats())
- elif args.bucket_type == 'sorted':
- print('Use SortedBucketSampler')
- batch_sampler = nlp.data.SortedBucketSampler(train_data_lengths,
- batch_size=args.batch_size,
- mult=args.bucket_mult,
- shuffle=True)
- else:
- raise NotImplementedError
- train_dataloader = DataLoader(dataset=train_dataset,
- batch_sampler=batch_sampler,
- batchify_fn=batchify_fn)
-
-valid_dataloader = DataLoader(dataset=valid_dataset,
- batch_size=args.batch_size,
- shuffle=False,
- sampler=nlp.data.SortedSampler(valid_data_lengths),
- batchify_fn=batchify_fn)
-
-test_dataloader = DataLoader(dataset=test_dataset,
- batch_size=args.batch_size,
- shuffle=False,
- sampler=nlp.data.SortedSampler(test_data_lengths),
- batchify_fn=batchify_fn)
-
-
-net.hybridize()
-print(net)
-if args.no_pretrained:
- net.initialize(mx.init.Xavier(), ctx=context)
-else:
- net.output.initialize(mx.init.Xavier(), ctx=context)
-trainer = gluon.Trainer(net.collect_params(), 'ftml', {'learning_rate': args.lr})
-loss = gluon.loss.SigmoidBCELoss()
-
-
-def evaluate(dataloader):
- """Evaluate network on the specified dataset"""
- total_L = 0.0
- total_sample_num = 0
- total_correct_num = 0
- start_log_interval_time = time.time()
- print('Begin Testing...')
- for i, ((data, valid_length), label) in enumerate(dataloader):
- data = mx.nd.transpose(data.as_in_context(context))
- valid_length = valid_length.as_in_context(context).astype(np.float32)
- label = label.as_in_context(context)
- output = net(data, valid_length)
- L = loss(output, label)
-        pred = (output > 0).reshape((-1,))
- total_L += L.sum().asscalar()
- total_sample_num += label.shape[0]
- total_correct_num += (pred == label).sum().asscalar()
- if (i + 1) % args.log_interval == 0:
- print('[Batch {}/{}] elapsed {:.2f} s'.format(
- i + 1, len(dataloader), time.time() - start_log_interval_time))
- start_log_interval_time = time.time()
- avg_L = total_L / float(total_sample_num)
- acc = total_correct_num / float(total_sample_num)
- return avg_L, acc
-
-
-def train():
- """Training process"""
- start_pipeline_time = time.time()
-
- # Training/Testing
- best_valid_acc = 0
- stop_early = 0
- for epoch in range(args.epochs):
- # Epoch training stats
- start_epoch_time = time.time()
- epoch_L = 0.0
- epoch_sent_num = 0
- epoch_wc = 0
- # Log interval training stats
- start_log_interval_time = time.time()
- log_interval_wc = 0
- log_interval_sent_num = 0
- log_interval_L = 0.0
-
- for i, ((data, valid_length), label) in enumerate(train_dataloader):
- data = mx.nd.transpose(data.as_in_context(context))
- label = label.as_in_context(context)
- valid_length = valid_length.as_in_context(context).astype(np.float32)
- wc = valid_length.sum().asscalar()
- log_interval_wc += wc
- epoch_wc += wc
- log_interval_sent_num += data.shape[1]
- epoch_sent_num += data.shape[1]
- with autograd.record():
- output = net(data, valid_length)
- L = loss(output, label).mean()
- L.backward()
- # Clip gradient
- if args.clip is not None:
- grads = [p.grad(context) for p in net.collect_params().values()]
- gluon.utils.clip_global_norm(grads, args.clip)
- # Update parameter
- trainer.step(1)
- log_interval_L += L.asscalar()
- epoch_L += L.asscalar()
- if (i + 1) % args.log_interval == 0:
- print('[Epoch %d Batch %d/%d] avg loss %g, throughput %gK wps' % (
- epoch, i + 1, len(train_dataloader),
- log_interval_L / log_interval_sent_num,
- log_interval_wc / 1000 / (time.time() - start_log_interval_time)))
- # Clear log interval training stats
- start_log_interval_time = time.time()
- log_interval_wc = 0
- log_interval_sent_num = 0
- log_interval_L = 0
- end_epoch_time = time.time()
- valid_avg_L, valid_acc = evaluate(valid_dataloader)
- test_avg_L, test_acc = evaluate(test_dataloader)
- print('[Epoch %d] train avg loss %g, '
- 'valid acc %.4f, valid avg loss %g, '
- 'test acc %.4f, test avg loss %g, throughput %gK wps' % (
- epoch, epoch_L / epoch_sent_num,
- valid_acc, valid_avg_L, test_acc, test_avg_L,
- epoch_wc / 1000 / (end_epoch_time - start_epoch_time)))
-
- if valid_acc < best_valid_acc:
- print('No Improvement.')
- stop_early += 1
- if stop_early == 3:
- break
-        else:
-            # Reset stop_early when the validation accuracy reaches a new best value
-            print('Observed Improvement.')
-            stop_early = 0
- net.save_parameters(args.save_prefix + '_{:04d}.params'.format(epoch))
- best_valid_acc = valid_acc
-
-    net.load_parameters(sorted(glob.glob(args.save_prefix+'_*.params'))[-1], context)
- valid_avg_L, valid_acc = evaluate(valid_dataloader)
- test_avg_L, test_acc = evaluate(test_dataloader)
- print('Best validation loss %g, validation acc %.4f'%(valid_avg_L, valid_acc))
- print('Best test loss %g, test acc %.4f'%(test_avg_L, test_acc))
- print('Total time cost %.2fs'%(time.time()-start_pipeline_time))
-
-
-if __name__ == '__main__':
- train()
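
The fine-tuning pattern the deleted script implements can be summarized in a few lines: load a pre-trained language model from the Gluon NLP model zoo, reuse its embedding and encoder, mean-pool the encoder states over the valid time steps, and attach a dropout plus single-logit dense head trained with sigmoid binary cross-entropy. The following is a minimal sketch of that pattern, assuming the gluonnlp 0.x and MXNet 1.x APIs used above; the model name and the embedding/encoder attribute names are taken from the script itself and are not verified against other versions.

import mxnet as mx
from mxnet import gluon
import gluonnlp as nlp

ctx = mx.cpu()

# Pre-trained LSTM language model and its vocabulary from the model zoo.
lm_model, vocab = nlp.model.get_model('standard_lstm_lm_200',
                                      dataset_name='wikitext-2',
                                      pretrained=True, ctx=ctx)

# Classification head: dropout followed by a single logit.
head = gluon.nn.HybridSequential()
head.add(gluon.nn.Dropout(0.2))
head.add(gluon.nn.Dense(1, flatten=False))
head.initialize(mx.init.Xavier(), ctx=ctx)

def classify(data, valid_length):
    """data: (T, N) token ids; valid_length: (N,) float32 lengths."""
    encoded = lm_model.encoder(lm_model.embedding(data))      # (T, N, C)
    masked = mx.nd.SequenceMask(encoded, sequence_length=valid_length,
                                use_sequence_length=True)
    # Mean over the valid time steps only.
    pooled = mx.nd.broadcast_div(mx.nd.sum(masked, axis=0),
                                 mx.nd.expand_dims(valid_length, axis=1))
    return head(pooled)                                        # (N, 1) logits

Training then mirrors the loop above: the logits feed gluon.loss.SigmoidBCELoss() and the parameters are updated with the 'ftml' trainer.
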
diff --git a/scripts/sentiment_analysis/index.rst b/scripts/sentiment_analysis/index.rst
deleted file mode 100644
index ef5208f019..0000000000
--- a/scripts/sentiment_analysis/index.rst
+++ /dev/null
@@ -1,247 +0,0 @@
-Sentiment Analysis
-------------------
-
-:download:`Download scripts `
-
-Through Fine-tuning Word Language Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This script can be used to train a sentiment analysis model from scratch, or fine-tune a pre-trained language model.
-The pre-trained language models are loaded from the Gluon NLP Toolkit model zoo. It also showcases how to use different
-bucketing strategies to speed up training.
-
-Use the following command to run without using a pre-trained model (`log