diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 33b5d41..92615bf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,10 @@ repos: - repo: https://github.com/ambv/black - rev: stable + rev: 21.7b0 hooks: - id: black args: [--line-length=100] - repo: https://gitlab.com/pycqa/flake8 - rev: 3.7.9 + rev: 3.9.2 hooks: - id: flake8 \ No newline at end of file diff --git a/notebooks/models/selim/pytorch-lightning-en-pillars.ipynb b/notebooks/models/selim/pytorch-lightning-en-pillars.ipynb index 3c937cf..ba47c9d 100644 --- a/notebooks/models/selim/pytorch-lightning-en-pillars.ipynb +++ b/notebooks/models/selim/pytorch-lightning-en-pillars.ipynb @@ -249,7 +249,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2021-06-09T08:31:43.201910Z", @@ -260,13 +260,13 @@ }, "outputs": [], "source": [ - "sample = True # To make the computations faster, sample = True.\n", + "sample = False # To make the computations faster, sample = True.\n", "\n", "if sample:\n", - " train_df = train_df.sample(n=1000)\n", - " val_df = val_df.sample(n=1000)\n", + " train_df = train_df.sample(n=10000)\n", + " val_df = val_df.sample(n=10000)\n", " \n", - "job_name = f\"pytorch-{formatted_time()}-test\" # change it as you prefer\n", + "job_name = f\"pytorch-{formatted_time()}-subpillars-model\" # change it as you prefer\n", "input_path = DEV_BUCKET / 'training' / 'input_data' / job_name # Do not change this\n", "\n", "train_path = str(input_path / 'train.pickle')\n", @@ -286,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2021-06-09T08:31:43.284096Z", @@ -327,7 +327,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2021-06-09T08:31:43.458886Z", @@ -344,11 +344,13 @@ " 'tracking_uri': MLFLOW_SERVER,\n", " 'experiment_name': 'en_language_subpillars',\n", " 'max_len': 128,\n", - " 'epochs': 1,\n", + " 'epochs': 3,\n", " 'model_name': 'distilbert-base-uncased',\n", " 'tokenizer_name': 'distilbert-base-uncased',\n", + " 'dropout_rate': 0.4\n", " 'language_method': 'keep',\n", - " 'pred_threshold':0.38\n", + " 'pred_threshold':0.4,\n", + " 'output_length': 768\n", "}\n", "\n", "estimator = PyTorch(\n", @@ -370,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2021-06-09T08:31:43.482969Z", @@ -388,7 +390,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2021-06-09T08:31:45.995868Z", @@ -397,346 +399,7 @@ "scrolled": true, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2021-07-21 10:04:43 Starting - Starting the training job...\n", - "2021-07-21 10:05:07 Starting - Launching requested ML instancesProfilerReport-1626861880: InProgress\n", - "...\n", - "2021-07-21 10:05:47 Starting - Preparing the instances for training.........\n", - "2021-07-21 10:07:27 Downloading - Downloading input data...\n", - "2021-07-21 10:07:47 Training - Downloading the training image.............................\u001b[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device\u001b[0m\n", - "\u001b[34mbash: no job control in this shell\u001b[0m\n", - "\u001b[34m2021-07-21 10:13:15,099 sagemaker-training-toolkit INFO Imported framework 
sagemaker_pytorch_container.training\u001b[0m\n", - "\u001b[34m2021-07-21 10:13:15,124 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed.\u001b[0m\n", - "\u001b[34m2021-07-21 10:13:18,229 sagemaker_pytorch_container.training INFO Invoking user training script.\u001b[0m\n", - "\u001b[34m2021-07-21 10:13:18,800 sagemaker-training-toolkit INFO Installing dependencies from requirements.txt:\u001b[0m\n", - "\u001b[34m/opt/conda/bin/python3.6 -m pip install -r requirements.txt\u001b[0m\n", - "\u001b[34mCollecting transformers==4.8.2\n", - " Downloading transformers-4.8.2-py3-none-any.whl (2.5 MB)\u001b[0m\n", - "\u001b[34mCollecting tensorflow==2.4.0\n", - " Downloading tensorflow-2.4.0-cp36-cp36m-manylinux2010_x86_64.whl (394.7 MB)\u001b[0m\n", - "\n", - "2021-07-21 10:13:29 Training - Training image download completed. Training in progress.\u001b[34mCollecting pytorch-lightning==1.3.8\n", - " Downloading pytorch_lightning-1.3.8-py3-none-any.whl (813 kB)\u001b[0m\n", - "\u001b[34mCollecting torchmetrics==0.4.1\n", - " Downloading torchmetrics-0.4.1-py3-none-any.whl (234 kB)\u001b[0m\n", - "\u001b[34mCollecting tqdm==4.41.1\n", - " Downloading tqdm-4.41.1-py2.py3-none-any.whl (56 kB)\u001b[0m\n", - "\u001b[34mCollecting nlpaug==1.1.6\n", - " Downloading nlpaug-1.1.6-py3-none-any.whl (405 kB)\u001b[0m\n", - "\u001b[34mCollecting nltk==3.2.5\n", - " Downloading nltk-3.2.5.tar.gz (1.2 MB)\u001b[0m\n", - "\u001b[34mCollecting filelock\n", - " Downloading filelock-3.0.12-py3-none-any.whl (7.6 kB)\u001b[0m\n", - "\u001b[34mCollecting tokenizers<0.11,>=0.10.1\n", - " Downloading tokenizers-0.10.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: dataclasses in /opt/conda/lib/python3.6/site-packages (from transformers==4.8.2->-r requirements.txt (line 1)) (0.8)\u001b[0m\n", - "\u001b[34mCollecting regex!=2019.12.17\n", - " Downloading regex-2021.7.6-cp36-cp36m-manylinux2014_x86_64.whl (722 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: packaging in /opt/conda/lib/python3.6/site-packages (from transformers==4.8.2->-r requirements.txt (line 1)) (21.0)\u001b[0m\n", - "\u001b[34mCollecting sacremoses\n", - " Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: importlib-metadata in /opt/conda/lib/python3.6/site-packages (from transformers==4.8.2->-r requirements.txt (line 1)) (4.6.1)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: requests in /opt/conda/lib/python3.6/site-packages (from transformers==4.8.2->-r requirements.txt (line 1)) (2.25.1)\u001b[0m\n", - "\u001b[34mCollecting huggingface-hub==0.0.12\n", - " Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.6/site-packages (from transformers==4.8.2->-r requirements.txt (line 1)) (1.19.1)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: pyyaml in /opt/conda/lib/python3.6/site-packages (from transformers==4.8.2->-r requirements.txt (line 1)) (5.4.1)\u001b[0m\n", - "\u001b[34mCollecting typing-extensions~=3.7.4\n", - " Downloading typing_extensions-3.7.4.3-py3-none-any.whl (22 kB)\u001b[0m\n", - "\u001b[34mCollecting astunparse~=1.6.3\n", - " Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)\u001b[0m\n", - "\u001b[34mCollecting wrapt~=1.12.1\n", - " Downloading wrapt-1.12.1.tar.gz (27 kB)\u001b[0m\n", - 
"\u001b[34mCollecting numpy>=1.17\n", - " Downloading numpy-1.19.5-cp36-cp36m-manylinux2010_x86_64.whl (14.8 MB)\u001b[0m\n", - "\u001b[34mCollecting absl-py~=0.10\n", - " Downloading absl_py-0.13.0-py3-none-any.whl (132 kB)\u001b[0m\n", - "\u001b[34mCollecting h5py~=2.10.0\n", - " Downloading h5py-2.10.0-cp36-cp36m-manylinux1_x86_64.whl (2.9 MB)\u001b[0m\n", - "\u001b[34mCollecting six~=1.15.0\n", - " Downloading six-1.15.0-py2.py3-none-any.whl (10 kB)\u001b[0m\n", - "\u001b[34mCollecting termcolor~=1.1.0\n", - " Downloading termcolor-1.1.0.tar.gz (3.9 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: wheel~=0.35 in /opt/conda/lib/python3.6/site-packages (from tensorflow==2.4.0->-r requirements.txt (line 2)) (0.35.1)\u001b[0m\n", - "\u001b[34mCollecting gast==0.3.3\n", - " Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)\u001b[0m\n", - "\u001b[34mCollecting keras-preprocessing~=1.1.2\n", - " Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)\u001b[0m\n", - "\u001b[34mCollecting tensorboard~=2.4\n", - " Downloading tensorboard-2.5.0-py3-none-any.whl (6.0 MB)\u001b[0m\n", - "\u001b[34mCollecting tensorflow-estimator<2.5.0,>=2.4.0rc0\n", - " Downloading tensorflow_estimator-2.4.0-py2.py3-none-any.whl (462 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: google-pasta~=0.2 in /opt/conda/lib/python3.6/site-packages (from tensorflow==2.4.0->-r requirements.txt (line 2)) (0.2.0)\u001b[0m\n", - "\u001b[34mCollecting flatbuffers~=1.12.0\n", - " Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)\u001b[0m\n", - "\u001b[34mCollecting grpcio~=1.32.0\n", - " Downloading grpcio-1.32.0-cp36-cp36m-manylinux2014_x86_64.whl (3.8 MB)\u001b[0m\n", - "\u001b[34mCollecting opt-einsum~=3.3.0\n", - " Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: protobuf>=3.9.2 in /opt/conda/lib/python3.6/site-packages (from tensorflow==2.4.0->-r requirements.txt (line 2)) (3.17.3)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: pillow!=8.3.0 in /opt/conda/lib/python3.6/site-packages (from pytorch-lightning==1.3.8->-r requirements.txt (line 3)) (8.3.1)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: fsspec[http]!=2021.06.0,>=2021.05.0 in /opt/conda/lib/python3.6/site-packages (from pytorch-lightning==1.3.8->-r requirements.txt (line 3)) (2021.7.0)\u001b[0m\n", - "\u001b[34mCollecting tensorboard~=2.4\n", - " Downloading tensorboard-2.4.1-py3-none-any.whl (10.6 MB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: future>=0.17.1 in /opt/conda/lib/python3.6/site-packages (from pytorch-lightning==1.3.8->-r requirements.txt (line 3)) (0.18.2)\u001b[0m\n", - "\u001b[34mCollecting pyDeprecate==0.3.0\n", - " Downloading pyDeprecate-0.3.0-py3-none-any.whl (10 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: torch>=1.4 in /opt/conda/lib/python3.6/site-packages (from pytorch-lightning==1.3.8->-r requirements.txt (line 3)) (1.8.1)\u001b[0m\n", - "\u001b[34mCollecting aiohttp\n", - " Downloading aiohttp-3.7.4.post0-cp36-cp36m-manylinux2014_x86_64.whl (1.3 MB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: pyparsing>=2.0.2 in /opt/conda/lib/python3.6/site-packages (from packaging->transformers==4.8.2->-r requirements.txt (line 1)) (2.4.7)\u001b[0m\n", - "\u001b[34mCollecting markdown>=2.6.8\n", - " Downloading Markdown-3.3.4-py3-none-any.whl (97 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: werkzeug>=0.11.15 in /opt/conda/lib/python3.6/site-packages (from 
tensorboard~=2.4->tensorflow==2.4.0->-r requirements.txt (line 2)) (2.0.1)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: setuptools>=41.0.0 in /opt/conda/lib/python3.6/site-packages (from tensorboard~=2.4->tensorflow==2.4.0->-r requirements.txt (line 2)) (49.6.0.post20210108)\u001b[0m\n", - "\u001b[34mCollecting google-auth<2,>=1.6.3\n", - " Downloading google_auth-1.33.1-py2.py3-none-any.whl (152 kB)\u001b[0m\n", - "\u001b[34mCollecting tensorboard-plugin-wit>=1.6.0\n", - " Downloading tensorboard_plugin_wit-1.8.0-py3-none-any.whl (781 kB)\u001b[0m\n", - "\u001b[34mCollecting google-auth-oauthlib<0.5,>=0.4.1\n", - " Downloading google_auth_oauthlib-0.4.4-py2.py3-none-any.whl (18 kB)\u001b[0m\n", - "\u001b[34mCollecting cachetools<5.0,>=2.0.0\n", - " Downloading cachetools-4.2.2-py3-none-any.whl (11 kB)\u001b[0m\n", - "\u001b[34mCollecting pyasn1-modules>=0.2.1\n", - " Downloading pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.6/site-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow==2.4.0->-r requirements.txt (line 2)) (4.7.2)\u001b[0m\n", - "\u001b[34mCollecting requests-oauthlib>=0.7.0\n", - " Downloading requests_oauthlib-1.3.0-py2.py3-none-any.whl (23 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.6/site-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow==2.4.0->-r requirements.txt (line 2)) (0.4.8)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.6/site-packages (from requests->transformers==4.8.2->-r requirements.txt (line 1)) (1.25.11)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.6/site-packages (from requests->transformers==4.8.2->-r requirements.txt (line 1)) (2.10)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.6/site-packages (from requests->transformers==4.8.2->-r requirements.txt (line 1)) (2021.5.30)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: chardet<5,>=3.0.2 in /opt/conda/lib/python3.6/site-packages (from requests->transformers==4.8.2->-r requirements.txt (line 1)) (3.0.4)\u001b[0m\n", - "\u001b[34mCollecting oauthlib>=3.0.0\n", - " Downloading oauthlib-3.1.1-py2.py3-none-any.whl (146 kB)\u001b[0m\n", - "\u001b[34mCollecting idna-ssl>=1.0\n", - " Downloading idna-ssl-1.1.0.tar.gz (3.4 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.6/site-packages (from aiohttp->fsspec[http]!=2021.06.0,>=2021.05.0->pytorch-lightning==1.3.8->-r requirements.txt (line 3)) (21.2.0)\u001b[0m\n", - "\u001b[34mCollecting multidict<7.0,>=4.5\n", - " Downloading multidict-5.1.0-cp36-cp36m-manylinux2014_x86_64.whl (141 kB)\u001b[0m\n", - "\u001b[34mCollecting yarl<2.0,>=1.0\n", - " Downloading yarl-1.6.3-cp36-cp36m-manylinux2014_x86_64.whl (293 kB)\u001b[0m\n", - "\u001b[34mCollecting async-timeout<4.0,>=3.0\n", - " Downloading async_timeout-3.0.1-py3-none-any.whl (8.2 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.6/site-packages (from importlib-metadata->transformers==4.8.2->-r requirements.txt (line 1)) (3.5.0)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: click in /opt/conda/lib/python3.6/site-packages (from sacremoses->transformers==4.8.2->-r requirements.txt (line 1)) 
(7.1.2)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: joblib in /opt/conda/lib/python3.6/site-packages (from sacremoses->transformers==4.8.2->-r requirements.txt (line 1)) (1.0.1)\u001b[0m\n", - "\u001b[34mBuilding wheels for collected packages: nltk, termcolor, wrapt, idna-ssl\n", - " Building wheel for nltk (setup.py): started\u001b[0m\n", - "\u001b[34m Building wheel for nltk (setup.py): finished with status 'done'\n", - " Created wheel for nltk: filename=nltk-3.2.5-py3-none-any.whl size=1392145 sha256=9714a3045f75b605113a6293be7734f152e3dfc984b9691a467ab240b5c65c78\n", - " Stored in directory: /root/.cache/pip/wheels/f2/7f/71/cb36468789a03b5e2908281c8e1ce093e6860258b6b61677d8\n", - " Building wheel for termcolor (setup.py): started\n", - " Building wheel for termcolor (setup.py): finished with status 'done'\n", - " Created wheel for termcolor: filename=termcolor-1.1.0-py3-none-any.whl size=4830 sha256=02618bbd1f51d46a44327305198f503aad3797e64c00afd564f59affcc65eb1e\n", - " Stored in directory: /root/.cache/pip/wheels/93/2a/eb/e58dbcbc963549ee4f065ff80a59f274cc7210b6eab962acdc\n", - " Building wheel for wrapt (setup.py): started\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[34m Building wheel for wrapt (setup.py): finished with status 'done'\n", - " Created wheel for wrapt: filename=wrapt-1.12.1-cp36-cp36m-linux_x86_64.whl size=69745 sha256=832a7237eca0923f28192c397a3ef4539b1743db6801ec363c365af7b34fecb3\n", - " Stored in directory: /root/.cache/pip/wheels/32/42/7f/23cae9ff6ef66798d00dc5d659088e57dbba01566f6c60db63\n", - " Building wheel for idna-ssl (setup.py): started\n", - " Building wheel for idna-ssl (setup.py): finished with status 'done'\n", - " Created wheel for idna-ssl: filename=idna_ssl-1.1.0-py3-none-any.whl size=3161 sha256=0131e2e71b70596af00c55cef8e55ab8b17066b6345ca8f9ff4a66d97e278b07\n", - " Stored in directory: /root/.cache/pip/wheels/6a/f5/9c/f8331a854f7a8739cf0e74c13854e4dd7b1af11b04fe1dde13\u001b[0m\n", - "\u001b[34mSuccessfully built nltk termcolor wrapt idna-ssl\u001b[0m\n", - "\u001b[34mInstalling collected packages: typing-extensions, six, pyasn1-modules, oauthlib, multidict, cachetools, yarl, requests-oauthlib, numpy, idna-ssl, google-auth, async-timeout, tqdm, tensorboard-plugin-wit, regex, markdown, grpcio, google-auth-oauthlib, filelock, aiohttp, absl-py, wrapt, torchmetrics, tokenizers, termcolor, tensorflow-estimator, tensorboard, sacremoses, pyDeprecate, opt-einsum, keras-preprocessing, huggingface-hub, h5py, gast, flatbuffers, astunparse, transformers, tensorflow, pytorch-lightning, nltk, nlpaug\n", - " Attempting uninstall: typing-extensions\n", - " Found existing installation: typing-extensions 3.10.0.0\n", - " Uninstalling typing-extensions-3.10.0.0:\u001b[0m\n", - "\u001b[34m Successfully uninstalled typing-extensions-3.10.0.0\n", - " Attempting uninstall: six\n", - " Found existing installation: six 1.16.0\n", - " Uninstalling six-1.16.0:\u001b[0m\n", - "\u001b[34m Successfully uninstalled six-1.16.0\n", - " Attempting uninstall: numpy\n", - " Found existing installation: numpy 1.19.1\n", - " Uninstalling numpy-1.19.1:\n", - " Successfully uninstalled numpy-1.19.1\u001b[0m\n", - "\u001b[34m Attempting uninstall: tqdm\n", - " Found existing installation: tqdm 4.51.0\n", - " Uninstalling tqdm-4.51.0:\n", - " Successfully uninstalled tqdm-4.51.0\u001b[0m\n", - "\u001b[34m Attempting uninstall: h5py\n", - " Found existing installation: h5py 2.8.0\n", - " Uninstalling h5py-2.8.0:\n", - " 
Successfully uninstalled h5py-2.8.0\u001b[0m\n", - "\u001b[34mSuccessfully installed absl-py-0.13.0 aiohttp-3.7.4.post0 astunparse-1.6.3 async-timeout-3.0.1 cachetools-4.2.2 filelock-3.0.12 flatbuffers-1.12 gast-0.3.3 google-auth-1.33.1 google-auth-oauthlib-0.4.4 grpcio-1.32.0 h5py-2.10.0 huggingface-hub-0.0.12 idna-ssl-1.1.0 keras-preprocessing-1.1.2 markdown-3.3.4 multidict-5.1.0 nlpaug-1.1.6 nltk-3.2.5 numpy-1.19.5 oauthlib-3.1.1 opt-einsum-3.3.0 pyDeprecate-0.3.0 pyasn1-modules-0.2.8 pytorch-lightning-1.3.8 regex-2021.7.6 requests-oauthlib-1.3.0 sacremoses-0.0.45 six-1.15.0 tensorboard-2.4.1 tensorboard-plugin-wit-1.8.0 tensorflow-2.4.0 tensorflow-estimator-2.4.0 termcolor-1.1.0 tokenizers-0.10.3 torchmetrics-0.4.1 tqdm-4.41.1 transformers-4.8.2 typing-extensions-3.7.4.3 wrapt-1.12.1 yarl-1.6.3\u001b[0m\n", - "\u001b[34mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n", - "\u001b[0m\n", - "\u001b[34m2021-07-21 10:14:25,854 sagemaker-training-toolkit INFO Invoking user script\n", - "\u001b[0m\n", - "\u001b[34mTraining Env:\n", - "\u001b[0m\n", - "\u001b[34m{\n", - " \"additional_framework_parameters\": {},\n", - " \"channel_input_dirs\": {\n", - " \"test\": \"/opt/ml/input/data/test\",\n", - " \"train\": \"/opt/ml/input/data/train\"\n", - " },\n", - " \"current_host\": \"algo-1\",\n", - " \"framework_module\": \"sagemaker_pytorch_container.training:main\",\n", - " \"hosts\": [\n", - " \"algo-1\"\n", - " ],\n", - " \"hyperparameters\": {\n", - " \"experiment_name\": \"en_language_subpillars\",\n", - " \"language_method\": \"keep\",\n", - " \"max_len\": 128,\n", - " \"model_name\": \"distilbert-base-uncased\",\n", - " \"tokenizer_name\": \"distilbert-base-uncased\",\n", - " \"epochs\": 1,\n", - " \"pred_threshold\": 0.38,\n", - " \"tracking_uri\": \"http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/\"\n", - " },\n", - " \"input_config_dir\": \"/opt/ml/input/config\",\n", - " \"input_data_config\": {\n", - " \"test\": {\n", - " \"TrainingInputMode\": \"File\",\n", - " \"S3DistributionType\": \"FullyReplicated\",\n", - " \"RecordWrapperType\": \"None\"\n", - " },\n", - " \"train\": {\n", - " \"TrainingInputMode\": \"File\",\n", - " \"S3DistributionType\": \"FullyReplicated\",\n", - " \"RecordWrapperType\": \"None\"\n", - " }\n", - " },\n", - " \"input_dir\": \"/opt/ml/input\",\n", - " \"is_master\": true,\n", - " \"job_name\": \"pytorch-2021-07-21-12-04-30-848-test\",\n", - " \"log_level\": 20,\n", - " \"master_hostname\": \"algo-1\",\n", - " \"model_dir\": \"/opt/ml/model\",\n", - " \"module_dir\": \"s3://sagemaker-deep-experiments-dev/training/input_data/pytorch-2021-07-21-12-04-30-848-test/pytorch-2021-07-21-12-04-30-848-test/source/sourcedir.tar.gz\",\n", - " \"module_name\": \"train\",\n", - " \"network_interface_name\": \"eth0\",\n", - " \"num_cpus\": 4,\n", - " \"num_gpus\": 1,\n", - " \"output_data_dir\": \"/opt/ml/output/data\",\n", - " \"output_dir\": \"/opt/ml/output\",\n", - " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", - " \"resource_config\": {\n", - " \"current_host\": \"algo-1\",\n", - " \"hosts\": [\n", - " \"algo-1\"\n", - " ],\n", - " \"network_interface_name\": \"eth0\"\n", - " },\n", - " \"user_entry_point\": \"train.py\"\u001b[0m\n", - "\u001b[34m}\n", - "\u001b[0m\n", - "\u001b[34mEnvironment variables:\n", - "\u001b[0m\n", - 
"\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", - "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", - "\u001b[34mSM_HPS={\"epochs\":1,\"experiment_name\":\"en_language_subpillars\",\"language_method\":\"keep\",\"max_len\":128,\"model_name\":\"distilbert-base-uncased\",\"pred_threshold\":0.38,\"tokenizer_name\":\"distilbert-base-uncased\",\"tracking_uri\":\"http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/\"}\u001b[0m\n", - "\u001b[34mSM_USER_ENTRY_POINT=train.py\u001b[0m\n", - "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", - "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", - "\u001b[34mSM_INPUT_DATA_CONFIG={\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", - "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", - "\u001b[34mSM_CHANNELS=[\"test\",\"train\"]\u001b[0m\n", - "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", - "\u001b[34mSM_MODULE_NAME=train\u001b[0m\n", - "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", - "\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main\u001b[0m\n", - "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", - "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", - "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", - "\u001b[34mSM_NUM_CPUS=4\u001b[0m\n", - "\u001b[34mSM_NUM_GPUS=1\u001b[0m\n", - "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", - "\u001b[34mSM_MODULE_DIR=s3://sagemaker-deep-experiments-dev/training/input_data/pytorch-2021-07-21-12-04-30-848-test/pytorch-2021-07-21-12-04-30-848-test/source/sourcedir.tar.gz\u001b[0m\n", - "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"test\":\"/opt/ml/input/data/test\",\"train\":\"/opt/ml/input/data/train\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_pytorch_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"epochs\":1,\"experiment_name\":\"en_language_subpillars\",\"language_method\":\"keep\",\"max_len\":128,\"model_name\":\"distilbert-base-uncased\",\"pred_threshold\":0.38,\"tokenizer_name\":\"distilbert-base-uncased\",\"tracking_uri\":\"http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/\"},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"pytorch-2021-07-21-12-04-30-848-test\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-deep-experiments-dev/training/input_data/pytorch-2021-07-21-12-04-30-848-test/pytorch-2021-07-21-12-04-30-848-test/source/sourcedir.tar.gz\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":1,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\u001b[0m\n", - 
"\u001b[34mSM_USER_ARGS=[\"--epochs\",\"1\",\"--experiment_name\",\"en_language_subpillars\",\"--language_method\",\"keep\",\"--max_len\",\"128\",\"--model_name\",\"distilbert-base-uncased\",\"--pred_threshold\",\"0.38\",\"--tokenizer_name\",\"distilbert-base-uncased\",\"--tracking_uri\",\"http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/\"]\u001b[0m\n", - "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", - "\u001b[34mSM_CHANNEL_TEST=/opt/ml/input/data/test\u001b[0m\n", - "\u001b[34mSM_CHANNEL_TRAIN=/opt/ml/input/data/train\u001b[0m\n", - "\u001b[34mSM_HP_EXPERIMENT_NAME=en_language_subpillars\u001b[0m\n", - "\u001b[34mSM_HP_LANGUAGE_METHOD=keep\u001b[0m\n", - "\u001b[34mSM_HP_MAX_LEN=128\u001b[0m\n", - "\u001b[34mSM_HP_MODEL_NAME=distilbert-base-uncased\u001b[0m\n", - "\u001b[34mSM_HP_TOKENIZER_NAME=distilbert-base-uncased\u001b[0m\n", - "\u001b[34mSM_HP_EPOCHS=1\u001b[0m\n", - "\u001b[34mSM_HP_PRED_THRESHOLD=0.38\u001b[0m\n", - "\u001b[34mSM_HP_TRACKING_URI=http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/\u001b[0m\n", - "\u001b[34mPYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages\n", - "\u001b[0m\n", - "\u001b[34mInvoking script with the following command:\n", - "\u001b[0m\n", - "\u001b[34m/opt/conda/bin/python3.6 train.py --epochs 1 --experiment_name en_language_subpillars --language_method keep --max_len 128 --model_name distilbert-base-uncased --pred_threshold 0.38 --tokenizer_name distilbert-base-uncased --tracking_uri http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/\n", - "\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "2021-07-21 10:14:49 Uploading - Uploading generated training model\n", - "2021-07-21 10:14:49 Failed - Training job failed\n", - "\u001b[34m[nltk_data] Downloading package averaged_perceptron_tagger to\u001b[0m\n", - "\u001b[34m[nltk_data] /root/nltk_data...\u001b[0m\n", - "\u001b[34m[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\u001b[0m\n", - "\u001b[34m[nltk_data] Downloading package wordnet to /root/nltk_data...\u001b[0m\n", - "\u001b[34m[nltk_data] Unzipping corpora/wordnet.zip.\u001b[0m\n", - "\u001b[34m[nltk_data] Downloading package omw to /root/nltk_data...\u001b[0m\n", - "\u001b[34m[nltk_data] Unzipping corpora/omw.zip.\u001b[0m\n", - "\u001b[34mimporting data ............\u001b[0m\n", - "\u001b[34m2021-07-21 10:14:30.789458: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\u001b[0m\n", - "\u001b[34mTraceback (most recent call last):\n", - " File \"train.py\", line 64, in \n", - " all_dataset = read_merge_data (args.training_dir, args.val_dir, data_format='pickle')\n", - " File \"/opt/ml/code/utils.py\", line 41, in read_merge_data\n", - " train_df = pd.read_pickle(\"f{TRAIN_PATH}/train.pickle\")\n", - " File \"/opt/conda/lib/python3.6/site-packages/pandas/io/pickle.py\", line 169, in read_pickle\n", - " f, fh = get_handle(fp_or_buf, \"rb\", compression=compression, is_text=False)\n", - " File \"/opt/conda/lib/python3.6/site-packages/pandas/io/common.py\", line 499, in get_handle\n", - " f = open(path_or_buf, mode)\u001b[0m\n", - "\u001b[34mFileNotFoundError: [Errno 2] No such file or directory: 'f{TRAIN_PATH}/train.pickle'\n", - "\u001b[0m\n", - "\u001b[34m2021-07-21 10:14:36,485 sagemaker-training-toolkit ERROR 
ExecuteUserScriptError:\u001b[0m\n", - "\u001b[34mCommand \"/opt/conda/bin/python3.6 train.py --epochs 1 --experiment_name en_language_subpillars --language_method keep --max_len 128 --model_name distilbert-base-uncased --pred_threshold 0.38 --tokenizer_name distilbert-base-uncased --tracking_uri http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/\"\u001b[0m\n", - "\u001b[34m2021-07-21 10:14:30.789458: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\u001b[0m\n", - "\u001b[34mTraceback (most recent call last):\n", - " File \"train.py\", line 64, in \n", - " all_dataset = read_merge_data (args.training_dir, args.val_dir, data_format='pickle')\n", - " File \"/opt/ml/code/utils.py\", line 41, in read_merge_data\n", - " train_df = pd.read_pickle(\"f{TRAIN_PATH}/train.pickle\")\n", - " File \"/opt/conda/lib/python3.6/site-packages/pandas/io/pickle.py\", line 169, in read_pickle\n", - " f, fh = get_handle(fp_or_buf, \"rb\", compression=compression, is_text=False)\n", - " File \"/opt/conda/lib/python3.6/site-packages/pandas/io/common.py\", line 499, in get_handle\n", - " f = open(path_or_buf, mode)\u001b[0m\n", - "\u001b[34mFileNotFoundError: [Errno 2] No such file or directory: 'f{TRAIN_PATH}/train.pickle'\u001b[0m\n" - ] - }, - { - "ename": "UnexpectedStatusException", - "evalue": "Error for Training job pytorch-2021-07-21-12-04-30-848-test: Failed. Reason: AlgorithmError: ExecuteUserScriptError:\nCommand \"/opt/conda/bin/python3.6 train.py --epochs 1 --experiment_name en_language_subpillars --language_method keep --max_len 128 --model_name distilbert-base-uncased --pred_threshold 0.38 --tokenizer_name distilbert-base-uncased --tracking_uri http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/\"\n2021-07-21 10:14:30.789458: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\nTraceback (most recent call last):\n File \"train.py\", line 64, in \n all_dataset = read_merge_data (args.training_dir, args.val_dir, data_format='pickle')\n File \"/opt/ml/code/utils.py\", line 41, in read_merge_data\n train_df = pd.read_pickle(\"f{TRAIN_PATH}/train.pickle\")\n File \"/opt/conda/lib/python3.6/site-packages/pandas/io/pickle.py\", line 169, in read_pickle\n f, fh = get_handle(fp_or_buf, \"rb\", compression=compression, is_text=False)\n File \"/opt/conda/lib/python3.6/site-packages/pandas/io/common", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mUnexpectedStatusException\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Fit the estimator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfit_arguments\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, inputs, wait, logs, job_name, experiment_config)\u001b[0m\n\u001b[1;32m 681\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjobs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 682\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 683\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 684\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 685\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_compilation_job_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, logs)\u001b[0m\n\u001b[1;32m 1626\u001b[0m \u001b[0;31m# If logs are requested, call logs_for_jobs.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1627\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlogs\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m\"None\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1628\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogs_for_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlog_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1629\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1630\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36mlogs_for_job\u001b[0;34m(self, job_name, wait, poll, log_type)\u001b[0m\n\u001b[1;32m 3658\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3659\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3660\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_job_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdescription\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"TrainingJobStatus\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3661\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdot\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3662\u001b[0m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36m_check_job_status\u001b[0;34m(self, job, desc, status_key_name)\u001b[0m\n\u001b[1;32m 3213\u001b[0m \u001b[0mreason\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdesc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"FailureReason\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"(No reason provided)\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3214\u001b[0m \u001b[0mjob_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstatus_key_name\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"JobStatus\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\" job\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3215\u001b[0;31m raise exceptions.UnexpectedStatusException(\n\u001b[0m\u001b[1;32m 3216\u001b[0m message=\"Error for {job_type} {job_name}: {status}. Reason: {reason}\".format(\n\u001b[1;32m 3217\u001b[0m \u001b[0mjob_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjob_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreason\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreason\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mUnexpectedStatusException\u001b[0m: Error for Training job pytorch-2021-07-21-12-04-30-848-test: Failed. Reason: AlgorithmError: ExecuteUserScriptError:\nCommand \"/opt/conda/bin/python3.6 train.py --epochs 1 --experiment_name en_language_subpillars --language_method keep --max_len 128 --model_name distilbert-base-uncased --pred_threshold 0.38 --tokenizer_name distilbert-base-uncased --tracking_uri http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/\"\n2021-07-21 10:14:30.789458: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\nTraceback (most recent call last):\n File \"train.py\", line 64, in \n all_dataset = read_merge_data (args.training_dir, args.val_dir, data_format='pickle')\n File \"/opt/ml/code/utils.py\", line 41, in read_merge_data\n train_df = pd.read_pickle(\"f{TRAIN_PATH}/train.pickle\")\n File \"/opt/conda/lib/python3.6/site-packages/pandas/io/pickle.py\", line 169, in read_pickle\n f, fh = get_handle(fp_or_buf, \"rb\", compression=compression, is_text=False)\n File \"/opt/conda/lib/python3.6/site-packages/pandas/io/common" - ] - } - ], + "outputs": [], "source": [ "# Fit the estimator\n", "\n", diff --git a/scripts/training/selim/multiclass-lightning/classes.py b/scripts/training/selim/multiclass-lightning/classes.py index e62efe5..6c6816f 100644 --- a/scripts/training/selim/multiclass-lightning/classes.py +++ b/scripts/training/selim/multiclass-lightning/classes.py @@ -1,11 +1,9 @@ -import os - from typing import Optional from tqdm.auto import tqdm import torchmetrics -from torchmetrics.functional import accuracy, f1, auroc +from torchmetrics.functional import auroc import pytorch_lightning as pl from pytorch_lightning.core.decorators import auto_move_data @@ -19,7 +17,6 @@ import pandas as pd from sklearn import metrics -import transformers from transformers import ( AdamW, AutoModel, @@ -28,42 
+25,37 @@ get_linear_schedule_with_warmup, ) -import tensorflow as tf class CustomDataset(Dataset): - def __init__(self, dataframe, tagname_to_tagid, tokenizer, max_len:int=128): + def __init__(self, dataframe, tagname_to_tagid, tokenizer, max_len: int = 128): self.tokenizer = tokenizer self.data = dataframe - self.excerpt_text = dataframe["excerpt"].tolist( - ) if dataframe is not None else None + self.excerpt_text = dataframe["excerpt"].tolist() if dataframe is not None else None - self.targets = self.data['target'].tolist( - ) if dataframe is not None else None + self.targets = self.data["target"].tolist() if dataframe is not None else None - self.entry_ids = self.data['entry_id'].tolist( - ) if dataframe is not None else None + self.entry_ids = self.data["entry_id"].tolist() if dataframe is not None else None self.tagname_to_tagid = tagname_to_tagid self.tagid_to_tagname = list(tagname_to_tagid.keys()) self.max_len = max_len - def encode_example(self, - excerpt_text: str, - index=None, - as_batch: bool = False): - - inputs = self.tokenizer(excerpt_text, - None, - truncation=True, - add_special_tokens=True, - max_length=self.max_len, - padding="max_length", - return_token_type_ids=True) - ids = inputs['input_ids'] - mask = inputs['attention_mask'] + def encode_example(self, excerpt_text: str, index=None, as_batch: bool = False): + + inputs = self.tokenizer( + excerpt_text, + None, + truncation=True, + add_special_tokens=True, + max_length=self.max_len, + padding="max_length", + return_token_type_ids=True, + ) + ids = inputs["input_ids"] + mask = inputs["attention_mask"] token_type_ids = inputs["token_type_ids"] - + targets = None if self.targets: target_indices = [ @@ -74,24 +66,18 @@ def encode_example(self, targets = np.zeros(len(self.tagname_to_tagid), dtype=np.int) targets[target_indices] = 1 encoded = { - 'ids': - torch.tensor(ids, dtype=torch.long), - 'mask': - torch.tensor(mask, dtype=torch.long), - 'token_type_ids': - torch.tensor(token_type_ids, dtype=torch.long), - 'targets': - torch.tensor(targets, dtype=torch.float32) - if targets is not None else None, - 'entry_id': - self.entry_ids[index] + "ids": torch.tensor(ids, dtype=torch.long), + "mask": torch.tensor(mask, dtype=torch.long), + "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long), + "targets": torch.tensor(targets, dtype=torch.float32) if targets is not None else None, + "entry_id": self.entry_ids[index], } if as_batch: return { "ids": encoded["ids"].unsqueeze(0), "mask": encoded["mask"].unsqueeze(0), - "token_type_ids": encoded["ids"].unsqueeze(0) + "token_type_ids": encoded["ids"].unsqueeze(0), } return encoded @@ -104,15 +90,19 @@ def __getitem__(self, index): class Model(nn.Module): - def __init__(self, model_name_or_path: str, num_labels:int, dropout_rate=0.3, output_length=384): + def __init__( + self, model_name_or_path: str, num_labels: int, dropout_rate=0.3, output_length=384 + ): super().__init__() self.l1 = AutoModel.from_pretrained(model_name_or_path) self.l2 = torch.nn.Dropout(dropout_rate) self.l3 = torch.nn.Linear(output_length, num_labels) - + def forward(self, inputs): - output = self.l1(inputs["ids"], - attention_mask=inputs["mask"],) + output = self.l1( + inputs["ids"], + attention_mask=inputs["mask"], + ) output = output.last_hidden_state output = self.l2(output) output = self.l3(output) @@ -120,44 +110,58 @@ def forward(self, inputs): class Transformer(pl.LightningModule): - def __init__(self, - model_name_or_path: str, - num_labels: int, - empty_dataset: CustomDataset, - 
training_loader, - val_loader, - weight_classes, - pred_threshold: float = .5, - learning_rate: float = 1e-5, - adam_epsilon: float = 1e-8, - warmup_steps: int = 500, - weight_decay: float = 0.1, - train_batch_size: int = 32, - eval_batch_size: int = 32, - eval_splits: Optional[list] = None, - dropout_rate: float = 0.3, - output_length=384, - - **kwargs): + def __init__( + self, + model_name_or_path: str, + tagname_to_tagid, + empty_dataset: CustomDataset, + train_dataset, + val_dataset, + train_params, + val_params, + weight_classes, + tokenizer, + pred_threshold: float = 0.5, + learning_rate: float = 1e-5, + adam_epsilon: float = 1e-8, + warmup_steps: int = 500, + weight_decay: float = 0.1, + train_batch_size: int = 32, + eval_batch_size: int = 32, + eval_splits: Optional[list] = None, + dropout_rate: float = 0.3, + max_len: int = 128, + output_length=384, + **kwargs, + ): + super().__init__() self.output_length = output_length self.save_hyperparameters() - self.num_labels = num_labels - self.model = Model(model_name_or_path, num_labels, dropout_rate, self.output_length) + self.tagname_to_tagid = tagname_to_tagid + self.num_labels = len(tagname_to_tagid) + self.max_len = max_len + self.model = Model(model_name_or_path, self.num_labels, dropout_rate, self.output_length) + self.tokenizer = tokenizer + self.val_params = val_params + if any(weight_classes): self.use_weights = True - self.weight_classes = torch.tensor(weight_classes).to('cuda') + self.weight_classes = torch.tensor(weight_classes).to("cuda") else: self.use_weights = False self.empty_dataset = empty_dataset self.pred_threshold = pred_threshold - self.val_loader = val_loader - self.training_loader = training_loader - + self.training_loader = self.get_loaders( + train_dataset, train_params, tagname_to_tagid, self.tokenizer, max_len + ) + self.val_loader = self.get_loaders( + val_dataset, val_params, tagname_to_tagid, self.tokenizer, max_len + ) self.f1_score_train = torchmetrics.F1( num_classes=2, - threshold=0.5, - average='macro', + threshold=self.pred_threshold, + average="macro", mdmc_average="samplewise", ignore_index=None, top_k=None, @@ -170,8 +174,8 @@ def __init__(self, self.f1_score_val = torchmetrics.F1( num_classes=2, - threshold=0.5, - average='macro', + threshold=self.pred_threshold, + average="macro", mdmc_average="samplewise", ignore_index=None, top_k=None, @@ -181,7 +185,7 @@ def __init__(self, process_group=None, dist_sync_fn=None, ) - + @auto_move_data def forward(self, inputs): output = self.model(inputs) @@ -190,60 +194,49 @@ def forward(self, inputs): def training_step(self, batch, batch_idx): outputs = self(batch) if self.use_weights: - loss = F.binary_cross_entropy_with_logits(outputs, - batch["targets"], - weight=self.weight_classes) + loss = F.binary_cross_entropy_with_logits( + outputs, batch["targets"], weight=self.weight_classes + ) else: - loss = F.binary_cross_entropy_with_logits(outputs, - batch["targets"]) + loss = F.binary_cross_entropy_with_logits(outputs, batch["targets"]) - self.f1_score_train(torch.sigmoid(outputs), - batch["targets"].to(dtype=torch.long)) + self.f1_score_train(torch.sigmoid(outputs), batch["targets"].to(dtype=torch.long)) self.log("train_f1", self.f1_score_train, prog_bar=True) return loss def validation_step(self, batch, batch_idx, dataloader_idx=0): outputs = self(batch) - val_loss = F.binary_cross_entropy_with_logits(outputs, - batch["targets"]) - - self.f1_score_val(torch.sigmoid(outputs), - batch["targets"].to(dtype=torch.long)) - self.log("val_f1", - self.f1_score_val, 
- on_step=True, - on_epoch=True, - prog_bar=True, - logger=False) - - self.log("val_loss", - val_loss, - on_step=True, - on_epoch=True, - prog_bar=True, - logger=False) - return {'val_loss': val_loss, 'val_f1': self.f1_score_val} + val_loss = F.binary_cross_entropy_with_logits(outputs, batch["targets"]) + + self.f1_score_val(torch.sigmoid(outputs), batch["targets"].to(dtype=torch.long)) + self.log( + "val_f1", self.f1_score_val, on_step=True, on_epoch=True, prog_bar=True, logger=False + ) + + self.log("val_loss", val_loss, on_step=True, on_epoch=True, prog_bar=True, logger=False) + return {"val_loss": val_loss, "val_f1": self.f1_score_val} def test_step(self, batch, batch_nb): logits = self(batch) - preds = (torch.sigmoid(logits) > .5) + preds = torch.sigmoid(logits) > 0.5 return {"preds": preds, "targets_i": batch["targets"]} def on_test_epoch_end(self, outputs): preds = torch.cat([output["preds"] for output in outputs]).cpu() targets = torch.cat([output["targets_i"] for output in outputs]).cpu() - recalls = [] - precisions = [] - f1_scores = [] + for i in range(targets.shape[1]): class_roc_auc = auroc(preds[:, i], targets[:, i]) - self.log( - f"{self.empty_dataset.sectorid_to_sectorname[i]}_roc_auc/Train", - class_roc_auc) + self.log(f"{self.empty_dataset.sectorid_to_sectorname[i]}_roc_auc/Train", class_roc_auc) class_f1 = metrics.f1_score(targets[:, i], preds[:, i]) - self.log( - f"{self.empty_dataset.sectorid_to_sectorname[i]}_f1/Train", - class_f1) + self.log(f"{self.empty_dataset.sectorid_to_sectorname[i]}_f1/Train", class_f1) + + def get_loaders(self, dataset, params, tagname_to_tagid, tokenizer, max_len: int = 128): + + set = CustomDataset(dataset, tagname_to_tagid, tokenizer, max_len) + + loader = DataLoader(set, **params) + return loader def predict_step(self, batch, batch_idx, dataloader_idx=None): output = self(batch) @@ -255,64 +248,72 @@ def on_predict_epoch_end(self, outputs): pred_classes = [] for pred in preds: pred_classes_i = [ - self.empty_dataset.sectorid_to_sectorname[i] - for i, p in enumerate(pred) if p + self.empty_dataset.sectorid_to_sectorname[i] for i, p in enumerate(pred) if p ] pred_classes.append(pred_classes_i) self.log({"pred_classes": pred_classes}) - def custom_predict(self, validation_loader, name:str, return_logits=False): + def custom_predict(self, validation_dataset, return_all=False): + validation_loader = self.get_loaders( + validation_dataset, self.val_params, self.tagname_to_tagid, self.tokenizer, self.max_len + ) if self.device.type == "cpu": self.to("cuda") self.eval() self.freeze() - indexes=torch.tensor([]) + indexes = [] + y_true = [] + logit_predictions = [] with torch.no_grad(): - iter=0 - for batch in tqdm(validation_loader, total=len(validation_loader.dataset)//validation_loader.batch_size): - - logits = self({"ids": batch["ids"].to('cuda'), - "mask": batch["mask"].to('cuda'), - "token_type_ids": batch["token_type_ids"].to('cuda')}) + for batch in tqdm( + validation_loader, + total=len(validation_loader.dataset) // validation_loader.batch_size, + ): + + logits = self( + { + "ids": batch["ids"].to("cuda"), + "mask": batch["mask"].to("cuda"), + "token_type_ids": batch["token_type_ids"].to("cuda"), + } + ) + + y_true.append(batch["targets"].numpy().astype(np.int)) + indexes.append(batch["entry_id"].numpy().astype(np.int)) + logits_to_array = np.array([np.array(t) for t in logits.cpu()]) - - if return_logits: - if iter==0: - - predictions = logits_to_array - indexes = batch["entry_id"] - - else: - predictions = np.concatenate([predictions, 
logits_to_array], 0) #.append(preds_batch) - indexes = tf.concat([indexes, batch["entry_id"]], 0) - - iter += 1 - - else: - preds_batch = np.zeros(logits.shape, dtype=np.int) - preds_batch[(torch.sigmoid(logits) >= self.pred_threshold).cpu().nonzero(as_tuple=True)] = 1 - if iter==0: - predictions = preds_batch - indexes = batch["entry_id"] - - else: - predictions = np.concatenate([predictions,preds_batch], 0) #.append(preds_batch) - indexes = tf.concat([indexes, batch["entry_id"]], 0) - - iter += 1 - - np.save('predictions-'+name, np.array(predictions)) - np.save('indexes-'+name, np.array(indexes)) - return np.array(predictions), np.array(indexes) + logit_predictions.append(logits_to_array) + + y_true = np.concatenate(y_true) + indexes = np.concatenate(indexes) + logit_predictions = np.concatenate(logit_predictions) + + predictions = [] + # postprocess predictions + for i in range(logit_predictions.shape[0]): + row_pred = np.array([0] * self.num_labels) + row_logits = logit_predictions[i, :] + if np.all(row_logits < self.pred_threshold): + row_pred[np.argmax(row_logits)] = 1 + else: + row_pred[row_logits > self.pred_threshold] = 1 + + predictions.append(row_pred) + + if return_all: + return np.array(predictions), y_true, indexes, logit_predictions + + return np.array(predictions), y_true def total_steps(self) -> int: """The number of total training steps that will be run. Used for lr scheduler purposes.""" self.dataset_size = len(self.train_dataloader().dataset) num_devices = max(1, self.hparams.gpus) # TODO: consider num_tpu_cores - effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices - return (self.dataset_size / - effective_batch_size) * self.hparams.max_epochs + effective_batch_size = ( + self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices + ) + return (self.dataset_size / effective_batch_size) * self.hparams.max_epochs def configure_optimizers(self): "Prepare optimizer and schedule (linear warmup and decay)" @@ -321,34 +322,29 @@ def configure_optimizers(self): optimizer_grouped_parameters = [ { "params": [ - p for n, p in model.named_parameters() - if not any(nd in n for nd in no_decay) + p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], - "weight_decay": - self.hparams.weight_decay, + "weight_decay": self.hparams.weight_decay, }, { "params": [ - p for n, p in model.named_parameters() - if any(nd in n for nd in no_decay) + p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], - "weight_decay": - 0.0, + "weight_decay": 0.0, }, ] - optimizer = AdamW(optimizer_grouped_parameters, - lr=self.hparams.learning_rate, - eps=self.hparams.adam_epsilon) + optimizer = AdamW( + optimizer_grouped_parameters, + lr=self.hparams.learning_rate, + eps=self.hparams.adam_epsilon, + ) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.hparams.warmup_steps, - num_training_steps=self.total_steps()) - scheduler = { - 'scheduler': scheduler, - 'interval': 'step', - 'frequency': 1 - } + num_training_steps=self.total_steps(), + ) + scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1} return [optimizer], [scheduler] def train_dataloader(self): @@ -356,44 +352,85 @@ def train_dataloader(self): def val_dataloader(self): return self.val_loader - - def custom_eval(self, eval_dataloader): - if self.device.type == "cpu": - self.to("cuda") - self.eval() - self.freeze() - preds_val_all = [] - y_true = [] - - with torch.no_grad(): - for 
batch in tqdm(eval_dataloader, total=len(eval_dataloader.dataset)//eval_dataloader.batch_size): - logits = self({"ids": batch["ids"].to("cuda"), "mask": batch["mask"].to("cuda"), "token_type_ids": batch["token_type_ids"].to("cuda")}) - preds_batch = np.zeros(logits.shape, dtype=np.int) - preds_batch[(torch.sigmoid(logits) > self.pred_threshold).cpu().nonzero(as_tuple=True)] = 1 - preds_val_all.append(preds_batch) - y_true.append(batch["targets"].numpy().astype(np.int)) - - preds_val_all = np.concatenate(preds_val_all) - y_true = np.concatenate(y_true) + def compute_metrics(self, preds_val_all, y_true, tagname_to_tagid): f1_scores = [] recalls = [] precisions = [] accuracies = [] - supports = [] - tagname_to_tagid = self.empty_dataset.tagname_to_tagid + for tag_name, tag_id in tagname_to_tagid.items(): - cls_rprt = metrics.classification_report(y_true[:, tag_id], preds_val_all[:, tag_id], output_dict=True) + cls_rprt = metrics.classification_report( + y_true[:, tag_id], preds_val_all[:, tag_id], output_dict=True + ) precisions.append(cls_rprt["macro avg"]["precision"]) recalls.append(cls_rprt["macro avg"]["recall"]) f1_scores.append(cls_rprt["macro avg"]["f1-score"]) accuracies.append(cls_rprt["accuracy"]) - metrics_df = pd.DataFrame({ - "Sector": list(tagname_to_tagid.keys()), - "Precision": precisions, - "Recall": recalls, - "F1 Score": f1_scores, - "Accuracy": accuracies, - }) + metrics_df = pd.DataFrame( + { + "Sector": list(tagname_to_tagid.keys()), + "Precision": precisions, + "Recall": recalls, + "F1 Score": f1_scores, + "Accuracy": accuracies, + } + ) + metrics_df.loc["mean"] = metrics_df.mean() return metrics_df + + def get_results_pillar(self, preds_val_all, y_true): + list_tags_subpillars = list(self.tagname_to_tagid.keys()) + list_tags_pillars = sorted( + list(set([tags.split("->")[0] for tags in list_tags_subpillars])) + ) + pillars_tagname_to_tagid = {tag: i for i, tag in enumerate(list(sorted(list_tags_pillars)))} + + n_subpillars_per_pillar = [ + len(list(filter(lambda x: pillar in x, list_tags_subpillars))) + for pillar in list_tags_pillars + ] + + def subpillars_to_pillars(y): + print(y.shape) + pillars_true_y = [] + for row_nb in range(y.shape[0]): + row = y[row_nb, :] + nb_pillars = len(n_subpillars_per_pillar) + result = np.array([0] * nb_pillars) + count = 0 + for pillar_nb in range(nb_pillars): + + supillar_nb_tmp = n_subpillars_per_pillar[pillar_nb] + + if np.any(row[count : count + supillar_nb_tmp]): + result[pillar_nb] = 1 + + count += supillar_nb_tmp + + if np.any(row[count:]): + result[-1] = 1 + + pillars_true_y.append(result) + + return np.array(pillars_true_y) + + y_true_pillars = subpillars_to_pillars(y_true) + preds_val_pillars = subpillars_to_pillars(preds_val_all) + + return self.compute_metrics(preds_val_pillars, y_true_pillars, pillars_tagname_to_tagid) + + def custom_eval( + self, validation_dataset, save_results=True, subpillar_path=None, pillar_path=None + ): + + preds_val_all, y_true = self.custom_predict(validation_dataset) + metrics_subpillars = self.compute_metrics(preds_val_all, y_true, self.tagname_to_tagid) + metrics_pillars = self.get_results_pillar(preds_val_all, y_true) + + if save_results: + metrics_subpillars.to_csv(subpillar_path) + metrics_pillars.to_csv(pillar_path) + + return metrics_pillars, metrics_subpillars diff --git a/scripts/training/selim/multiclass-lightning/generate_models.py b/scripts/training/selim/multiclass-lightning/generate_models.py index 84f450d..7e69723 100644 --- 
a/scripts/training/selim/multiclass-lightning/generate_models.py +++ b/scripts/training/selim/multiclass-lightning/generate_models.py @@ -1,69 +1,52 @@ import os from pytorch_lightning.loggers import TensorBoardLogger import pytorch_lightning as pl +from transformers import AutoTokenizer -from classes import * -from utils import tagname_to_id -def get_loaders (train_dataset, - val_dataset, - train_params, - val_params, - tagname_to_tagid, - tokenizer, - max_len:int): - - training_set = CustomDataset(train_dataset, tagname_to_tagid, tokenizer, max_len) - val_set = CustomDataset(val_dataset, tagname_to_tagid, tokenizer, max_len) - - training_loader = DataLoader(training_set, **train_params) - val_loader = DataLoader(val_set, **val_params) - return training_loader, val_loader +from classes import CustomDataset, Transformer +from utils import ( + tagname_to_id, +) -def train_on_specific_targets (train_dataset, - val_dataset, - name_classifier:str, - dirpath:str, - MODEL_NAME:str, - tokenizer, - early_stopping_callback, - checkpoint_callback, - dropout_rate:float, - train_params, - val_params, - gpu_nb:int, - MAX_EPOCHS:int, - weight_decay=0.01, - warmup_steps=500, - output_length=384, - max_len=128, - weight_classes=None, - learning_rate=3e-5, - pred_threshold:float=0.5): + +def train_on_specific_targets( + train_dataset, + val_dataset, + name_classifier: str, + dirpath: str, + MODEL_NAME: str, + TOKENIZER_NAME: str, + early_stopping_callback, + checkpoint_callback, + dropout_rate: float, + train_params, + val_params, + gpu_nb: int, + MAX_EPOCHS: int, + weight_decay=0.02, + warmup_steps=500, + output_length=384, + max_len=128, + weight_classes=None, + learning_rate=3e-5, + pred_threshold: float = 0.5, +): """ main function used to train model """ if not os.path.exists(dirpath): os.makedirs(dirpath) - train_dataset = train_dataset[['entry_id', 'excerpt', 'target']] - val_dataset = val_dataset[['entry_id', 'excerpt', 'target']] + train_dataset = train_dataset[["entry_id", "excerpt", "target"]] + val_dataset = val_dataset[["entry_id", "excerpt", "target"]] logger = TensorBoardLogger("lightning_logs", name=name_classifier) - tagname_to_tagid = tagname_to_id (train_dataset["target"]) + tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + tagname_to_tagid = tagname_to_id(train_dataset["target"]) empty_dataset = CustomDataset(None, tagname_to_tagid, tokenizer, max_len) - - training_loader, val_loader = get_loaders (train_dataset, - val_dataset, - train_params, - val_params, - tagname_to_tagid, - tokenizer, - max_len - ) - trainer = pl.Trainer( logger=logger, @@ -76,36 +59,39 @@ def train_on_specific_targets (train_dataset, accumulate_grad_batches=1, max_epochs=MAX_EPOCHS, gradient_clip_val=1, - gradient_clip_algorithm='norm' - #overfit_batches=1, - #limit_predict_batches=2, - #limit_test_batches=2, - #fast_dev_run=True, - #limit_train_batches=1, - #limit_val_batches=1, - #limit_test_batches: Union[int, float] = 1.0, + gradient_clip_algorithm="norm" + # overfit_batches=1, + # limit_predict_batches=2, + # limit_test_batches=2, + # fast_dev_run=True, + # limit_train_batches=1, + # limit_val_batches=1, + # limit_test_batches: Union[int, float] = 1.0, ) - - model = Transformer(MODEL_NAME, - len(tagname_to_tagid), - empty_dataset, - training_loader, - val_loader, - weight_classes = weight_classes, - gpus=gpu_nb, - precision=16, - plugin='deepspeed_stage_3_offload', - accumulate_grad_batches=1, - max_epochs=MAX_EPOCHS, - dropout_rate=dropout_rate, - weight_decay=weight_decay, - 
-                        warmup_steps=warmup_steps,
-                        output_length=output_length,
-                        learning_rate=learning_rate,
-                        pred_threshold=pred_threshold
-                        )
+    model = Transformer(
+        MODEL_NAME,
+        tagname_to_tagid,
+        empty_dataset,
+        train_dataset=train_dataset,
+        val_dataset=val_dataset,
+        train_params=train_params,
+        val_params=val_params,
+        weight_classes=weight_classes,
+        tokenizer=tokenizer,
+        gpus=gpu_nb,
+        precision=16,
+        plugin="deepspeed_stage_3_offload",
+        accumulate_grad_batches=1,
+        max_epochs=MAX_EPOCHS,
+        dropout_rate=dropout_rate,
+        weight_decay=weight_decay,
+        warmup_steps=warmup_steps,
+        output_length=output_length,
+        learning_rate=learning_rate,
+        pred_threshold=pred_threshold,
+    )
 
     trainer.fit(model)
 
-    return model
\ No newline at end of file
+    return model
diff --git a/scripts/training/selim/multiclass-lightning/train.py b/scripts/training/selim/multiclass-lightning/train.py
index 89a5300..80592f0 100644
--- a/scripts/training/selim/multiclass-lightning/train.py
+++ b/scripts/training/selim/multiclass-lightning/train.py
@@ -2,22 +2,18 @@
 import os
 import argparse
 import logging
-import pickle
-from pathlib import Path
-import random
-import pytorch_lightning as pl
 from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
-import pandas as pd
-import torch
-from transformers import (
-    AutoTokenizer,
-    AdamW,
-)
+import timeit
 
-from utils import *
-from generate_models import *
+from utils import (
+    read_merge_data,
+    preprocess_data,
+    tagname_to_id,
+    compute_weights,
+)
+from generate_models import train_on_specific_targets
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -30,15 +26,18 @@
     parser.add_argument("--max_len", type=int, default=128)
     parser.add_argument("--warmup_steps", type=int, default=100)
     parser.add_argument("--weight_decay", type=float, default=0.01)
-    parser.add_argument("--learning_rate", type=str, default=3e-5)
-    parser.add_argument("--dropout_rate", type=str, default=0.3)
-    parser.add_argument("--pred_threshold", type=str, default=0.5)
-
-    parser.add_argument("--model_name", type=str, default='microsoft/xtremedistil-l6-h384-uncased')
-    parser.add_argument("--tokenizer_name", type=str, default='microsoft/xtremedistil-l6-h384-uncased')
-    #parser.add_argument("--log_every_n_steps", type=int, default=10)
-    #parser.add_argument("--n_classes", type=int, default=6)
-    parser.add_argument("--method_language", type=str, default='keep all')
+    parser.add_argument("--learning_rate", type=float, default=3e-5)
+    parser.add_argument("--dropout_rate", type=float, default=0.3)
+    parser.add_argument("--pred_threshold", type=float, default=0.5)
+    parser.add_argument("--output_length", type=int, default=384)
+
+    parser.add_argument("--model_name", type=str, default="microsoft/xtremedistil-l6-h384-uncased")
+    parser.add_argument(
+        "--tokenizer_name", type=str, default="microsoft/xtremedistil-l6-h384-uncased"
+    )
+    # parser.add_argument("--log_every_n_steps", type=int, default=10)
+    # parser.add_argument("--n_classes", type=int, default=6)
+    parser.add_argument("--method_language", type=str, default="keep all")
 
     # Data, model, and output directories
     parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
@@ -48,7 +47,6 @@
     parser.add_argument("--val_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
 
     args, _ = parser.parse_known_args()
-
     # Set up logging
     logger = logging.getLogger(__name__)
     logging.basicConfig(
@@ -61,47 +59,18 @@
     ########################################
-    def read_merge_data (TRAIN_PATH, VAL_PATH, data_format:str='csv'):
-        print(f"{TRAIN_PATH}/train.pickle")
-
-        if data_format=='pickle':
-            train_df = pd.read_pickle(f"{TRAIN_PATH}/train.pickle")
-            val_df = pd.read_pickle(f"{VAL_PATH}/val.pickle")
-
-        else:
-            train_df = pd.read_csv(TRAIN_PATH)
-            val_df = pd.read_csv(VAL_PATH)
-
-        all_dataset = pd.concat([train_df, val_df])[['entry_id', 'excerpt', 'subpillars', 'language']]\
-                        .rename(columns={'subpillars':'target'})
-
-        # Keep only unique values in pillars
-        all_dataset["target"] = all_dataset["target"].apply(lambda x: clean_rows (x))
-
-        # Keep only rows with a not empty pillar
-        all_dataset = all_dataset[all_dataset.target.apply(lambda x: len(x)>0)][['entry_id', 'excerpt', 'target', 'language']]
-        return all_dataset
-
-    print('importing data ............')
-    all_dataset = read_merge_data (args.training_dir, args.val_dir, data_format='pickle')
-
-    train_params = {
-        'batch_size': args.train_batch_size,
-        'shuffle': True,
-        'num_workers': 2
-    }
+    print("importing data ............")
+    all_dataset = read_merge_data(args.training_dir, args.val_dir, data_format="pickle")
+
+    train_params = {"batch_size": args.train_batch_size, "shuffle": True, "num_workers": 4}
 
-    val_params = {
-        'batch_size': args.val_batch_size,
-        'shuffle': False,
-        'num_workers': 2
-    }
-
-    train_df, val_df = preprocess_data(all_dataset,
-                                       perform_augmentation=False,
-                                       method='keep en')
-
-    tags_ids = tagname_to_id (all_dataset.target)
+    val_params = {"batch_size": args.val_batch_size, "shuffle": False, "num_workers": 4}
+
+    train_df, val_df = preprocess_data(
+        all_dataset, perform_augmentation=False, method=args.method_language
+    )
+
+    tags_ids = tagname_to_id(all_dataset.target)
     list_tags = list(tags_ids.keys())
     number_data_classes = []
@@ -109,63 +78,62 @@ def read_merge_data (TRAIN_PATH, VAL_PATH, data_format:str='csv'):
         nb_data_in_class = train_df.target.apply(lambda x: tag in (x)).sum()
         number_data_classes.append(nb_data_in_class)
 
-    weights = compute_weights (number_data_classes, train_df.shape[0])
-
-    over_sampled_targets = []
-    for i in range (len(weights)):
-        if weights[i]>5:
-            weights[i]=weights[i]**1.5
+    weights = compute_weights(number_data_classes, train_df.shape[0])
+    weights = [weight if weight < 5 else weight ** 2 for weight in weights]
 
     log_dir_name = "-".join(args.model_name.split("/"))
-    PATH_NAME = log_dir_name + '-subpillars-' + args.method_language
+    PATH_NAME = log_dir_name + "-subpillars-" + args.method_language
     if not os.path.exists(PATH_NAME):
         os.makedirs(PATH_NAME)
     os.chdir(PATH_NAME)
 
-    early_stopping_callback = EarlyStopping(monitor='val_f1',
-                                            patience=2,
-                                            mode='max')
+    early_stopping_callback = EarlyStopping(monitor="val_f1", patience=2, mode="max")
 
     checkpoint_callback_params = {
-        'save_top_k': 1,
-        'verbose': True,
-        'monitor': "val_f1",
-        'mode': "max"
+        "save_top_k": 1,
+        "verbose": True,
+        "monitor": "val_f1",
+        "mode": "max",
     }
 
     dirpath_pillars = f"./checkpoints-subpillars-{log_dir_name}"
     checkpoint_callback_pillars = ModelCheckpoint(
-        dirpath=dirpath_pillars,
-        **checkpoint_callback_params
+        dirpath=dirpath_pillars, **checkpoint_callback_params
+    )
+
+    en_model_subpillars = train_on_specific_targets(
+        train_df,
+        val_df,
+        f"subpillars-{log_dir_name}-",
+        dirpath_pillars,
+        args.model_name,
+        args.tokenizer_name,
+        early_stopping_callback,
+        checkpoint_callback_pillars,
+        gpu_nb=1,
+        train_params=train_params,
+        val_params=val_params,
+        MAX_EPOCHS=args.epochs,
+        dropout_rate=args.dropout_rate,
+        weight_classes=weights,
+        weight_decay=args.weight_decay,
+        learning_rate=args.learning_rate,
+        max_len=args.max_len,
+        warmup_steps=args.warmup_steps,
+        pred_threshold=float(args.pred_threshold),
+        output_length=args.output_length,
     )
 
-    en_tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
-    print('begin training ............')
-    en_model_pillars = train_on_specific_targets(train_df,
-                                                 val_df,
-                                                 f"subpillars-{log_dir_name}-",
-                                                 dirpath_pillars,
-                                                 args.model_name,
-                                                 en_tokenizer,
-                                                 early_stopping_callback,
-                                                 checkpoint_callback_pillars,
-                                                 gpu_nb=args.n_gpus,
-                                                 train_params=train_params,
-                                                 val_params=val_params,
-                                                 MAX_EPOCHS=args.epochs,
-                                                 dropout_rate=args.dropout_rate,
-                                                 weight_classes=weights,
-                                                 weight_decay=args.weight_decay,
-                                                 learning_rate=args.learning_rate,
-                                                 max_len=args.max_len,
-                                                 warmup_steps=args.warmup_steps,
-                                                 pred_threshold=args.pred_threshold)
-
-    _ , val_loader = get_loaders (train_df, val_df, train_params, val_params, tags_ids, en_tokenizer)
-
-
-
-    metrics = en_model_pillars.custom_eval(val_loader, )
-    metrics.loc['mean'] = metrics.mean()
-
-    print (metrics.loc['mean'])
+    start = timeit.default_timer()
+
+    # pass the output paths as keyword arguments so they are not consumed by save_results,
+    # and build them without spaces around the path separator
+    metrics_pillars, metrics_subpillars = en_model_subpillars.custom_eval(
+        val_df,
+        subpillar_path=f"{args.output_data_dir}/results_subpillars.csv",
+        pillar_path=f"{args.output_data_dir}/results_pillars.csv",
+    )
+
+    stop = timeit.default_timer()
+
+    print("Time to predict 100 sentences: ", 100 * (stop - start) / val_df.shape[0])
+    print("subpillars: ", metrics_subpillars.loc["mean"])
+    print("pillars: ", metrics_pillars.loc["mean"])
diff --git a/scripts/training/selim/multiclass-lightning/utils.py b/scripts/training/selim/multiclass-lightning/utils.py
index 81e9647..7c1a1cc 100644
--- a/scripts/training/selim/multiclass-lightning/utils.py
+++ b/scripts/training/selim/multiclass-lightning/utils.py
@@ -1,5 +1,5 @@
 from ast import literal_eval
-import os
+
 from sklearn.model_selection import train_test_split
 import numpy as np
 import pandas as pd
@@ -7,40 +7,70 @@
 import nlpaug.augmenter.word as naw
 import nltk
 
-nltk.download('averaged_perceptron_tagger')
-nltk.download('wordnet')
-nltk.download('omw')
+
+nltk.download("averaged_perceptron_tagger")
+nltk.download("wordnet")
+nltk.download("omw")
 
 import warnings
 
-warnings.filterwarnings('ignore')
-###################################### GENERAL UTIL FUNCTIONS ############################################
+warnings.filterwarnings("ignore")
+
 
-def clean_rows (row):
+def clean_rows(row):
     """
     1) Apply litteral evaluation
     2) Drop values that are repeated multiple times in rows
     """
     return list(set(literal_eval(row)))
 
-def tagname_to_id (target):
+
+def tagname_to_id(target):
     """
     Assign id to each tag
     """
     tag_set = set()
     for tags_i in target:
         tag_set.update(tags_i)
-    tagname_to_tagid = {tag:i for i, tag in enumerate(list(sorted(tag_set)))}
+    tagname_to_tagid = {tag: i for i, tag in enumerate(list(sorted(tag_set)))}
     return tagname_to_tagid
 
-########################################### DATA PREPROCESSING AND AUGMENTATION ####################################
-def preprocess_data (dataset,
-                     n_synonym_augmenter=1,
-                     n_swap=1,
-                     perform_augmentation:bool=True,
-                     method='keep all',
-                     language_chosen:str='en'):
+def read_merge_data(TRAIN_PATH, VAL_PATH, data_format: str = "csv"):
+
+    if data_format == "pickle":
+        train_df = pd.read_pickle(f"{TRAIN_PATH}/train.pickle")
+        val_df = pd.read_pickle(f"{VAL_PATH}/val.pickle")
+
+    else:
+        train_df = pd.read_csv(TRAIN_PATH)
+        val_df = pd.read_csv(VAL_PATH)
+
+    all_dataset = pd.concat([train_df, val_df])[
+        ["entry_id", "excerpt", "subpillars", "language"]
+    ].rename(columns={"subpillars": "target"})
+
+    # Keep only unique values in pillars
+    all_dataset["target"] = all_dataset["target"].apply(lambda x: clean_rows(x))
+
+    # Keep only rows with a not empty pillar
+    all_dataset = all_dataset[all_dataset.target.apply(lambda x: len(x) > 0)][
+        ["entry_id", "excerpt", "target", "language"]
+    ]
+    return all_dataset
+
+
+# DATA PREPROCESSING AND AUGMENTATION
+
+
+def preprocess_data(
+    dataset,
+    n_synonym_augmenter=1,
+    n_swap=1,
+    perform_augmentation: bool = True,
+    method="keep all",
+    language_chosen: str = "en",
+):
     """
     1) filter with respect to language
     2) perform augmentation
@@ -49,60 +79,61 @@ def preprocess_data (dataset,
 
     df = dataset.copy()
 
-    if method=='keep en':
-        df = df[df.language==language_chosen]
-    elif method=='omit en':
-        df = df[df.language!=language_chosen]
+    if method == "keep en":
+        df = df[df.language == language_chosen]
+    elif method == "omit en":
+        df = df[df.language != language_chosen]
 
-    df = df[['entry_id', 'excerpt', 'target']]
+    df = df[["entry_id", "excerpt", "target"]]
 
     if perform_augmentation:
         train_data, test_data = train_test_split(df, test_size=0.3)
         return augment_data(train_data, n_synonym_augmenter, n_swap), test_data
-    else:
+    else:
         return train_test_split(df, test_size=0.2)
 
-def augment_data (df, n_synonym, n_swap):
+
+def augment_data(df, n_synonym, n_swap):
     """
     1) Augment with synonym
     2) Apply swap on new (augmented with synonym) dataframe
     """
     if n_synonym:
-        syn_aug_en = naw.SynonymAug(lang='eng', aug_min=3, aug_p=0.4)
-        syn_aug_fr = naw.SynonymAug(lang='fra', aug_min=3, aug_p=0.4)
-        syn_aug_es = naw.SynonymAug(lang='spa', aug_min=3, aug_p=0.4)
+        syn_aug_en = naw.SynonymAug(lang="eng", aug_min=3, aug_p=0.4)
+        syn_aug_fr = naw.SynonymAug(lang="fra", aug_min=3, aug_p=0.4)
+        syn_aug_es = naw.SynonymAug(lang="spa", aug_min=3, aug_p=0.4)
 
-        en_syn = df[df.language=='en']
-        fr_syn = df[df.language=='fr']
-        es_syn = df[df.language=='es']
+        en_syn = df[df.language == "en"]
+        fr_syn = df[df.language == "fr"]
+        es_syn = df[df.language == "es"]
 
         en_syn.excerpt = en_syn.excerpt.apply(lambda x: syn_aug_en.augment(x, n=n_synonym))
         fr_syn.excerpt = fr_syn.excerpt.apply(lambda x: syn_aug_fr.augment(x, n=n_synonym))
         es_syn.excerpt = es_syn.excerpt.apply(lambda x: syn_aug_es.augment(x, n=n_synonym))
 
         whole_synoynm = pd.concat([en_syn, fr_syn, es_syn])
-
+
         for _, row in whole_synoynm.iterrows():
             excerpts = row.excerpt
-            for i in range (0,n_synonym):
+            for i in range(0, n_synonym):
                 row.excerpt = excerpts[i]
                 df = df.append(row)
 
     if n_swap:
-        swap = naw.RandomWordAug(action='swap', aug_min=3, aug_max=5)
+        swap = naw.RandomWordAug(action="swap", aug_min=3, aug_max=5)
         swap_df = df
         swap_df.excerpt = swap_df.excerpt.apply(lambda x: swap.augment(x, n=n_swap))
 
         for _, row in swap_df.iterrows():
             excerpts = row.excerpt
-            for i in range (0,n_swap):
+            for i in range(0, n_swap):
                 row.excerpt = excerpts[i]
                 df = df.append(row)
-
     return df
 
-def compute_weights (number_data_classes, n_tot):
+
+def compute_weights(number_data_classes, n_tot):
     """
     weights computation for weighted loss function
     INPUTS:
@@ -112,86 +143,94 @@
     OUTPUT: list of weights used for training
     """
-
+
     number_classes = len(number_data_classes)
-    return list([np.sqrt(n_tot / (number_classes * number_data_class)) for number_data_class in number_data_classes])
+    return list(
+        [
+            np.sqrt(n_tot / (number_classes * number_data_class))
+            for number_data_class in number_data_classes
+        ]
+    )
 
-########################################### EVALUATION #########################################
+# EVALUATION
+
 
 def perfectEval(anonstring):
     try:
         ev = literal_eval(anonstring)
         return ev
     except ValueError:
-        corrected = "\'" + anonstring + "\'"
+        corrected = "'" + anonstring + "'"
         ev = literal_eval(corrected)
         return ev
 
-def fill_column (row, tagname_to_tagid):
+
+def fill_column(row, tagname_to_tagid):
     """
     function to return proper labels (for relevance column and for sectors column)
     """
     values_to_fill = row
     n_labels = len(tagname_to_tagid)
-    row = [0]*n_labels
+    row = [0] * n_labels
    for target_tmp in values_to_fill:
-        row[tagname_to_tagid[target_tmp]]=1
+        row[tagname_to_tagid[target_tmp]] = 1
     return row
 
-def custom_concat (row):
+
+def custom_concat(row):
     sample = row[0]
     for array in row[1:]:
         sample = np.concatenate((sample, array), axis=0)
     return sample
 
-def return_results_matrixes (VAL_PATH, INDEXES_PATH, PREDICTIONS_PATH):
+
+def return_results_matrixes(VAL_PATH, INDEXES_PATH, PREDICTIONS_PATH):
     val_dataset = pd.read_csv(VAL_PATH)
     indexes = np.load(INDEXES_PATH)
     predictions_pillars = np.load(PREDICTIONS_PATH)
 
-    val_dataset['target'] = val_dataset['target'].apply(lambda x: literal_eval(x))
+    val_dataset["target"] = val_dataset["target"].apply(lambda x: literal_eval(x))
 
-    tagname_to_tagid = tagname_to_id (val_dataset['target'])
+    tagname_to_tagid = tagname_to_id(val_dataset["target"])
 
-    val_dataset['target'] = val_dataset['target'].apply(lambda x: fill_column(x, tagname_to_tagid))
-    val_dataset_restrained = val_dataset[['entry_id', 'target']]
+    val_dataset["target"] = val_dataset["target"].apply(lambda x: fill_column(x, tagname_to_tagid))
 
-    preds = val_dataset[['entry_id']]
-    preds['predictions'] = preds['entry_id'].apply(lambda x: [0]*len(tagname_to_tagid))
+    preds = val_dataset[["entry_id"]]
+    preds["predictions"] = preds["entry_id"].apply(lambda x: [0] * len(tagname_to_tagid))
 
-    for i in range (len(indexes)):
-        preds.loc[preds['entry_id']==indexes[i], 'predictions']=\
-        preds.loc[preds['entry_id']==indexes[i], 'predictions'].\
-        apply(lambda x: predictions_pillars[i,:])
+    for i in range(len(indexes)):
+        preds.loc[preds["entry_id"] == indexes[i], "predictions"] = preds.loc[
+            preds["entry_id"] == indexes[i], "predictions"
+        ].apply(lambda x: predictions_pillars[i, :])
 
-    true_y = np.array([true_yss for true_yss in val_dataset['target']])
-    pred_y = np.array([true_yss for true_yss in preds['predictions']])
+    true_y = np.array([true_yss for true_yss in val_dataset["target"]])
+    pred_y = np.array([true_yss for true_yss in preds["predictions"]])
 
     return true_y, pred_y, tagname_to_tagid
 
-def print_results (true_y, pred_y, tagname_to_tagid):
+def print_results(true_y, pred_y, tagname_to_tagid):
     precisions = []
     recalls = []
     f1_scores = []
     accuracies = []
-
-    for i in range (true_y.shape[1]):
+
+    for i in range(true_y.shape[1]):
         cls_rprt = classification_report(true_y[:, i], pred_y[:, i], output_dict=True)
         precisions.append(cls_rprt["macro avg"]["precision"])
         recalls.append(cls_rprt["macro avg"]["recall"])
         f1_scores.append(cls_rprt["macro avg"]["f1-score"])
         accuracies.append(cls_rprt["accuracy"])
 
-    results_df = pd.DataFrame.from_dict(tagname_to_tagid, orient='index')
-    results_df['accuracy'] = accuracies
-    results_df['recalls'] = recalls
-    results_df['precisions'] = precisions
-    results_df['f1_scores'] = f1_scores
-    results_df = results_df[['accuracy', 'f1_scores', 'precisions', 'recalls']]
-    results_df.loc['mean'] = np.average(results_df, axis=0)
+    results_df = pd.DataFrame.from_dict(tagname_to_tagid, orient="index")
+    results_df["accuracy"] = accuracies
+    results_df["recalls"] = recalls
+    results_df["precisions"] = precisions
+    results_df["f1_scores"] = f1_scores
+    results_df = results_df[["accuracy", "f1_scores", "precisions", "recalls"]]
+    results_df.loc["mean"] = np.average(results_df, axis=0)
 
-    return results_df
\ No newline at end of file
+    return results_df