From d7e8e4056723525daf86ea5f38853d01c3eb7184 Mon Sep 17 00:00:00 2001
From: Caglar Demir
Date: Wed, 23 Feb 2022 13:19:41 +0100
Subject: [PATCH 1/6] Installation and description are updated.

---
 README.md        | 43 +++++++++++++++++++++----------------------
 environment.yml  | 34 +++++++++++++++++++++-------------
 requirements.txt | 34 +++++++++++++++++++++-------------
 3 files changed, 63 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index 38061fdc..3392f4d9 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,28 @@
 # Knowledge Graph Embeddings at Scale
-This open-source project facilitates learning embeddings of large knowledge graphs.
-To scale on large knowledge graphs, we rely on [DASK](https://dask.org/) and [PytorchLightning](https://www.pytorchlightning.ai/).
-Through [DASK](https://dask.org/), we utilize multi-CPUs at processing the input data, while
-[PytorchLightning](https://www.pytorchlightning.ai/) allow us to use knowledge graph embedding model in hardware-agnostic manner.
+
+This open-source project is designed to ease real-world applications of knowledge graph embeddings.
+With this aim, we rely on
+1. [DASK](https://dask.org/) to use multi-CPUs for preprocessing a large knowledge graph,
+2. [PytorchLightning](https://www.pytorchlightning.ai/) to learn knowledge graph embeddings via multi-CPUs, GPUs, TPUs or a computing cluster, and
+3. [Gradio](https://gradio.app/) to ease the deployment of pre-trained models.
+
 ### Installation
-First clone the repository:
+Clone the repository:
 ```
 git clone https://github.com/dice-group/DAIKIRI-Embedding.git
 ```
-Then obtain the required libraries:
+Install dependencies via conda:
 ```
 conda env create -f environment.yml
 conda activate daikiri
-wget https://hobbitdata.informatik.uni-leipzig.de/KG/KGs.zip
-unzip KGs.zip
-python -m pytest -x tests
 ```
-### Manuel Installation
+or via pip:
+```
+# ensure that python 3.9 is available
+pip install -r requirements.txt
+```
+or manually:
 ```
 conda create -n daikiri python=3.9
 conda activate daikiri
@@ -28,6 +33,9 @@ pip install scikit-learn==1.0.2
 pip install pytest==6.2.5
 pip install gradio==2.7.5.2
 pip install pyarrow==6.0.1
+```
+To test the installation:
+```
 wget https://hobbitdata.informatik.uni-leipzig.de/KG/KGs.zip
 unzip KGs.zip
 python -m pytest -x tests
@@ -62,20 +70,11 @@ Please contact: ```caglar.demir@upb.de ``` or ```caglardemir8@gmai.com ``` , if
-
 For more please look at [Hobbit Data](https://hobbitdata.informatik.uni-leipzig.de/KGE/)
 ### Available Models
-1. Multiplicative based KGE models:
-   1. [DistMult](https://arxiv.org/pdf/1412.6575.pdf)
-   2. [ComplEx](https://arxiv.org/pdf/1606.06357.pdf)
-   3. [QMult](https://proceedings.mlr.press/v157/demir21a.html)
-   4. [OMult](https://proceedings.mlr.press/v157/demir21a.html)
-2. Feed Forward Neural Models
-   1. [Shallom](https://arxiv.org/pdf/2101.09090.pdf)
-3. Convolutional Neural models
-   1. [ConEx](https://openreview.net/forum?id=6T45-4TFqaX&invitationId=eswc-conferences.org/ESWC/2021/Conference/Research_Track/Paper49/-/Camera_Ready_Revision&referrer=%5BTasks%5D(%2Ftasks))
-   2. [ConvQ](https://proceedings.mlr.press/v157/demir21a.html)
-   3. [ConvO](https://proceedings.mlr.press/v157/demir21a.html)
+1. Multiplicative based KGE models: [DistMult](https://arxiv.org/pdf/1412.6575.pdf), [ComplEx](https://arxiv.org/pdf/1606.06357.pdf), [QMult](https://proceedings.mlr.press/v157/demir21a.html), and [OMult](https://proceedings.mlr.press/v157/demir21a.html)
+2. 
Feed Forward Neural Models: [Shallom](https://arxiv.org/pdf/2101.09090.pdf) +3. Convolutional Neural models [ConEx](https://openreview.net/forum?id=6T45-4TFqaX&invitationId=eswc-conferences.org/ESWC/2021/Conference/Research_Track/Paper49/-/Camera_Ready_Revision&referrer=%5BTasks%5D(%2Ftasks)), [ConvQ](https://proceedings.mlr.press/v157/demir21a.html), [ConvO](https://proceedings.mlr.press/v157/demir21a.html) 4. Contact us to add your favorite one :) - ### Training 1. A dataset must be located in a folder, e.g. 'KGs/YAGO3-10'. diff --git a/environment.yml b/environment.yml index 6439dd42..f669c9b1 100644 --- a/environment.yml +++ b/environment.yml @@ -28,28 +28,31 @@ dependencies: - aiosignal==1.2.0 - analytics-python==1.4.0 - anyio==3.5.0 + - asdfghjkl==0.1a2 - asgiref==3.5.0 - async-timeout==4.0.2 - attrs==21.4.0 - backoff==1.10.0 + - backpack-for-pytorch==1.4.0 - bcrypt==3.2.0 - bokeh==2.4.2 - cachetools==5.0.0 - cffi==1.15.0 - - charset-normalizer==2.0.10 + - charset-normalizer==2.0.11 - click==8.0.3 - cloudpickle==2.0.0 - cryptography==36.0.1 - cycler==0.11.0 - dask==2022.1.0 - distributed==2022.1.0 + - einops==0.4.0 - fastapi==0.73.0 - ffmpy==0.3.0 - - fonttools==4.29.0 + - fonttools==4.29.1 - frozenlist==1.3.0 - fsspec==2022.1.0 - future==0.18.2 - - google-auth==2.5.0 + - google-auth==2.6.0 - google-auth-oauthlib==0.4.6 - gradio==2.7.5.2 - grpcio==1.43.0 @@ -58,9 +61,11 @@ dependencies: - idna==3.3 - importlib-metadata==4.10.1 - iniconfig==1.1.1 + - isodate==0.6.1 - jinja2==3.0.3 - joblib==1.1.0 - kiwisolver==1.3.2 + - laplace-torch==0.1a2 - locket==0.2.1 - markdown==3.3.6 - markdown2==2.4.2 @@ -69,22 +74,22 @@ dependencies: - monotonic==1.6 - msgpack==1.0.3 - multidict==6.0.2 - - numpy==1.22.1 - - oauthlib==3.1.1 + - numpy==1.22.2 + - oauthlib==3.2.0 - packaging==21.3 - pandas==1.4.0 - paramiko==2.9.2 - partd==1.2.0 - - pillow==9.0.0 + - pillow==9.0.1 - pluggy==1.0.0 - - protobuf==3.19.3 + - protobuf==3.19.4 - psutil==5.9.0 - py==1.11.0 - pyarrow==6.0.1 - pyasn1==0.4.8 - pyasn1-modules==0.2.8 - pycparser==2.21 - - pycryptodome==3.13.0 + - pycryptodome==3.14.1 - pydantic==1.9.0 - pydeprecate==0.3.1 - pydub==0.25.1 @@ -96,8 +101,9 @@ dependencies: - pytorch-lightning==1.5.9 - pytz==2021.3 - pyyaml==6.0 + - rdflib==6.1.1 - requests==2.27.1 - - requests-oauthlib==1.3.0 + - requests-oauthlib==1.3.1 - rsa==4.8 - scikit-learn==1.0.2 - scipy==1.7.3 @@ -110,16 +116,18 @@ dependencies: - tensorboard==2.8.0 - tensorboard-data-server==0.6.1 - tensorboard-plugin-wit==1.8.1 - - threadpoolctl==3.0.0 + - threadpoolctl==3.1.0 - toml==0.10.2 - toolz==0.11.2 - - torch==1.10.1+cu113 - - torchmetrics==0.7.0 + - torch==1.10.2 + - torchaudio==0.10.2 + - torchmetrics==0.7.1 + - torchvision==0.11.3 - tornado==6.1 - tqdm==4.62.3 - typing-extensions==4.0.1 - urllib3==1.26.8 - - uvicorn==0.17.0.post1 + - uvicorn==0.17.4 - werkzeug==2.0.2 - yarl==1.7.2 - zict==2.0.0 diff --git a/requirements.txt b/requirements.txt index b8ed753e..b23d3dc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,29 +3,32 @@ aiohttp==3.8.1 aiosignal==1.2.0 analytics-python==1.4.0 anyio==3.5.0 +asdfghjkl==0.1a2 asgiref==3.5.0 async-timeout==4.0.2 attrs==21.4.0 backoff==1.10.0 +backpack-for-pytorch==1.4.0 bcrypt==3.2.0 bokeh==2.4.2 cachetools==5.0.0 certifi==2021.10.8 cffi==1.15.0 -charset-normalizer==2.0.10 +charset-normalizer==2.0.11 click==8.0.3 cloudpickle==2.0.0 cryptography==36.0.1 cycler==0.11.0 dask==2022.1.0 distributed==2022.1.0 +einops==0.4.0 fastapi==0.73.0 ffmpy==0.3.0 -fonttools==4.29.0 +fonttools==4.29.1 
frozenlist==1.3.0 fsspec==2022.1.0 future==0.18.2 -google-auth==2.5.0 +google-auth==2.6.0 google-auth-oauthlib==0.4.6 gradio==2.7.5.2 grpcio==1.43.0 @@ -34,9 +37,11 @@ HeapDict==1.0.1 idna==3.3 importlib-metadata==4.10.1 iniconfig==1.1.1 +isodate==0.6.1 Jinja2==3.0.3 joblib==1.1.0 kiwisolver==1.3.2 +laplace-torch==0.1a2 locket==0.2.1 Markdown==3.3.6 markdown2==2.4.2 @@ -45,22 +50,22 @@ matplotlib==3.5.1 monotonic==1.6 msgpack==1.0.3 multidict==6.0.2 -numpy==1.22.1 -oauthlib==3.1.1 +numpy==1.22.2 +oauthlib==3.2.0 packaging==21.3 pandas==1.4.0 paramiko==2.9.2 partd==1.2.0 -Pillow==9.0.0 +Pillow==9.0.1 pluggy==1.0.0 -protobuf==3.19.3 +protobuf==3.19.4 psutil==5.9.0 py==1.11.0 pyarrow==6.0.1 pyasn1==0.4.8 pyasn1-modules==0.2.8 pycparser==2.21 -pycryptodome==3.13.0 +pycryptodome==3.14.1 pydantic==1.9.0 pyDeprecate==0.3.1 pydub==0.25.1 @@ -72,8 +77,9 @@ python-multipart==0.0.5 pytorch-lightning==1.5.9 pytz==2021.3 PyYAML==6.0 +rdflib==6.1.1 requests==2.27.1 -requests-oauthlib==1.3.0 +requests-oauthlib==1.3.1 rsa==4.8 scikit-learn==1.0.2 scipy==1.7.3 @@ -85,16 +91,18 @@ tblib==1.7.0 tensorboard==2.8.0 tensorboard-data-server==0.6.1 tensorboard-plugin-wit==1.8.1 -threadpoolctl==3.0.0 +threadpoolctl==3.1.0 toml==0.10.2 toolz==0.11.2 -torch==1.10.1+cu113 -torchmetrics==0.7.0 +torch==1.10.2 +torchaudio==0.10.2 +torchmetrics==0.7.1 +torchvision==0.11.3 tornado==6.1 tqdm==4.62.3 typing_extensions==4.0.1 urllib3==1.26.8 -uvicorn==0.17.0.post1 +uvicorn==0.17.4 Werkzeug==2.0.2 yarl==1.7.2 zict==2.0.0 From 983df6158a46cb874d0a7657473cbcec3f234514 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Wed, 23 Feb 2022 14:33:35 +0100 Subject: [PATCH 2/6] GridSearch in shell script is implemented. --- core/executer.py | 13 ++++--- main.py | 6 ++-- scripts/config_runner.sh | 4 +-- scripts/dummy.sh | 44 ----------------------- scripts/experiments.sh | 16 --------- scripts/grid_search_starter.sh | 64 +++++++++++++--------------------- 6 files changed, 35 insertions(+), 112 deletions(-) delete mode 100644 scripts/dummy.sh delete mode 100644 scripts/experiments.sh diff --git a/core/executer.py b/core/executer.py index a0739823..7169ec62 100644 --- a/core/executer.py +++ b/core/executer.py @@ -4,6 +4,7 @@ from .helper_classes import LabelRelaxationLoss, LabelSmoothingLossCanonical from .dataset_classes import StandardDataModule, KvsAll, CVDataModule from .knowledge_graph import KG +from .callbacks import PrintCallback import torch from torch import nn from torch.nn import functional as F @@ -18,7 +19,6 @@ import dask.dataframe as dd import time from pytorch_lightning.plugins import DDPPlugin -from pytorch_lightning.callbacks import Callback from pytorch_lightning import Trainer, seed_everything import logging from collections import defaultdict @@ -28,6 +28,7 @@ warnings.filterwarnings(action="ignore", category=DeprecationWarning) seed_everything(1, workers=True) + # TODO: Execute can inherit from Trainer and Evaluator Classes # By doing so we can increase the modularity of our code. class Execute: @@ -118,11 +119,13 @@ def train_and_eval(self) -> BaseKGE: 2c. Train a model """ print('------------------- Train & Eval -------------------') - # 1. Create Pytorch-lightning Trainer object from input configuration + + if self.args.gpus: self.trainer = pl.Trainer.from_argparse_args(self.args, plugins=[DDPPlugin(find_unused_parameters=False)]) else: - self.trainer = pl.Trainer.from_argparse_args(self.args) + self.trainer = pl.Trainer.from_argparse_args(self.args,callbacks=[PrintCallback()]) + # 2. 
Check whether validation and test datasets are available. if self.dataset.is_valid_test_available(): if self.args.scoring_technique == 'NegSample': @@ -249,11 +252,7 @@ def get_batch_1_to_N(self, input_vocab, triples, idx, output_dim) -> Tuple[np.ar @staticmethod def model_fitting(trainer, model, train_dataloaders) -> None: - print(model) - print(model.summarize()) - print("Model fitting...") trainer.fit(model, train_dataloaders=train_dataloaders) - print("Done!") def training_kvsall(self): """ diff --git a/main.py b/main.py index ff103521..aeecebe2 100644 --- a/main.py +++ b/main.py @@ -20,7 +20,7 @@ def argparse_default(description=None): # Models. parser.add_argument("--model", type=str, default='DistMult', - help="Available models: KronE, ConEx, ConvQ, ConvO, QMult, OMult, Shallom, ConEx, ComplEx, DistMult,KPDistMult") + help="Available models: ConEx, ConvQ, ConvO, QMult, OMult, Shallom, ConEx, ComplEx, DistMult, KronE, KPDistMult") # Training Parameters parser.add_argument("--num_epochs", type=int, default=10, help='Number of epochs for training. ' 'This disables max_epochs and ' @@ -30,7 +30,7 @@ def argparse_default(description=None): parser.add_argument("--label_smoothing_rate", type=float, default=None, help='None for not using it.') parser.add_argument("--label_relaxation_rate", type=float, default=None, help='None for not using it.') parser.add_argument("--add_noise_rate", type=float, default=None, help='None for not using it. ' - '.1 means extand train data by adding 10% random data') + '.1 means extend train data by adding 10% random data') # Model Parameters # Hyperparameters @@ -62,7 +62,7 @@ def argparse_default(description=None): parser.add_argument('--neg_ratio', type=int, default=0) # Data Augmentation. parser.add_argument('--num_folds_for_cv', type=int, default=0, help='Number of folds in k-fold cross validation.' 
- 'If >2 ,no evaluation scenario is applied implies no evaluation.') + 'If >2 ,no evaluation scenario is applied implies no evaluation.') # This is a workaround for read if description is None: return parser.parse_args() diff --git a/scripts/config_runner.sh b/scripts/config_runner.sh index b1af149a..75c9cf41 100644 --- a/scripts/config_runner.sh +++ b/scripts/config_runner.sh @@ -1,3 +1 @@ -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/experiments.sh $1 $2 -# (1) KG PATH -# (2) embedding dim +python "$1" --path_dataset_folder "$2" --model "$3" --num_epochs "$4" --embedding_dim "$5" \ No newline at end of file diff --git a/scripts/dummy.sh b/scripts/dummy.sh deleted file mode 100644 index a3317273..00000000 --- a/scripts/dummy.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/sh - -# shellcheck disable=SC2164 -dir_parent="$(cd "$PWD"; cd ..; pwd)" - -path_script="$dir_parent/main.py" - -models_name="Shallom" - -path_dataset_folder="$dir_parent/KGs/EN_FR_15K_V1" - -storage_path=$models_name -# shellcheck disable=SC2039 -storage_path+="_EN_FR_15K_V1" -mkdir $storage_path -python "$path_script" --path_dataset_folder "$path_dataset_folder" --storage_path "$storage_path" --model "$models_name" --embedding_dim 300 --num_epochs 300 > "$storage_path/$models_name.log" - - -path_dataset_folder="$dir_parent/KGs/EN_FR_15K_V2" -storage_path=$models_name -# shellcheck disable=SC2039 -storage_path+="_EN_FR_15K_V2" -mkdir $storage_path -python "$path_script" --path_dataset_folder "$path_dataset_folder" --storage_path "$storage_path" --model "$models_name" --embedding_dim 300 --num_epochs 300 > "$storage_path/$models_name.log" - - - -path_dataset_folder="$dir_parent/KGs/EN_FR_100K_V1" -models_name="Shallom" -storage_path=$models_name -# shellcheck disable=SC2039 -storage_path+="_EN_FR_100K_V1" -mkdir $storage_path -python "$path_script" --path_dataset_folder "$path_dataset_folder" --storage_path "$storage_path" --model "$models_name" --embedding_dim 300 --num_epochs 300 > "$storage_path/$models_name.log" - - - - -path_dataset_folder="$dir_parent/KGs/EN_FR_100K_V2" -storage_path=$models_name -# shellcheck disable=SC2039 -storage_path+="_EN_FR_100K_V2" -mkdir $storage_path -python "$path_script" --path_dataset_folder "$path_dataset_folder" --storage_path "$storage_path" --model "$models_name" --embedding_dim 300 --num_epochs 300 > "$storage_path/$models_name.log" diff --git a/scripts/experiments.sh b/scripts/experiments.sh deleted file mode 100644 index b42c5add..00000000 --- a/scripts/experiments.sh +++ /dev/null @@ -1,16 +0,0 @@ -dataset_path="KGs/$1" -embedding_dim=$2 -storage_path="Exp_Results_$1_$2" -lr=.01 -num_epochs=1000 -storage_path="Exp_Results_$1_$2" -mkdir "$storage_path" -echo "Number of epochs:$num_epochs" -echo "Learning rate:$lr" -echo "embedding_dim :$embedding_dim" - -python main.py --storage_path "$storage_path" --path_dataset_folder "$dataset_path" --model 'DistMult' --lr $lr --embedding_dim "$embedding_dim" --num_epochs $num_epochs> "$storage_path/DistMult.log" -python main.py --storage_path "$storage_path" --path_dataset_folder "$dataset_path" --model 'KPDistMult' --lr $lr --embedding_dim "$embedding_dim" --num_epochs $num_epochs > "$storage_path/KPDistMult.log" -python main.py --storage_path "$storage_path" --path_dataset_folder "$dataset_path" --model 'KronE' --lr $lr --embedding_dim "$embedding_dim" --num_epochs $num_epochs > "$storage_path/KronE.log" -python core/collect_results_from_logs.py --logs "$storage_path/DistMult.log" "$storage_path/KPDistMult.log" "$storage_path/KronE.log" 
-echo 'Done!' \ No newline at end of file diff --git a/scripts/grid_search_starter.sh b/scripts/grid_search_starter.sh index 8ecd41ed..d5115d76 100644 --- a/scripts/grid_search_starter.sh +++ b/scripts/grid_search_starter.sh @@ -1,40 +1,26 @@ -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 4 > UMLS_4.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 9 > UMLS_9.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 16 > UMLS_16.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 25 > UMLS_25.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 36 > UMLS_36.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 49 > UMLS_49.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 64 > UMLS_64.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 81 > UMLS_81.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 100 > UMLS_100.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 121 > UMLS_121.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 144 > UMLS_144.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 169 > UMLS_169.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 196 > UMLS_196.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 225 > UMLS_225.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 256 > UMLS_256.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 289 > UMLS_289.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 324 > UMLS_324.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 361 > UMLS_361.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 400 > UMLS_400.log - -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 4 > KINSHIP_4.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 9 > KINSHIP_9.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 16 > KINSHIP_16.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 25 > KINSHIP_25.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 36 > KINSHIP_36.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 49 > KINSHIP_49.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 64 > KINSHIP_64.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 81 > KINSHIP_81.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 100 > KINSHIP_100.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 121 > KINSHIP_121.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 144 > KINSHIP_144.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 169 > KINSHIP_169.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 196 > KINSHIP_196.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 225 > KINSHIP_225.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 256 > 
KINSHIP_256.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 289 > KINSHIP_289.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 324 > KINSHIP_324.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 361 > KINSHIP_361.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 400 > KINSHIP_400.log +#!/bin/sh +# (1) main working directory +# shellcheck disable=SC2164 +main_wd="$(cd "$PWD"; cd ..; pwd)" +# (2) Script in (1) +python_script_path="$main_wd/main.py" +# shellcheck disable=SC2043 +for kgname in "UMLS" "KINSHIP" +do + kg_path="$main_wd/KGs/$kgname" + for model in "QMult" "OMult" + do + for epoch in 1 + do + for dim in 25 50 + do + # shellcheck disable=SC2154 + log_name="$kg_path-$model-$epoch-$dim" + echo "Running $log_name configuration" + /bin/bash "$PWD/config_runner.sh" "$python_script_path" "$kg_path" "$model" "$epoch" "$dim" > "$log_name.log" + echo "Done!" + done + done + done +done From 3446e6cafc8fd325bf2b6341e853510cdee10f10 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Wed, 23 Feb 2022 14:38:29 +0100 Subject: [PATCH 3/6] Callbacks are added. --- core/callbacks.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 core/callbacks.py diff --git a/core/callbacks.py b/core/callbacks.py new file mode 100644 index 00000000..ef61f3a3 --- /dev/null +++ b/core/callbacks.py @@ -0,0 +1,15 @@ +# 1. Create Pytorch-lightning Trainer object from input configuration +from pytorch_lightning.callbacks import Callback + + +class PrintCallback(Callback): + def __init__(self): + super().__init__() + + def on_train_start(self, trainer, model): + print(model) + print(model.summarize()) + print("Training is started!") + + def on_train_end(self, trainer, pl_module): + print("\nTraining is done.") From de532788109f72430af469a9ef6507b511f2ba49 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Wed, 23 Feb 2022 14:42:54 +0100 Subject: [PATCH 4/6] Typo fixed in the grid search. --- scripts/grid_search_starter.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/grid_search_starter.sh b/scripts/grid_search_starter.sh index d5115d76..c891c0f1 100644 --- a/scripts/grid_search_starter.sh +++ b/scripts/grid_search_starter.sh @@ -16,9 +16,9 @@ do for dim in 25 50 do # shellcheck disable=SC2154 - log_name="$kg_path-$model-$epoch-$dim" - echo "Running $log_name configuration" - /bin/bash "$PWD/config_runner.sh" "$python_script_path" "$kg_path" "$model" "$epoch" "$dim" > "$log_name.log" + config_name="$kgname-$model-$epoch-$dim" + echo "Running $config_name configuration" + /bin/bash "$PWD/config_runner.sh" "$python_script_path" "$kg_path" "$model" "$epoch" "$dim" > "$config_name.log" echo "Done!" done done From 8d0c9c5bd37a38510fbf2341252baae1b02dd0b0 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Thu, 24 Feb 2022 11:52:31 +0100 Subject: [PATCH 5/6] Typo fix at reciprocal triples. 
--- core/knowledge_graph.py | 3 ++- run.sh | 58 ----------------------------------------- 2 files changed, 2 insertions(+), 59 deletions(-) delete mode 100644 run.sh diff --git a/core/knowledge_graph.py b/core/knowledge_graph.py index 0cec48a8..3a44c08a 100644 --- a/core/knowledge_graph.py +++ b/core/knowledge_graph.py @@ -67,7 +67,7 @@ def __init__(self, data_dir: str = None, deserialize_flag: str = None, large_kg_ pd.DataFrame({'subject': self.test_set['object'], 'relation': self.test_set['relation'].map( lambda x: x + '_inverse'), - 'object': self.valid_set['subject']})], ignore_index=True) + 'object': self.test_set['subject']})], ignore_index=True) print('Done !\n') if add_noise_rate is not None: @@ -90,6 +90,7 @@ def __init__(self, data_dir: str = None, deserialize_flag: str = None, large_kg_ del list_of_entities assert s + num_noisy_triples == len(self.train_set) + # 3. Concatenate dataframes. print(f'[4 / 14] Concatenating data to obtain index...') df_str_kg = pd.concat([self.train_set, self.valid_set, self.test_set], ignore_index=True) diff --git a/run.sh b/run.sh deleted file mode 100644 index 27ae801e..00000000 --- a/run.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/sh -python --version -python -u -c 'import torch; print(torch.__version__)' -echo "Start Training......" - -# To deserialize parsed KG. -#python main.py --path_dataset_folder 'KGs/UMLS' --model 'ConEx' --deserialize_flag '/home/demir/Desktop/work/DAIKIRI_Emb/DAIKIRI_Storage/2021-05-12 12:11:12.095319' -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'ConEx' -python main.py --path_dataset_folder 'KGs/Family' --model 'ConEx' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ConEx' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ConEx' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'Shallom' --num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'Shallom' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'Shallom' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'Shallom' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'QMult' --num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'QMult' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'QMult' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'QMult' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'OMult' --num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'OMult' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'OMult' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 
'KGs/Family' --model 'OMult' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'ConvQ' --num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'ConvQ' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ConvQ' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ConvQ' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'ConvO' --max_num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'ConvO' --max_num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ConvO' --max_num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ConvO' --max_num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'ComplEx' --num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'ComplEx' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ComplEx' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ComplEx' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'DistMult' --num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'DistMult' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'DistMult' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'DistMult' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - -echo "Ends......" \ No newline at end of file From 117193459a53c78236987f754f3f64f2aa046877 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Thu, 24 Feb 2022 12:30:38 +0100 Subject: [PATCH 6/6] Eval at continues training added. 
--- continuous_training.py | 2 +- core/executer.py | 4 +- core/knowledge_graph.py | 193 +++++++++------------------------------- 3 files changed, 46 insertions(+), 153 deletions(-) diff --git a/continuous_training.py b/continuous_training.py index 5b2cf8b6..5860ffca 100644 --- a/continuous_training.py +++ b/continuous_training.py @@ -20,7 +20,7 @@ def __init__(self, args): if __name__ == '__main__': parser = argparse.ArgumentParser(add_help=False) # Dataset and storage related - parser.add_argument("--path_experiment_folder", type=str, default="DAIKIRI_Storage/2022-02-04 15:02:25.958956", + parser.add_argument("--path_experiment_folder", type=str, default="DAIKIRI_Storage/2022-02-24 12:17:41.555572", help="The path of a folder containing pretrained model") # Training Parameters parser.add_argument("--num_epochs", type=int, default=10, diff --git a/core/executer.py b/core/executer.py index 7169ec62..56c0d2aa 100644 --- a/core/executer.py +++ b/core/executer.py @@ -70,10 +70,10 @@ def read_input_data(args) -> KG: return kg @staticmethod - def reload_input_data(p: str) -> KG: + def reload_input_data(storage_path: str) -> KG: # 1. Read & Parse input data print("1. Reload Parsed Input Data") - return KG(deserialize_flag=p) + return KG(deserialize_flag=storage_path) def start(self) -> dict: """ diff --git a/core/knowledge_graph.py b/core/knowledge_graph.py index 3a44c08a..c1dcc152 100644 --- a/core/knowledge_graph.py +++ b/core/knowledge_graph.py @@ -1,5 +1,5 @@ import time -from typing import Dict, List, Generator +from typing import Dict, List from collections import defaultdict import numpy as np import pickle @@ -14,6 +14,15 @@ class KG: + """ Knowledge Graph Class + 1- Reading : Large input data is read via DASK + 2- Cleaning & Preprocessing : + Remove triples with literals if exists + Apply reciprocal data augmentation triples into train, valid and test datasets + Add noisy triples (random facts sampled from all possible triples E x R x E) + 3- Serializing and Deserializing in parquet format + """ + def __init__(self, data_dir: str = None, deserialize_flag: str = None, large_kg_parse=False, add_reciprical=False, eval_model=True, read_only_few: int = None, sample_triples_ratio: float = None, path_for_serialization: str = None, add_noise_rate: float = None): @@ -84,7 +93,7 @@ def __init__(self, data_dir: str = None, deserialize_flag: str = None, large_kg_ pd.unique(self.train_set[['relation']].values.ravel('K')), num_noisy_triples), 'object': np.random.choice(list_of_entities, num_noisy_triples)} - ) + ) ], ignore_index=True) del list_of_entities @@ -103,7 +112,7 @@ def __init__(self, data_dir: str = None, deserialize_flag: str = None, large_kg_ index=ordered_list) print('Done!\n') - # 5. Create a bijection mapping from relations to to integer indexes. + # 5. Create a bijection mapping from relations to integer indexes. 
print('[6 / 14] Creating a mapping from relations to integer indexes...') ordered_list = pd.unique(df_str_kg['relation'].values.ravel('K')) self.relation_to_idx = pd.DataFrame(data=np.arange(len(ordered_list)), @@ -284,17 +293,15 @@ def index(data: List[List], add_reciprical=False) -> (Dict, Dict, Dict, Dict, Di return entity_idxs, relation_idxs, er_vocab, pe_vocab, ee_vocab - def deserialize(self, p: str) -> None: - """ - Deserialize data - """ + def deserialize(self, storage_path: str) -> None: + """ Deserialize data """ print('Deserializing compressed entity integer mapping...') - self.entity_to_idx = ddf.read_parquet(p + '/entity_to_idx.gzip').compute() + self.entity_to_idx = ddf.read_parquet(storage_path + '/entity_to_idx.gzip').compute() print('Done!\n') self.num_entities = len(self.entity_to_idx) print('Deserializing compressed relation integer mapping...') - self.relation_to_idx = ddf.read_parquet(p + '/relation_to_idx.gzip').compute() + self.relation_to_idx = ddf.read_parquet(storage_path + '/relation_to_idx.gzip').compute() self.num_relations = len(self.entity_to_idx) print('Done!\n') @@ -306,16 +313,35 @@ def deserialize(self, p: str) -> None: # 10. Serialize (9). print('Deserializing integer mapped data and mapping it to numpy ndarray...') - self.train_set = ddf.read_parquet(p + '/idx_train_df.gzip').values.compute() + self.train_set = ddf.read_parquet(storage_path + '/idx_train_df.gzip').values.compute() print('Done!\n') + try: + print('Deserializing integer mapped data and mapping it to numpy ndarray...') + self.valid_set = ddf.read_parquet(storage_path + '/idx_valid_df.gzip').values.compute() + print('Done!\n') + except FileNotFoundError: + print('No valid data found') + self.valid_set = pd.DataFrame() - print('Deserializing integer mapped data and mapping it to numpy ndarray...') - self.valid_set = ddf.read_parquet(p + '/idx_valid_df.gzip').values.compute() - print('Done!\n') + try: + print('Deserializing integer mapped data and mapping it to numpy ndarray...') + self.test_set = ddf.read_parquet(storage_path + '/idx_test_df.gzip').values.compute() + print('Done!\n') + except FileNotFoundError: + print('No test data found') + self.test_set = pd.DataFrame() - print('Deserializing integer mapped data and mapping it to numpy ndarray...') - self.test_set = ddf.read_parquet(p + '/idx_test_df.gzip').values.compute() - print('Done!\n') + print(storage_path) + with open(storage_path+'/configuration.json', 'r') as f: + args = json.load(f) + + if args['eval']: + if len(self.valid_set) > 0 and len(self.test_set) > 0: + # 16. Create a bijection mapping from subject-relation pairs to tail entities. + data = np.concatenate([self.train_set, self.valid_set, self.test_set]) + else: + data = self.train_set + self.er_vocab = get_er_vocab(data) @staticmethod def index_parallel(data: List[List], add_reciprical=False) -> (Dict, Dict, Dict, Dict, Dict): @@ -461,75 +487,6 @@ def relations_str(self) -> List: """ return list(self.relation_to_idx.keys()) - # Not used anymore. - def load_data(self, data_path, add_reciprical=True, load_only=None): - raise NotImplemented() - # line can be 1 or 2 - # a) <...> <...> <...> . - # b) <...> <...> "..." . - # c) ... ... ... - # (a) and (b) correspond to the N-Triples format - # (c) corresponds to the format of current link prediction benchmark datasets. - print(f'{data_path} is being read.') - try: - data = [] - with open(data_path, "r") as f: - for line in f: - # 1. Ignore lines with *** " *** or does only contain 2 or less characters. 
- if '"' in line or len(line) < 3: - continue - - # 2. Tokenize(<...> <...> <...> .) => ['<...>', '<...>','<...>','.'] - # Tokenize(... ... ...) => ['...', '...', '...',] - decomposed_list_of_strings = line.split() - - # 3. Sanity checking. - try: - assert len(decomposed_list_of_strings) == 3 or len(decomposed_list_of_strings) == 4 - except AssertionError: - print(f'Invalid input triple {line}. It can not be split into 3 or 4 items') - print('This triple will be ignored') - continue - # 4. Storing - if len(decomposed_list_of_strings) == 4: - assert decomposed_list_of_strings[-1] == '.' - data.append(self.ntriple_parser(decomposed_list_of_strings)) - if len(decomposed_list_of_strings) == 3: - data.append(decomposed_list_of_strings) - - if load_only is not None: - if len(data) == load_only: - break - - if len(data) % 50_000_000 == 0: - print(f'Size of already parsed data {len(data)}') - - except FileNotFoundError: - print(f'{data_path} is not found') - return [] - if add_reciprical: - data += [[i[2], i[1] + "_reverse", i[0]] for i in data] - return data - - def process(self, x): - raise NotImplemented - # 2. Tokenize(<...> <...> <...> .) => ['<...>', '<...>','<...>','.'] - # Tokenize(... ... ...) => ['...', '...', '...',] - decomposed_list_of_strings = x.split() - - # 3. Sanity checking. - try: - assert len(decomposed_list_of_strings) == 3 or len(decomposed_list_of_strings) == 4 - except AssertionError: - print(f'Invalid input triple {x}. It can not be split into 3 or 4 items') - print('This triple will be ignored') - # 4. Storing - if len(decomposed_list_of_strings) == 4: - assert decomposed_list_of_strings[-1] == '.' - decomposed_list_of_strings = self.ntriple_parser(decomposed_list_of_strings) - if len(decomposed_list_of_strings) == 3: - return decomposed_list_of_strings - @staticmethod def ntriple_parser(l: List) -> List: """ @@ -543,6 +500,7 @@ def ntriple_parser(l: List) -> List: :param l: :return: """ + raise NotImplementedError() assert l[3] == '.' 
try: s, p, o, _ = l[0], l[1], l[2], l[3] @@ -661,68 +619,3 @@ def triple_indexing(self, large_kg_parse) -> None: else: self.valid = np.array([]) self.test = np.array([]) - - -""" -def serialize(self, p: str) -> None: - # Serialize entities and relations sotred in pandas dataframe and predicates - - assert len(self.entity_to_idx) == self.num_entities - assert len(self.relation_to_idx) == self.num_relations - - # Store data in parquet format - if len(self.train_set) > 0: - self.train_set.to_parquet(p + '/train_df.gzip', compression='gzip') - # Store as numpy - self.train_set['subject'] = self.train_set['subject'].map(lambda x: self.entity_to_idx[x]) - self.train_set['relation'] = self.train_set['relation'].map(lambda x: self.relation_to_idx[x]) - self.train_set['object'] = self.train_set['object'].map(lambda x: self.entity_to_idx[x]) - self.train_set.to_parquet(p + '/idx_train_df.gzip', compression='gzip') - self.train_set = self.train_set.values - # Sanity checking - assert self.num_entities > max(self.train_set[0]) - assert self.num_entities > max(self.train_set[0]) - assert self.num_entities > max(self.train_set[2]) - assert self.num_entities > max(self.train_set[2]) - - assert isinstance(self.train_set[0], np.ndarray) - assert isinstance(self.train_set[0][0], np.int64) - assert isinstance(self.train_set[0][1], np.int64) - assert isinstance(self.train_set[0][2], np.int64) - - if len(self.valid_set) > 0: - self.valid_set.to_parquet(p + '/valid_df.gzip', compression='gzip') - self.valid_set['subject'] = self.valid_set['subject'].map(lambda x: self.entity_to_idx[x]) - self.valid_set['relation'] = self.valid_set['relation'].map(lambda x: self.relation_to_idx[x]) - self.valid_set['object'] = self.valid_set['object'].map(lambda x: self.entity_to_idx[x]) - self.valid_set.to_parquet(p + '/idx_valid_df.gzip', compression='gzip') - self.valid_set = self.valid_set.values - # Sanity checking - assert self.num_entities > max(self.valid_set[0]) - assert self.num_entities > max(self.valid_set[0]) - assert self.num_entities > max(self.valid_set[2]) - assert self.num_entities > max(self.valid_set[2]) - - assert isinstance(self.valid_set[0], np.ndarray) - assert isinstance(self.valid_set[0][0], np.int64) - assert isinstance(self.valid_set[0][1], np.int64) - assert isinstance(self.valid_set[0][2], np.int64) - - if len(self.test_set) > 0: - self.test_set.to_parquet(p + '/test_df.gzip', compression='gzip') - self.test_set['subject'] = self.test_set['subject'].map(lambda x: self.entity_to_idx[x]) - self.test_set['relation'] = self.test_set['relation'].map(lambda x: self.relation_to_idx[x]) - self.test_set['object'] = self.test_set['object'].map(lambda x: self.entity_to_idx[x]) - self.test_set.to_parquet(p + '/idx_test_df.gzip', compression='gzip') - self.test_set = self.test_set.values - # Sanity checking - assert self.num_entities > max(self.test_set[0]) - assert self.num_entities > max(self.test_set[0]) - assert self.num_entities > max(self.test_set[2]) - assert self.num_entities > max(self.test_set[2]) - - assert isinstance(self.test_set[0], np.ndarray) - assert isinstance(self.test_set[0][0], np.int64) - assert isinstance(self.test_set[0][1], np.int64) - assert isinstance(self.test_set[0][2], np.int64) -"""
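
For reference, the reciprocal data augmentation corrected in PATCH 5/6 can be sketched in isolation. The snippet below is a minimal, hypothetical example with a made-up toy DataFrame (it is not part of the patches); it mirrors the fixed logic in core/knowledge_graph.py, where every test triple (s, r, o) yields an inverse triple (o, r_inverse, s) whose subjects and objects both come from the test split.

```
import pandas as pd

# Toy test split with the same column layout used in core/knowledge_graph.py.
test_set = pd.DataFrame({'subject': ['barack', 'berlin'],
                         'relation': ['bornIn', 'capitalOf'],
                         'object': ['hawaii', 'germany']})

# For every (s, r, o) add (o, r_inverse, s). Both columns of the inverse
# triples are taken from the *test* split; the bug fixed in PATCH 5/6 took
# the objects of the inverse test triples from the validation split instead.
reciprocal = pd.DataFrame({'subject': test_set['object'],
                           'relation': test_set['relation'].map(lambda x: x + '_inverse'),
                           'object': test_set['subject']})

augmented_test_set = pd.concat([test_set, reciprocal], ignore_index=True)
print(augmented_test_set)
```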