From d7e8e4056723525daf86ea5f38853d01c3eb7184 Mon Sep 17 00:00:00 2001
From: Caglar Demir
Date: Wed, 23 Feb 2022 13:19:41 +0100
Subject: [PATCH 1/6] Installation and description are updated.

---
 README.md        | 43 +++++++++++++++++++++----------------------
 environment.yml  | 34 +++++++++++++++++++++-------------
 requirements.txt | 34 +++++++++++++++++++++-------------
 3 files changed, 63 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index 38061fdc..3392f4d9 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,28 @@
 # Knowledge Graph Embeddings at Scale
-This open-source project facilitates learning embeddings of large knowledge graphs.
-To scale on large knowledge graphs, we rely on [DASK](https://dask.org/) and [PytorchLightning](https://www.pytorchlightning.ai/).
-Through [DASK](https://dask.org/), we utilize multi-CPUs at processing the input data, while
-[PytorchLightning](https://www.pytorchlightning.ai/) allow us to use knowledge graph embedding model in hardware-agnostic manner.
+
+This open-source project is designed to ease real-world applications of knowledge graph embeddings.
+With this aim, we rely on
+1. [DASK](https://dask.org/) to use multi-CPUs for preprocessing a large knowledge graph,
+2. [PytorchLightning](https://www.pytorchlightning.ai/) to learn knowledge graph embeddings via multi-CPUs, GPUs, TPUs or a computing cluster, and
+3. [Gradio](https://gradio.app/) to ease the deployment of pre-trained models.
+
 ### Installation
-First clone the repository:
+Clone the repository:
 ```
 git clone https://github.com/dice-group/DAIKIRI-Embedding.git
 ```
-Then obtain the required libraries:
+Install dependencies via conda:
 ```
 conda env create -f environment.yml
 conda activate daikiri
-wget https://hobbitdata.informatik.uni-leipzig.de/KG/KGs.zip
-unzip KGs.zip
-python -m pytest -x tests
 ```
-### Manuel Installation
+or via pip:
+```
+# ensure that python 3.9 is available
+pip install -r requirements.txt
+```
+or manually:
 ```
 conda create -n daikiri python=3.9
 conda activate daikiri
@@ -28,6 +33,9 @@ pip install scikit-learn==1.0.2
 pip install pytest==6.2.5
 pip install gradio==2.7.5.2
 pip install pyarrow==6.0.1
+```
+To test the installation:
+```
 wget https://hobbitdata.informatik.uni-leipzig.de/KG/KGs.zip
 unzip KGs.zip
 python -m pytest -x tests
@@ -62,20 +70,11 @@ Please contact: ```caglar.demir@upb.de ``` or ```caglardemir8@gmai.com ``` , if
-
 For more please look at [Hobbit Data](https://hobbitdata.informatik.uni-leipzig.de/KGE/)
 ### Available Models
-1. Multiplicative based KGE models:
-   1. [DistMult](https://arxiv.org/pdf/1412.6575.pdf)
-   2. [ComplEx](https://arxiv.org/pdf/1606.06357.pdf)
-   3. [QMult](https://proceedings.mlr.press/v157/demir21a.html)
-   4. [OMult](https://proceedings.mlr.press/v157/demir21a.html)
-2. Feed Forward Neural Models
-   1. [Shallom](https://arxiv.org/pdf/2101.09090.pdf)
-3. Convolutional Neural models
-   1. [ConEx](https://openreview.net/forum?id=6T45-4TFqaX&invitationId=eswc-conferences.org/ESWC/2021/Conference/Research_Track/Paper49/-/Camera_Ready_Revision&referrer=%5BTasks%5D(%2Ftasks))
-   2. [ConvQ](https://proceedings.mlr.press/v157/demir21a.html)
-   3. [ConvO](https://proceedings.mlr.press/v157/demir21a.html)
+1. Multiplicative based KGE models: [DistMult](https://arxiv.org/pdf/1412.6575.pdf), [ComplEx](https://arxiv.org/pdf/1606.06357.pdf), [QMult](https://proceedings.mlr.press/v157/demir21a.html), and [OMult](https://proceedings.mlr.press/v157/demir21a.html)
+2. 
Feed Forward Neural Models: [Shallom](https://arxiv.org/pdf/2101.09090.pdf) +3. Convolutional Neural models [ConEx](https://openreview.net/forum?id=6T45-4TFqaX&invitationId=eswc-conferences.org/ESWC/2021/Conference/Research_Track/Paper49/-/Camera_Ready_Revision&referrer=%5BTasks%5D(%2Ftasks)), [ConvQ](https://proceedings.mlr.press/v157/demir21a.html), [ConvO](https://proceedings.mlr.press/v157/demir21a.html) 4. Contact us to add your favorite one :) - ### Training 1. A dataset must be located in a folder, e.g. 'KGs/YAGO3-10'. diff --git a/environment.yml b/environment.yml index 6439dd42..f669c9b1 100644 --- a/environment.yml +++ b/environment.yml @@ -28,28 +28,31 @@ dependencies: - aiosignal==1.2.0 - analytics-python==1.4.0 - anyio==3.5.0 + - asdfghjkl==0.1a2 - asgiref==3.5.0 - async-timeout==4.0.2 - attrs==21.4.0 - backoff==1.10.0 + - backpack-for-pytorch==1.4.0 - bcrypt==3.2.0 - bokeh==2.4.2 - cachetools==5.0.0 - cffi==1.15.0 - - charset-normalizer==2.0.10 + - charset-normalizer==2.0.11 - click==8.0.3 - cloudpickle==2.0.0 - cryptography==36.0.1 - cycler==0.11.0 - dask==2022.1.0 - distributed==2022.1.0 + - einops==0.4.0 - fastapi==0.73.0 - ffmpy==0.3.0 - - fonttools==4.29.0 + - fonttools==4.29.1 - frozenlist==1.3.0 - fsspec==2022.1.0 - future==0.18.2 - - google-auth==2.5.0 + - google-auth==2.6.0 - google-auth-oauthlib==0.4.6 - gradio==2.7.5.2 - grpcio==1.43.0 @@ -58,9 +61,11 @@ dependencies: - idna==3.3 - importlib-metadata==4.10.1 - iniconfig==1.1.1 + - isodate==0.6.1 - jinja2==3.0.3 - joblib==1.1.0 - kiwisolver==1.3.2 + - laplace-torch==0.1a2 - locket==0.2.1 - markdown==3.3.6 - markdown2==2.4.2 @@ -69,22 +74,22 @@ dependencies: - monotonic==1.6 - msgpack==1.0.3 - multidict==6.0.2 - - numpy==1.22.1 - - oauthlib==3.1.1 + - numpy==1.22.2 + - oauthlib==3.2.0 - packaging==21.3 - pandas==1.4.0 - paramiko==2.9.2 - partd==1.2.0 - - pillow==9.0.0 + - pillow==9.0.1 - pluggy==1.0.0 - - protobuf==3.19.3 + - protobuf==3.19.4 - psutil==5.9.0 - py==1.11.0 - pyarrow==6.0.1 - pyasn1==0.4.8 - pyasn1-modules==0.2.8 - pycparser==2.21 - - pycryptodome==3.13.0 + - pycryptodome==3.14.1 - pydantic==1.9.0 - pydeprecate==0.3.1 - pydub==0.25.1 @@ -96,8 +101,9 @@ dependencies: - pytorch-lightning==1.5.9 - pytz==2021.3 - pyyaml==6.0 + - rdflib==6.1.1 - requests==2.27.1 - - requests-oauthlib==1.3.0 + - requests-oauthlib==1.3.1 - rsa==4.8 - scikit-learn==1.0.2 - scipy==1.7.3 @@ -110,16 +116,18 @@ dependencies: - tensorboard==2.8.0 - tensorboard-data-server==0.6.1 - tensorboard-plugin-wit==1.8.1 - - threadpoolctl==3.0.0 + - threadpoolctl==3.1.0 - toml==0.10.2 - toolz==0.11.2 - - torch==1.10.1+cu113 - - torchmetrics==0.7.0 + - torch==1.10.2 + - torchaudio==0.10.2 + - torchmetrics==0.7.1 + - torchvision==0.11.3 - tornado==6.1 - tqdm==4.62.3 - typing-extensions==4.0.1 - urllib3==1.26.8 - - uvicorn==0.17.0.post1 + - uvicorn==0.17.4 - werkzeug==2.0.2 - yarl==1.7.2 - zict==2.0.0 diff --git a/requirements.txt b/requirements.txt index b8ed753e..b23d3dc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,29 +3,32 @@ aiohttp==3.8.1 aiosignal==1.2.0 analytics-python==1.4.0 anyio==3.5.0 +asdfghjkl==0.1a2 asgiref==3.5.0 async-timeout==4.0.2 attrs==21.4.0 backoff==1.10.0 +backpack-for-pytorch==1.4.0 bcrypt==3.2.0 bokeh==2.4.2 cachetools==5.0.0 certifi==2021.10.8 cffi==1.15.0 -charset-normalizer==2.0.10 +charset-normalizer==2.0.11 click==8.0.3 cloudpickle==2.0.0 cryptography==36.0.1 cycler==0.11.0 dask==2022.1.0 distributed==2022.1.0 +einops==0.4.0 fastapi==0.73.0 ffmpy==0.3.0 -fonttools==4.29.0 +fonttools==4.29.1 
frozenlist==1.3.0 fsspec==2022.1.0 future==0.18.2 -google-auth==2.5.0 +google-auth==2.6.0 google-auth-oauthlib==0.4.6 gradio==2.7.5.2 grpcio==1.43.0 @@ -34,9 +37,11 @@ HeapDict==1.0.1 idna==3.3 importlib-metadata==4.10.1 iniconfig==1.1.1 +isodate==0.6.1 Jinja2==3.0.3 joblib==1.1.0 kiwisolver==1.3.2 +laplace-torch==0.1a2 locket==0.2.1 Markdown==3.3.6 markdown2==2.4.2 @@ -45,22 +50,22 @@ matplotlib==3.5.1 monotonic==1.6 msgpack==1.0.3 multidict==6.0.2 -numpy==1.22.1 -oauthlib==3.1.1 +numpy==1.22.2 +oauthlib==3.2.0 packaging==21.3 pandas==1.4.0 paramiko==2.9.2 partd==1.2.0 -Pillow==9.0.0 +Pillow==9.0.1 pluggy==1.0.0 -protobuf==3.19.3 +protobuf==3.19.4 psutil==5.9.0 py==1.11.0 pyarrow==6.0.1 pyasn1==0.4.8 pyasn1-modules==0.2.8 pycparser==2.21 -pycryptodome==3.13.0 +pycryptodome==3.14.1 pydantic==1.9.0 pyDeprecate==0.3.1 pydub==0.25.1 @@ -72,8 +77,9 @@ python-multipart==0.0.5 pytorch-lightning==1.5.9 pytz==2021.3 PyYAML==6.0 +rdflib==6.1.1 requests==2.27.1 -requests-oauthlib==1.3.0 +requests-oauthlib==1.3.1 rsa==4.8 scikit-learn==1.0.2 scipy==1.7.3 @@ -85,16 +91,18 @@ tblib==1.7.0 tensorboard==2.8.0 tensorboard-data-server==0.6.1 tensorboard-plugin-wit==1.8.1 -threadpoolctl==3.0.0 +threadpoolctl==3.1.0 toml==0.10.2 toolz==0.11.2 -torch==1.10.1+cu113 -torchmetrics==0.7.0 +torch==1.10.2 +torchaudio==0.10.2 +torchmetrics==0.7.1 +torchvision==0.11.3 tornado==6.1 tqdm==4.62.3 typing_extensions==4.0.1 urllib3==1.26.8 -uvicorn==0.17.0.post1 +uvicorn==0.17.4 Werkzeug==2.0.2 yarl==1.7.2 zict==2.0.0 From 983df6158a46cb874d0a7657473cbcec3f234514 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Wed, 23 Feb 2022 14:33:35 +0100 Subject: [PATCH 2/6] GridSearch in shell script is implemented. --- core/executer.py | 13 ++++--- main.py | 6 ++-- scripts/config_runner.sh | 4 +-- scripts/dummy.sh | 44 ----------------------- scripts/experiments.sh | 16 --------- scripts/grid_search_starter.sh | 64 +++++++++++++--------------------- 6 files changed, 35 insertions(+), 112 deletions(-) delete mode 100644 scripts/dummy.sh delete mode 100644 scripts/experiments.sh diff --git a/core/executer.py b/core/executer.py index a0739823..7169ec62 100644 --- a/core/executer.py +++ b/core/executer.py @@ -4,6 +4,7 @@ from .helper_classes import LabelRelaxationLoss, LabelSmoothingLossCanonical from .dataset_classes import StandardDataModule, KvsAll, CVDataModule from .knowledge_graph import KG +from .callbacks import PrintCallback import torch from torch import nn from torch.nn import functional as F @@ -18,7 +19,6 @@ import dask.dataframe as dd import time from pytorch_lightning.plugins import DDPPlugin -from pytorch_lightning.callbacks import Callback from pytorch_lightning import Trainer, seed_everything import logging from collections import defaultdict @@ -28,6 +28,7 @@ warnings.filterwarnings(action="ignore", category=DeprecationWarning) seed_everything(1, workers=True) + # TODO: Execute can inherit from Trainer and Evaluator Classes # By doing so we can increase the modularity of our code. class Execute: @@ -118,11 +119,13 @@ def train_and_eval(self) -> BaseKGE: 2c. Train a model """ print('------------------- Train & Eval -------------------') - # 1. Create Pytorch-lightning Trainer object from input configuration + + if self.args.gpus: self.trainer = pl.Trainer.from_argparse_args(self.args, plugins=[DDPPlugin(find_unused_parameters=False)]) else: - self.trainer = pl.Trainer.from_argparse_args(self.args) + self.trainer = pl.Trainer.from_argparse_args(self.args,callbacks=[PrintCallback()]) + # 2. 
Check whether validation and test datasets are available. if self.dataset.is_valid_test_available(): if self.args.scoring_technique == 'NegSample': @@ -249,11 +252,7 @@ def get_batch_1_to_N(self, input_vocab, triples, idx, output_dim) -> Tuple[np.ar @staticmethod def model_fitting(trainer, model, train_dataloaders) -> None: - print(model) - print(model.summarize()) - print("Model fitting...") trainer.fit(model, train_dataloaders=train_dataloaders) - print("Done!") def training_kvsall(self): """ diff --git a/main.py b/main.py index ff103521..aeecebe2 100644 --- a/main.py +++ b/main.py @@ -20,7 +20,7 @@ def argparse_default(description=None): # Models. parser.add_argument("--model", type=str, default='DistMult', - help="Available models: KronE, ConEx, ConvQ, ConvO, QMult, OMult, Shallom, ConEx, ComplEx, DistMult,KPDistMult") + help="Available models: ConEx, ConvQ, ConvO, QMult, OMult, Shallom, ConEx, ComplEx, DistMult, KronE, KPDistMult") # Training Parameters parser.add_argument("--num_epochs", type=int, default=10, help='Number of epochs for training. ' 'This disables max_epochs and ' @@ -30,7 +30,7 @@ def argparse_default(description=None): parser.add_argument("--label_smoothing_rate", type=float, default=None, help='None for not using it.') parser.add_argument("--label_relaxation_rate", type=float, default=None, help='None for not using it.') parser.add_argument("--add_noise_rate", type=float, default=None, help='None for not using it. ' - '.1 means extand train data by adding 10% random data') + '.1 means extend train data by adding 10% random data') # Model Parameters # Hyperparameters @@ -62,7 +62,7 @@ def argparse_default(description=None): parser.add_argument('--neg_ratio', type=int, default=0) # Data Augmentation. parser.add_argument('--num_folds_for_cv', type=int, default=0, help='Number of folds in k-fold cross validation.' 
- 'If >2 ,no evaluation scenario is applied implies no evaluation.') + 'If >2 ,no evaluation scenario is applied implies no evaluation.') # This is a workaround for read if description is None: return parser.parse_args() diff --git a/scripts/config_runner.sh b/scripts/config_runner.sh index b1af149a..75c9cf41 100644 --- a/scripts/config_runner.sh +++ b/scripts/config_runner.sh @@ -1,3 +1 @@ -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/experiments.sh $1 $2 -# (1) KG PATH -# (2) embedding dim +python "$1" --path_dataset_folder "$2" --model "$3" --num_epochs "$4" --embedding_dim "$5" \ No newline at end of file diff --git a/scripts/dummy.sh b/scripts/dummy.sh deleted file mode 100644 index a3317273..00000000 --- a/scripts/dummy.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/sh - -# shellcheck disable=SC2164 -dir_parent="$(cd "$PWD"; cd ..; pwd)" - -path_script="$dir_parent/main.py" - -models_name="Shallom" - -path_dataset_folder="$dir_parent/KGs/EN_FR_15K_V1" - -storage_path=$models_name -# shellcheck disable=SC2039 -storage_path+="_EN_FR_15K_V1" -mkdir $storage_path -python "$path_script" --path_dataset_folder "$path_dataset_folder" --storage_path "$storage_path" --model "$models_name" --embedding_dim 300 --num_epochs 300 > "$storage_path/$models_name.log" - - -path_dataset_folder="$dir_parent/KGs/EN_FR_15K_V2" -storage_path=$models_name -# shellcheck disable=SC2039 -storage_path+="_EN_FR_15K_V2" -mkdir $storage_path -python "$path_script" --path_dataset_folder "$path_dataset_folder" --storage_path "$storage_path" --model "$models_name" --embedding_dim 300 --num_epochs 300 > "$storage_path/$models_name.log" - - - -path_dataset_folder="$dir_parent/KGs/EN_FR_100K_V1" -models_name="Shallom" -storage_path=$models_name -# shellcheck disable=SC2039 -storage_path+="_EN_FR_100K_V1" -mkdir $storage_path -python "$path_script" --path_dataset_folder "$path_dataset_folder" --storage_path "$storage_path" --model "$models_name" --embedding_dim 300 --num_epochs 300 > "$storage_path/$models_name.log" - - - - -path_dataset_folder="$dir_parent/KGs/EN_FR_100K_V2" -storage_path=$models_name -# shellcheck disable=SC2039 -storage_path+="_EN_FR_100K_V2" -mkdir $storage_path -python "$path_script" --path_dataset_folder "$path_dataset_folder" --storage_path "$storage_path" --model "$models_name" --embedding_dim 300 --num_epochs 300 > "$storage_path/$models_name.log" diff --git a/scripts/experiments.sh b/scripts/experiments.sh deleted file mode 100644 index b42c5add..00000000 --- a/scripts/experiments.sh +++ /dev/null @@ -1,16 +0,0 @@ -dataset_path="KGs/$1" -embedding_dim=$2 -storage_path="Exp_Results_$1_$2" -lr=.01 -num_epochs=1000 -storage_path="Exp_Results_$1_$2" -mkdir "$storage_path" -echo "Number of epochs:$num_epochs" -echo "Learning rate:$lr" -echo "embedding_dim :$embedding_dim" - -python main.py --storage_path "$storage_path" --path_dataset_folder "$dataset_path" --model 'DistMult' --lr $lr --embedding_dim "$embedding_dim" --num_epochs $num_epochs> "$storage_path/DistMult.log" -python main.py --storage_path "$storage_path" --path_dataset_folder "$dataset_path" --model 'KPDistMult' --lr $lr --embedding_dim "$embedding_dim" --num_epochs $num_epochs > "$storage_path/KPDistMult.log" -python main.py --storage_path "$storage_path" --path_dataset_folder "$dataset_path" --model 'KronE' --lr $lr --embedding_dim "$embedding_dim" --num_epochs $num_epochs > "$storage_path/KronE.log" -python core/collect_results_from_logs.py --logs "$storage_path/DistMult.log" "$storage_path/KPDistMult.log" "$storage_path/KronE.log" 
-echo 'Done!' \ No newline at end of file diff --git a/scripts/grid_search_starter.sh b/scripts/grid_search_starter.sh index 8ecd41ed..d5115d76 100644 --- a/scripts/grid_search_starter.sh +++ b/scripts/grid_search_starter.sh @@ -1,40 +1,26 @@ -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 4 > UMLS_4.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 9 > UMLS_9.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 16 > UMLS_16.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 25 > UMLS_25.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 36 > UMLS_36.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 49 > UMLS_49.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 64 > UMLS_64.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 81 > UMLS_81.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 100 > UMLS_100.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 121 > UMLS_121.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 144 > UMLS_144.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 169 > UMLS_169.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 196 > UMLS_196.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 225 > UMLS_225.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 256 > UMLS_256.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 289 > UMLS_289.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 324 > UMLS_324.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 361 > UMLS_361.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "UMLS" 400 > UMLS_400.log - -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 4 > KINSHIP_4.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 9 > KINSHIP_9.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 16 > KINSHIP_16.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 25 > KINSHIP_25.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 36 > KINSHIP_36.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 49 > KINSHIP_49.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 64 > KINSHIP_64.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 81 > KINSHIP_81.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 100 > KINSHIP_100.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 121 > KINSHIP_121.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 144 > KINSHIP_144.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 169 > KINSHIP_169.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 196 > KINSHIP_196.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 225 > KINSHIP_225.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 256 > 
KINSHIP_256.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 289 > KINSHIP_289.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 324 > KINSHIP_324.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 361 > KINSHIP_361.log -/bin/bash /home/demir/Desktop/Softwares/DAIKIRI_Emb/config_runner.sh "KINSHIP" 400 > KINSHIP_400.log +#!/bin/sh +# (1) main working directory +# shellcheck disable=SC2164 +main_wd="$(cd "$PWD"; cd ..; pwd)" +# (2) Script in (1) +python_script_path="$main_wd/main.py" +# shellcheck disable=SC2043 +for kgname in "UMLS" "KINSHIP" +do + kg_path="$main_wd/KGs/$kgname" + for model in "QMult" "OMult" + do + for epoch in 1 + do + for dim in 25 50 + do + # shellcheck disable=SC2154 + log_name="$kg_path-$model-$epoch-$dim" + echo "Running $log_name configuration" + /bin/bash "$PWD/config_runner.sh" "$python_script_path" "$kg_path" "$model" "$epoch" "$dim" > "$log_name.log" + echo "Done!" + done + done + done +done From 3446e6cafc8fd325bf2b6341e853510cdee10f10 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Wed, 23 Feb 2022 14:38:29 +0100 Subject: [PATCH 3/6] Callbacks are added. --- core/callbacks.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 core/callbacks.py diff --git a/core/callbacks.py b/core/callbacks.py new file mode 100644 index 00000000..ef61f3a3 --- /dev/null +++ b/core/callbacks.py @@ -0,0 +1,15 @@ +# 1. Create Pytorch-lightning Trainer object from input configuration +from pytorch_lightning.callbacks import Callback + + +class PrintCallback(Callback): + def __init__(self): + super().__init__() + + def on_train_start(self, trainer, model): + print(model) + print(model.summarize()) + print("Training is started!") + + def on_train_end(self, trainer, pl_module): + print("\nTraining is done.") From de532788109f72430af469a9ef6507b511f2ba49 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Wed, 23 Feb 2022 14:42:54 +0100 Subject: [PATCH 4/6] Typo fixed in the grid search. --- scripts/grid_search_starter.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/grid_search_starter.sh b/scripts/grid_search_starter.sh index d5115d76..c891c0f1 100644 --- a/scripts/grid_search_starter.sh +++ b/scripts/grid_search_starter.sh @@ -16,9 +16,9 @@ do for dim in 25 50 do # shellcheck disable=SC2154 - log_name="$kg_path-$model-$epoch-$dim" - echo "Running $log_name configuration" - /bin/bash "$PWD/config_runner.sh" "$python_script_path" "$kg_path" "$model" "$epoch" "$dim" > "$log_name.log" + config_name="$kgname-$model-$epoch-$dim" + echo "Running $config_name configuration" + /bin/bash "$PWD/config_runner.sh" "$python_script_path" "$kg_path" "$model" "$epoch" "$dim" > "$config_name.log" echo "Done!" done done From 8d0c9c5bd37a38510fbf2341252baae1b02dd0b0 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Thu, 24 Feb 2022 11:52:31 +0100 Subject: [PATCH 5/6] Typo fix at reciprocal triples. 
--- core/knowledge_graph.py | 3 ++- run.sh | 58 ----------------------------------------- 2 files changed, 2 insertions(+), 59 deletions(-) delete mode 100644 run.sh diff --git a/core/knowledge_graph.py b/core/knowledge_graph.py index 0cec48a8..3a44c08a 100644 --- a/core/knowledge_graph.py +++ b/core/knowledge_graph.py @@ -67,7 +67,7 @@ def __init__(self, data_dir: str = None, deserialize_flag: str = None, large_kg_ pd.DataFrame({'subject': self.test_set['object'], 'relation': self.test_set['relation'].map( lambda x: x + '_inverse'), - 'object': self.valid_set['subject']})], ignore_index=True) + 'object': self.test_set['subject']})], ignore_index=True) print('Done !\n') if add_noise_rate is not None: @@ -90,6 +90,7 @@ def __init__(self, data_dir: str = None, deserialize_flag: str = None, large_kg_ del list_of_entities assert s + num_noisy_triples == len(self.train_set) + # 3. Concatenate dataframes. print(f'[4 / 14] Concatenating data to obtain index...') df_str_kg = pd.concat([self.train_set, self.valid_set, self.test_set], ignore_index=True) diff --git a/run.sh b/run.sh deleted file mode 100644 index 27ae801e..00000000 --- a/run.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/sh -python --version -python -u -c 'import torch; print(torch.__version__)' -echo "Start Training......" - -# To deserialize parsed KG. -#python main.py --path_dataset_folder 'KGs/UMLS' --model 'ConEx' --deserialize_flag '/home/demir/Desktop/work/DAIKIRI_Emb/DAIKIRI_Storage/2021-05-12 12:11:12.095319' -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'ConEx' -python main.py --path_dataset_folder 'KGs/Family' --model 'ConEx' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ConEx' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ConEx' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'Shallom' --num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'Shallom' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'Shallom' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'Shallom' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'QMult' --num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'QMult' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'QMult' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'QMult' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'OMult' --num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'OMult' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'OMult' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 
'KGs/Family' --model 'OMult' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'ConvQ' --num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'ConvQ' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ConvQ' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ConvQ' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'ConvO' --max_num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'ConvO' --max_num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ConvO' --max_num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ConvO' --max_num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'ComplEx' --num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'ComplEx' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ComplEx' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'ComplEx' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - -# Checked -python main.py --path_dataset_folder 'KGs/UMLS' --model 'DistMult' --num_epochs 3 --scoring_technique 'KvsAll' -python main.py --path_dataset_folder 'KGs/Family' --model 'DistMult' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'DistMult' --num_epochs 3 --scoring_technique 'KvsAll' --num_folds_for_cv 3 -python main.py --path_dataset_folder 'KGs/Family' --model 'DistMult' --num_epochs 3 --scoring_technique 'NegSample' --negative_sample_ratio 3 --num_folds_for_cv 3 - -echo "Ends......" \ No newline at end of file From 117193459a53c78236987f754f3f64f2aa046877 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Thu, 24 Feb 2022 12:30:38 +0100 Subject: [PATCH 6/6] Eval at continues training added. 
--- continuous_training.py | 2 +- core/executer.py | 4 +- core/knowledge_graph.py | 193 +++++++++------------------------------- 3 files changed, 46 insertions(+), 153 deletions(-) diff --git a/continuous_training.py b/continuous_training.py index 5b2cf8b6..5860ffca 100644 --- a/continuous_training.py +++ b/continuous_training.py @@ -20,7 +20,7 @@ def __init__(self, args): if __name__ == '__main__': parser = argparse.ArgumentParser(add_help=False) # Dataset and storage related - parser.add_argument("--path_experiment_folder", type=str, default="DAIKIRI_Storage/2022-02-04 15:02:25.958956", + parser.add_argument("--path_experiment_folder", type=str, default="DAIKIRI_Storage/2022-02-24 12:17:41.555572", help="The path of a folder containing pretrained model") # Training Parameters parser.add_argument("--num_epochs", type=int, default=10, diff --git a/core/executer.py b/core/executer.py index 7169ec62..56c0d2aa 100644 --- a/core/executer.py +++ b/core/executer.py @@ -70,10 +70,10 @@ def read_input_data(args) -> KG: return kg @staticmethod - def reload_input_data(p: str) -> KG: + def reload_input_data(storage_path: str) -> KG: # 1. Read & Parse input data print("1. Reload Parsed Input Data") - return KG(deserialize_flag=p) + return KG(deserialize_flag=storage_path) def start(self) -> dict: """ diff --git a/core/knowledge_graph.py b/core/knowledge_graph.py index 3a44c08a..c1dcc152 100644 --- a/core/knowledge_graph.py +++ b/core/knowledge_graph.py @@ -1,5 +1,5 @@ import time -from typing import Dict, List, Generator +from typing import Dict, List from collections import defaultdict import numpy as np import pickle @@ -14,6 +14,15 @@ class KG: + """ Knowledge Graph Class + 1- Reading : Large input data is read via DASK + 2- Cleaning & Preprocessing : + Remove triples with literals if exists + Apply reciprocal data augmentation triples into train, valid and test datasets + Add noisy triples (random facts sampled from all possible triples E x R x E) + 3- Serializing and Deserializing in parquet format + """ + def __init__(self, data_dir: str = None, deserialize_flag: str = None, large_kg_parse=False, add_reciprical=False, eval_model=True, read_only_few: int = None, sample_triples_ratio: float = None, path_for_serialization: str = None, add_noise_rate: float = None): @@ -84,7 +93,7 @@ def __init__(self, data_dir: str = None, deserialize_flag: str = None, large_kg_ pd.unique(self.train_set[['relation']].values.ravel('K')), num_noisy_triples), 'object': np.random.choice(list_of_entities, num_noisy_triples)} - ) + ) ], ignore_index=True) del list_of_entities @@ -103,7 +112,7 @@ def __init__(self, data_dir: str = None, deserialize_flag: str = None, large_kg_ index=ordered_list) print('Done!\n') - # 5. Create a bijection mapping from relations to to integer indexes. + # 5. Create a bijection mapping from relations to integer indexes. 
print('[6 / 14] Creating a mapping from relations to integer indexes...') ordered_list = pd.unique(df_str_kg['relation'].values.ravel('K')) self.relation_to_idx = pd.DataFrame(data=np.arange(len(ordered_list)), @@ -284,17 +293,15 @@ def index(data: List[List], add_reciprical=False) -> (Dict, Dict, Dict, Dict, Di return entity_idxs, relation_idxs, er_vocab, pe_vocab, ee_vocab - def deserialize(self, p: str) -> None: - """ - Deserialize data - """ + def deserialize(self, storage_path: str) -> None: + """ Deserialize data """ print('Deserializing compressed entity integer mapping...') - self.entity_to_idx = ddf.read_parquet(p + '/entity_to_idx.gzip').compute() + self.entity_to_idx = ddf.read_parquet(storage_path + '/entity_to_idx.gzip').compute() print('Done!\n') self.num_entities = len(self.entity_to_idx) print('Deserializing compressed relation integer mapping...') - self.relation_to_idx = ddf.read_parquet(p + '/relation_to_idx.gzip').compute() + self.relation_to_idx = ddf.read_parquet(storage_path + '/relation_to_idx.gzip').compute() self.num_relations = len(self.entity_to_idx) print('Done!\n') @@ -306,16 +313,35 @@ def deserialize(self, p: str) -> None: # 10. Serialize (9). print('Deserializing integer mapped data and mapping it to numpy ndarray...') - self.train_set = ddf.read_parquet(p + '/idx_train_df.gzip').values.compute() + self.train_set = ddf.read_parquet(storage_path + '/idx_train_df.gzip').values.compute() print('Done!\n') + try: + print('Deserializing integer mapped data and mapping it to numpy ndarray...') + self.valid_set = ddf.read_parquet(storage_path + '/idx_valid_df.gzip').values.compute() + print('Done!\n') + except FileNotFoundError: + print('No valid data found') + self.valid_set = pd.DataFrame() - print('Deserializing integer mapped data and mapping it to numpy ndarray...') - self.valid_set = ddf.read_parquet(p + '/idx_valid_df.gzip').values.compute() - print('Done!\n') + try: + print('Deserializing integer mapped data and mapping it to numpy ndarray...') + self.test_set = ddf.read_parquet(storage_path + '/idx_test_df.gzip').values.compute() + print('Done!\n') + except FileNotFoundError: + print('No test data found') + self.test_set = pd.DataFrame() - print('Deserializing integer mapped data and mapping it to numpy ndarray...') - self.test_set = ddf.read_parquet(p + '/idx_test_df.gzip').values.compute() - print('Done!\n') + print(storage_path) + with open(storage_path+'/configuration.json', 'r') as f: + args = json.load(f) + + if args['eval']: + if len(self.valid_set) > 0 and len(self.test_set) > 0: + # 16. Create a bijection mapping from subject-relation pairs to tail entities. + data = np.concatenate([self.train_set, self.valid_set, self.test_set]) + else: + data = self.train_set + self.er_vocab = get_er_vocab(data) @staticmethod def index_parallel(data: List[List], add_reciprical=False) -> (Dict, Dict, Dict, Dict, Dict): @@ -461,75 +487,6 @@ def relations_str(self) -> List: """ return list(self.relation_to_idx.keys()) - # Not used anymore. - def load_data(self, data_path, add_reciprical=True, load_only=None): - raise NotImplemented() - # line can be 1 or 2 - # a) <...> <...> <...> . - # b) <...> <...> "..." . - # c) ... ... ... - # (a) and (b) correspond to the N-Triples format - # (c) corresponds to the format of current link prediction benchmark datasets. - print(f'{data_path} is being read.') - try: - data = [] - with open(data_path, "r") as f: - for line in f: - # 1. Ignore lines with *** " *** or does only contain 2 or less characters. 
- if '"' in line or len(line) < 3: - continue - - # 2. Tokenize(<...> <...> <...> .) => ['<...>', '<...>','<...>','.'] - # Tokenize(... ... ...) => ['...', '...', '...',] - decomposed_list_of_strings = line.split() - - # 3. Sanity checking. - try: - assert len(decomposed_list_of_strings) == 3 or len(decomposed_list_of_strings) == 4 - except AssertionError: - print(f'Invalid input triple {line}. It can not be split into 3 or 4 items') - print('This triple will be ignored') - continue - # 4. Storing - if len(decomposed_list_of_strings) == 4: - assert decomposed_list_of_strings[-1] == '.' - data.append(self.ntriple_parser(decomposed_list_of_strings)) - if len(decomposed_list_of_strings) == 3: - data.append(decomposed_list_of_strings) - - if load_only is not None: - if len(data) == load_only: - break - - if len(data) % 50_000_000 == 0: - print(f'Size of already parsed data {len(data)}') - - except FileNotFoundError: - print(f'{data_path} is not found') - return [] - if add_reciprical: - data += [[i[2], i[1] + "_reverse", i[0]] for i in data] - return data - - def process(self, x): - raise NotImplemented - # 2. Tokenize(<...> <...> <...> .) => ['<...>', '<...>','<...>','.'] - # Tokenize(... ... ...) => ['...', '...', '...',] - decomposed_list_of_strings = x.split() - - # 3. Sanity checking. - try: - assert len(decomposed_list_of_strings) == 3 or len(decomposed_list_of_strings) == 4 - except AssertionError: - print(f'Invalid input triple {x}. It can not be split into 3 or 4 items') - print('This triple will be ignored') - # 4. Storing - if len(decomposed_list_of_strings) == 4: - assert decomposed_list_of_strings[-1] == '.' - decomposed_list_of_strings = self.ntriple_parser(decomposed_list_of_strings) - if len(decomposed_list_of_strings) == 3: - return decomposed_list_of_strings - @staticmethod def ntriple_parser(l: List) -> List: """ @@ -543,6 +500,7 @@ def ntriple_parser(l: List) -> List: :param l: :return: """ + raise NotImplementedError() assert l[3] == '.' 
try: s, p, o, _ = l[0], l[1], l[2], l[3] @@ -661,68 +619,3 @@ def triple_indexing(self, large_kg_parse) -> None: else: self.valid = np.array([]) self.test = np.array([]) - - -""" -def serialize(self, p: str) -> None: - # Serialize entities and relations sotred in pandas dataframe and predicates - - assert len(self.entity_to_idx) == self.num_entities - assert len(self.relation_to_idx) == self.num_relations - - # Store data in parquet format - if len(self.train_set) > 0: - self.train_set.to_parquet(p + '/train_df.gzip', compression='gzip') - # Store as numpy - self.train_set['subject'] = self.train_set['subject'].map(lambda x: self.entity_to_idx[x]) - self.train_set['relation'] = self.train_set['relation'].map(lambda x: self.relation_to_idx[x]) - self.train_set['object'] = self.train_set['object'].map(lambda x: self.entity_to_idx[x]) - self.train_set.to_parquet(p + '/idx_train_df.gzip', compression='gzip') - self.train_set = self.train_set.values - # Sanity checking - assert self.num_entities > max(self.train_set[0]) - assert self.num_entities > max(self.train_set[0]) - assert self.num_entities > max(self.train_set[2]) - assert self.num_entities > max(self.train_set[2]) - - assert isinstance(self.train_set[0], np.ndarray) - assert isinstance(self.train_set[0][0], np.int64) - assert isinstance(self.train_set[0][1], np.int64) - assert isinstance(self.train_set[0][2], np.int64) - - if len(self.valid_set) > 0: - self.valid_set.to_parquet(p + '/valid_df.gzip', compression='gzip') - self.valid_set['subject'] = self.valid_set['subject'].map(lambda x: self.entity_to_idx[x]) - self.valid_set['relation'] = self.valid_set['relation'].map(lambda x: self.relation_to_idx[x]) - self.valid_set['object'] = self.valid_set['object'].map(lambda x: self.entity_to_idx[x]) - self.valid_set.to_parquet(p + '/idx_valid_df.gzip', compression='gzip') - self.valid_set = self.valid_set.values - # Sanity checking - assert self.num_entities > max(self.valid_set[0]) - assert self.num_entities > max(self.valid_set[0]) - assert self.num_entities > max(self.valid_set[2]) - assert self.num_entities > max(self.valid_set[2]) - - assert isinstance(self.valid_set[0], np.ndarray) - assert isinstance(self.valid_set[0][0], np.int64) - assert isinstance(self.valid_set[0][1], np.int64) - assert isinstance(self.valid_set[0][2], np.int64) - - if len(self.test_set) > 0: - self.test_set.to_parquet(p + '/test_df.gzip', compression='gzip') - self.test_set['subject'] = self.test_set['subject'].map(lambda x: self.entity_to_idx[x]) - self.test_set['relation'] = self.test_set['relation'].map(lambda x: self.relation_to_idx[x]) - self.test_set['object'] = self.test_set['object'].map(lambda x: self.entity_to_idx[x]) - self.test_set.to_parquet(p + '/idx_test_df.gzip', compression='gzip') - self.test_set = self.test_set.values - # Sanity checking - assert self.num_entities > max(self.test_set[0]) - assert self.num_entities > max(self.test_set[0]) - assert self.num_entities > max(self.test_set[2]) - assert self.num_entities > max(self.test_set[2]) - - assert isinstance(self.test_set[0], np.ndarray) - assert isinstance(self.test_set[0][0], np.int64) - assert isinstance(self.test_set[0][1], np.int64) - assert isinstance(self.test_set[0][2], np.int64) -"""
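
For reference, the reciprocal data augmentation corrected in PATCH 5/6 can be sketched in isolation. The snippet below is a minimal, hypothetical example with a made-up toy DataFrame (it is not part of the patches); it mirrors the fixed logic in core/knowledge_graph.py, where every test triple (s, r, o) yields an inverse triple (o, r_inverse, s) whose subjects and objects both come from the test split.

```
import pandas as pd

# Toy test split with the same column layout used in core/knowledge_graph.py.
test_set = pd.DataFrame({'subject': ['barack', 'berlin'],
                         'relation': ['bornIn', 'capitalOf'],
                         'object': ['hawaii', 'germany']})

# For every (s, r, o) add (o, r_inverse, s). Both columns of the inverse
# triples are taken from the *test* split; the bug fixed in PATCH 5/6 took
# the objects of the inverse test triples from the validation split instead.
reciprocal = pd.DataFrame({'subject': test_set['object'],
                           'relation': test_set['relation'].map(lambda x: x + '_inverse'),
                           'object': test_set['subject']})

augmented_test_set = pd.concat([test_set, reciprocal], ignore_index=True)
print(augmented_test_set)
```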