Merge branch 'feature/ScriptsForGridSearch' into develop
Demirrr committed Feb 22, 2022
2 parents 3e37b34 + 827da56 commit e06de9a
Showing 9 changed files with 248 additions and 409 deletions.
42 changes: 27 additions & 15 deletions README.md
@@ -46,21 +46,34 @@ This share link expires in 72 hours. For free permanent hosting, check out Space
```
![Deployment of the QMult model family](core/figures/deploy_qmult_family.png)

### Available Models
1. [DistMult](https://arxiv.org/pdf/1412.6575.pdf)
2. [ComplEx](https://arxiv.org/pdf/1606.06357.pdf)
3. [Shallom](https://arxiv.org/pdf/2101.09090.pdf)
4. [ConEx](https://openreview.net/forum?id=6T45-4TFqaX&invitationId=eswc-conferences.org/ESWC/2021/Conference/Research_Track/Paper49/-/Camera_Ready_Revision&referrer=%5BTasks%5D(%2Ftasks))
5. [QMult](https://proceedings.mlr.press/v157/demir21a.html)
6. [OMult](https://proceedings.mlr.press/v157/demir21a.html)
7. [ConvQ](https://proceedings.mlr.press/v157/demir21a.html)
8. [ConvO](https://proceedings.mlr.press/v157/demir21a.html)
9. Contact us to add your favorite one :)


### Pretrained Models
TODO: Add a script to load and deploy a pretrained model
## Pre-trained Models
Please contact ```[email protected]``` or ```[email protected]``` if you lack the hardware resources to obtain embeddings of a specific knowledge graph.
- [English-French DBpedia Shallom embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/shallom/Shallom_EN_FR_15K_V1.zip)
- [YAGO3-10 ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/YAGO3-10.zip)
- [FB15K-237 ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/FB15K-237.zip)
- [FB15K ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/FB15K.zip)
- [WN18RR ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/WN18RR.zip)
- [WN18 ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/WN18.zip)
- [Forte ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/ConEx_Forte.zip)
- [Hepatitis ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/ConEx_Hepatitis.zip)
- [Lymphography ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/ConEx_Lymphography.zip)
- [Mammographic ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/ConEx_Mammographic.zip)
- [Animals ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/Animals.zip)
- For more, see [Hobbit Data](https://hobbitdata.informatik.uni-leipzig.de/KGE/)
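
A minimal sketch of loading one of these downloads, assuming the archive contains a CSV of entity embeddings indexed by entity name (the file name and the entity label below are placeholders, not files guaranteed to be in every archive):

```python
import pandas as pd

# Placeholder file name: use the CSV found inside the downloaded archive.
entity_emb = pd.read_csv('ConEx_entity_embeddings.csv', index_col=0)

print(entity_emb.shape)  # (number of entities, embedding dimension)
# Look up the vector of a single entity, if that label exists in the index.
vector = entity_emb.loc['http://example.org/Entity']
```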

### Available Models
1. Multiplicative KGE models:
1. [DistMult](https://arxiv.org/pdf/1412.6575.pdf)
2. [ComplEx](https://arxiv.org/pdf/1606.06357.pdf)
3. [QMult](https://proceedings.mlr.press/v157/demir21a.html)
4. [OMult](https://proceedings.mlr.press/v157/demir21a.html)
2. Feed-forward neural models:
1. [Shallom](https://arxiv.org/pdf/2101.09090.pdf)
3. Convolutional neural models:
1. [ConEx](https://openreview.net/forum?id=6T45-4TFqaX&invitationId=eswc-conferences.org/ESWC/2021/Conference/Research_Track/Paper49/-/Camera_Ready_Revision&referrer=%5BTasks%5D(%2Ftasks))
2. [ConvQ](https://proceedings.mlr.press/v157/demir21a.html)
3. [ConvO](https://proceedings.mlr.press/v157/demir21a.html)
4. Contact us to add your favorite one :)


### Training
@@ -146,7 +159,6 @@ url={https://openreview.net/forum?id=6T45-4TFqaX}}
pages={179--182},
year={2021},
organization={IEEE}
```

For any questions or wishes, please contact: ```[email protected]``` or ```[email protected]```
133 changes: 68 additions & 65 deletions core/executer.py
@@ -1,7 +1,7 @@
import warnings
import os
from .models import *
from .helper_classes import LabelRelaxationLoss,LabelSmoothingLossCanonical
from .helper_classes import LabelRelaxationLoss, LabelSmoothingLossCanonical
from .dataset_classes import StandardDataModule, KvsAll, CVDataModule
from .knowledge_graph import KG
import torch
@@ -28,9 +28,8 @@
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
seed_everything(1, workers=True)


# TODO: later, measure the time spent on every 'Done!' operation

# TODO: Execute can inherit from Trainer and Evaluator Classes
# By doing so we can increase the modularity of our code.
class Execute:
def __init__(self, args, continuous_training=False):
# (1) Process arguments and sanity checking
@@ -101,7 +100,7 @@ def start(self) -> dict:
else:
message = f'{total_runtime / (60 ** 2):.3f} hours'
self.report['Runtime'] = message
self.report.update(extract_model_summary(trained_model.summarize()))

print(f'Runtime of {trained_model.name}:', total_runtime)
print(f'NumParam of {trained_model.name}:', self.report["NumParam"])
# print(f'Estimated of {trained_model.name}:', self.report["EstimatedSizeMB"])
@@ -172,6 +171,30 @@ def config_kge_sanity_checking(self):
if self.args.scoring_technique == 'KvsAll':
self.args.neg_ratio = None

def save_embeddings(self, embeddings: np.ndarray, indexes, path: str) -> None:
"""
Save embeddings to disk as CSV, falling back to a Parquet dataset for very large frames.
:param embeddings: 2D array of embedding vectors.
:param indexes: entity or relation labels used as the row index.
:param path: target CSV file path.
:return: None
"""
try:
df = pd.DataFrame(embeddings, index=indexes)
del embeddings
num_mb = df.memory_usage(index=True, deep=True).sum() / (10 ** 6)
if num_mb > 10 ** 6:
df = dd.from_pandas(df, npartitions=max(1, len(df) // 100))
# Parquet requires string column names.
df.columns = df.columns.astype(str)
df.to_parquet(path.replace('.csv', ''))
else:
df.to_csv(path)
except (KeyError, AttributeError) as e:
print('Exception occurred at saving embeddings. Computation will continue')
print(e)
del df

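For orientation, a short sketch of reading the stored embeddings back with pandas and dask, in line with the dd.read_parquet hint left as a comment in store further below (paths are placeholders):

```python
import pandas as pd
import dask.dataframe as dd

# Small embeddings are written as a single CSV file.
entity_emb = pd.read_csv('path/to/QMult_entity_embeddings.csv', index_col=0)

# Very large embeddings are written as a Parquet dataset via dask.
relation_emb = dd.read_parquet('path/to/QMult_relation_embeddings').compute()
```
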
def store(self, trained_model) -> None:
"""
Store the trained model and save its embeddings into csv files.
@@ -186,45 +209,25 @@ def store(self, trained_model) -> None:
with open(self.args.full_storage_path + '/configuration.json', 'w') as file_descriptor:
temp = vars(self.args)
json.dump(temp, file_descriptor)
print('Saving embeddings..')
# TODO: Find a faster way to store embeddings.
if trained_model.name == 'Shallom':
entity_emb = trained_model.get_embeddings()
else:
# See available memory and decide whether embeddings are stored separately or not.
available_memory = [i.split() for i in os.popen('free -h').read().splitlines()][1][-1] # ,e.g., 10Gi
available_memory_mb = float(available_memory[:-2]) * 1000
self.report.update(extract_model_summary(trained_model.summarize()))

if available_memory_mb * .01 > self.report['EstimatedSizeMB']:
""" We have enough space for data conversion"""
print('Saving embeddings..')
entity_emb, relation_ebm = trained_model.get_embeddings()
try:
df = pd.DataFrame(relation_ebm, index=self.dataset.relations_str)
df.columns = df.columns.astype(str)
num_mb = df.memory_usage(index=True, deep=True).sum() / (10 ** 6)
if num_mb > 10 ** 6:
df = dd.from_pandas(df, npartitions=len(df) / 100)
# PARQUET wants columns to be stn
df.columns = df.columns.astype(str)
df.to_parquet(self.args.full_storage_path + '/' + trained_model.name + '_relation_embeddings')
# TO READ PARQUET FILE INTO PANDAS
# m=dd.read_parquet(self.storage_path + '/' + trained_model.name + '_relation_embeddings').compute()
else:
df.to_csv(self.args.full_storage_path + '/' + trained_model.name + '_relation_embeddings.csv')
except KeyError or AttributeError as e:
print('Exception occurred at saving relation embeddings. Computation will continue')
print(e)

# Free mem del
del df
self.save_embeddings(entity_emb, indexes=self.dataset.entities_str,
path=self.args.full_storage_path + '/' + trained_model.name + '_entity_embeddings.csv')
del entity_emb
if relation_ebm is not None:
self.save_embeddings(relation_ebm, indexes=self.dataset.relations_str,
path=self.args.full_storage_path + '/' + trained_model.name + '_relation_embeddings.csv')
del relation_ebm
try:
df = pd.DataFrame(entity_emb, index=self.dataset.entities_str)
num_mb = df.memory_usage(index=True, deep=True).sum() / (10 ** 6)
if num_mb > 10 ** 6:
df = dd.from_pandas(df, npartitions=len(df) / 100)
# PARQUET wants columns to be stn
df.columns = df.columns.astype(str)
df.to_parquet(self.args.full_storage_path + '/' + trained_model.name + '_relation_embeddings')
else:
df.to_csv(self.args.full_storage_path + '/' + trained_model.name + '_entity_embeddings.csv', )
except KeyError or AttributeError as e:
print('Exception occurred at saving entity embeddings.Computation will continue')
print(e)
else:
print('There is not enough memory to store embeddings separately.')

def get_batch_1_to_N(self, input_vocab, triples, idx, output_dim) -> Tuple[np.array, torch.FloatTensor]:
""" A mini-batch for training on multi-labels (x,y) -> [0.,0.,0.,----, 1.,1,]
@@ -244,6 +247,14 @@ def get_batch_1_to_N(self, input_vocab, triples, idx, output_dim) -> Tuple[np.ar
targets[idx, input_vocab[pair]] = 1
return np.array(batch), torch.FloatTensor(targets)

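A toy illustration of the multi-label target built above: for a given (head, relation) pair, every known tail index receives a 1 (the vocabulary and dimensions are made up for the example):

```python
import numpy as np
import torch

# Hypothetical vocabulary: (head, relation) -> list of tail entity indices.
input_vocab = {('paris', 'locatedIn'): [2, 4]}
output_dim = 5  # total number of entities in this toy setting

targets = np.zeros((1, output_dim))
targets[0, input_vocab[('paris', 'locatedIn')]] = 1  # mark every known tail
print(torch.FloatTensor(targets))  # tensor([[0., 0., 1., 0., 1.]])
```
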
@staticmethod
def model_fitting(trainer, model, train_dataloaders) -> None:
print(model)
print(model.summarize())
print("Model fitting...")
trainer.fit(model, train_dataloaders=train_dataloaders)
print("Done!")

def training_kvsall(self):
"""
Train models with KvsAll or NegativeSampling
@@ -263,15 +274,14 @@ def training_kvsall(self):
batch_size=self.args.batch_size,
num_workers=self.args.num_processes,
label_smoothing_rate=self.args.label_smoothing_rate)
# 5. Train model
self.trainer.fit(model, train_dataloaders=dataset.train_dataloader())

# 3. Train model.
self.model_fitting(trainer=self.trainer, model=model, train_dataloaders=dataset.train_dataloader())
# 4. Test model on the training dataset if it is needed.
if self.args.eval_on_train:
res = self.evaluate_lp_k_vs_all(model, self.dataset.train_set,
f'Evaluate {model.name} on Train set', form_of_labelling)
self.report['Train'] = res

# 6. Test model on validation and test sets if possible.
# 5. Test model on the validation and test dataset if it is needed.
if self.args.eval:
if len(self.dataset.valid_set) > 0:
res = self.evaluate_lp_k_vs_all(model, self.dataset.valid_set,
@@ -289,7 +299,6 @@ def training_1vsall(self):
model, form_of_labelling = select_model(self.args)
print(f'1vsAll training starts: {model.name}')
form_of_labelling = '1VsAll'

# 2. Create training data.
dataset = StandardDataModule(train_set_idx=self.dataset.train_set,
valid_set_idx=self.dataset.valid_set,
@@ -301,27 +310,23 @@ def training_1vsall(self):
batch_size=self.args.batch_size,
num_workers=self.args.num_processes
)

# 3. Display the selected model's architecture.
if self.args.label_relaxation_rate:
model.loss=LabelRelaxationLoss(alpha=self.args.label_relaxation_rate)
#model.loss=LabelSmoothingLossCanonical()
model.loss = LabelRelaxationLoss(alpha=self.args.label_relaxation_rate)
# model.loss=LabelSmoothingLossCanonical()

elif self.args.label_smoothing_rate:
model.loss = nn.CrossEntropyLoss(label_smoothing=self.args.label_smoothing_rate)
else:
model.loss = nn.CrossEntropyLoss()

print(model)
print(model.loss)
# 5. Train model
self.trainer.fit(model, train_dataloaders=dataset.train_dataloader())
# 3. Train model
self.model_fitting(trainer=self.trainer, model=model, train_dataloaders=dataset.train_dataloader())
# 4. Test model on the training dataset if it is needed.
if self.args.eval_on_train:
res = self.evaluate_lp_k_vs_all(model, self.dataset.train_set,
f'Evaluate {model.name} on train set', form_of_labelling)
self.report['Train'] = res

# 6. Test model on validation and test sets if possible.
# 5. Test model on the validation and test dataset if it is needed.
if self.args.eval:
if len(self.dataset.valid_set) > 0:
res = self.evaluate_lp_k_vs_all(model, self.dataset.valid_set,
@@ -353,15 +358,13 @@ def training_negative_sampling(self) -> pl.LightningModule:
batch_size=self.args.batch_size,
num_workers=self.args.num_processes
)
print('Done!\n')
print(model)
print('Fitting the model...')
self.trainer.fit(model, train_dataloaders=dataset.train_dataloader())
print('Done!\n')
# 3. Train model
self.model_fitting(trainer=self.trainer, model=model, train_dataloaders=dataset.train_dataloader())
# 4. Test model on the training dataset if it is needed.
if self.args.eval_on_train:
res = self.evaluate_lp(model, self.dataset.train_set, f'Evaluate {model.name} on Train set')
self.report['Train'] = res

# 5. Test model on the validation and test dataset if it is needed.
if self.args.eval:
if len(self.dataset.valid_set) > 0:
self.report['Val'] = self.evaluate_lp(model, self.dataset.valid_set, 'Evaluation of Validation set')
@@ -606,8 +609,8 @@ def k_fold_cross_validation(self) -> pl.LightningModule:
batch_size=self.args.batch_size,
num_workers=self.args.num_processes
)
# 5. Train model
trainer.fit(model, train_dataloaders=dataset.train_dataloader())
# 3. Train model
self.model_fitting(trainer=trainer, model=model, train_dataloaders=dataset.train_dataloader())

# 6. Test model on validation and test sets if possible.
res = self.evaluate_lp_k_vs_all(model, test_set_for_i_th_fold, form_of_labelling=form_of_labelling)
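
For reference, a minimal sketch of the fold split this routine iterates over, assuming scikit-learn's KFold over an integer-encoded triple array (variable names and the number of splits are illustrative):

```python
import numpy as np
from sklearn.model_selection import KFold

train_set = np.random.randint(0, 100, size=(1000, 3))  # toy (head, relation, tail) index triples
kf = KFold(n_splits=10, shuffle=True, random_state=1)
for train_index, test_index in kf.split(train_set):
    train_fold, test_fold = train_set[train_index], train_set[test_index]
    # A model is fitted on train_fold and evaluated on test_fold,
    # mirroring the loop body of k_fold_cross_validation above.
```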
18 changes: 16 additions & 2 deletions core/models/base_model.py
@@ -1,5 +1,6 @@
import pytorch_lightning as pl
import torch
from pytorch_lightning.utilities.types import TRAIN_DATALOADERS, EVAL_DATALOADERS
from torch import nn
from torch.nn import functional as F
from torchmetrics import Accuracy as accuracy
@@ -8,6 +9,7 @@


class BaseKGE(pl.LightningModule):

def __init__(self, learning_rate=.1):
super().__init__()
self.name = 'Not init'
@@ -38,11 +40,11 @@ def forward(self, x):

def training_step(self, batch, batch_idx):
x_batch, y_batch = batch
pred_batch=self.forward(x_batch)
pred_batch = self.forward(x_batch)
train_loss = self.loss_function(pred_batch, y_batch)
return {'loss': train_loss}

#def training_epoch_end(self, outputs) -> None:
# def training_epoch_end(self, outputs) -> None:
# """ DBpedia debugging removed."""
# #avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
# #self.log('avg_loss', avg_loss, on_epoch=False, prog_bar=True)
@@ -78,3 +80,15 @@ def test_step(self, batch, batch_idx):
def test_epoch_end(self, outputs: List[Any]):
avg_test_accuracy = torch.stack([x['test_accuracy'] for x in outputs]).mean()
self.log('avg_test_accuracy', avg_test_accuracy, on_epoch=True, prog_bar=True)

def test_dataloader(self) -> EVAL_DATALOADERS:
pass

def val_dataloader(self) -> EVAL_DATALOADERS:
pass

def predict_dataloader(self) -> EVAL_DATALOADERS:
pass

def train_dataloader(self) -> TRAIN_DATALOADERS:
pass
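
These stubs keep the LightningModule dataloader hooks satisfied while data loading stays in StandardDataModule. A hypothetical subclass that owns its own training data could override train_dataloader as sketched below; the class and variable names are made up for the example and are not part of the repository:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

class ToyKGE(BaseKGE):
    """Illustrative subclass only, not part of the repository."""

    def __init__(self, triples: torch.LongTensor):
        super().__init__()
        self.triples = triples  # shape (num_triples, 3): (head, relation, tail) indices

    def train_dataloader(self) -> TRAIN_DATALOADERS:
        labels = torch.ones(len(self.triples))  # toy positive-only labels
        return DataLoader(TensorDataset(self.triples, labels), batch_size=32, shuffle=True)
```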