Merge pull request #137 from dice-group/develop
Develop
Demirrr authored Aug 10, 2023
2 parents 249210d + f2084df commit d7f9905
Showing 29 changed files with 779 additions and 436 deletions.
95 changes: 70 additions & 25 deletions README.md
@@ -29,29 +29,34 @@ Deploy a pre-trained embedding model without writing a single line of code.
# Installation
<details><summary> Details </summary>

```bash
git clone https://github.com/dice-group/dice-embeddings.git
conda create -n dice python=3.10 --no-default-packages && conda activate dice
pip3 install -r requirements.txt
```

or

```bash
pip install dicee
```

or

```bash
pip3 install "torch>=2.0.0"
pip3 install "pandas>=1.5.1"
pip3 install "polars>=0.16.14"
pip3 install "scikit-learn>=1.2.2"
pip3 install "pyarrow>=11.0.0"
pip3 install "pytorch-lightning==1.6.4"
pip3 install "pykeen==1.10.1"
pip3 install "zstandard>=0.21.0"
pip3 install "pytest>=7.2.2"
pip3 install "psutil>=5.9.4"
pip3 install "ruff>=0.0.284"
pip3 install "gradio>=3.23.0"
pip3 install "rdflib>=7.0.0"
```

To test the installation:
```bash
wget https://hobbitdata.informatik.uni-leipzig.de/KG/KGs.zip
unzip KGs.zip
pytest -p no:warnings -x # it takes circa 15 minutes
```
@@ -66,19 +71,8 @@
```bash
pyreverse dicee/trainer && dot -Tpng -x classes.dot -o trainer.png && eog trainer.png
```
</details>


# Knowledge Graph Embedding Models
<details> <summary> To see available Models</summary>

1. TransE, DistMult, ComplEx, ConEx, QMult, OMult, ConvO, ConvQ, Keci
2. All 44 models available in https://github.com/pykeen/pykeen#models
@@ -87,17 +81,54 @@ docker run --rm -v ~/.local/share/dicee/KGs:/dicee/KGs dice-embeddings ./main.py
</details>
# How to Train
<details> <summary> To see examples</summary>

Train a KGE model and evaluate it on the train, validation, and test sets of the UMLS benchmark dataset.
```bash
python main.py --path_dataset_folder "KGs/UMLS" --model Keci --eval_model "train_val_test"
```
where the data is in the following form:
```bash
$ head -3 KGs/UMLS/train.txt
acquired_abnormality location_of experimental_model_of_disease
anatomical_abnormality manifestation_of physiologic_function
alga isa entity
```
Models can be easily trained in a single-node multi-GPU setting:
```bash
python main.py --accelerator "gpu" --strategy "ddp" --path_dataset_folder "KGs/UMLS" --model Keci --eval_model "train_val_test"
```
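
The same run can also be launched from Python. Below is a minimal sketch, not the documented API: it assumes the `Namespace` defaults shown in the `dicee/config.py` diff further down, that the CLI flags map onto attributes of the same name, and uses `Execute`, which this commit's `dicee/__init__.py` exports.
```python
from dicee import Execute
from dicee.config import Namespace

args = Namespace()                       # defaults from dicee/config.py
args.path_dataset_folder = "KGs/UMLS"    # folder with train/valid/test splits
args.model = "Keci"
args.eval_model = "train_val_test"       # assumed to mirror --eval_model
report = Execute(args).start()           # Execute.start() returns a dict
print(report)
```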

Train a KGE model by providing the path of a single file and store all parameters under a newly created directory
called `KeciFamilyRun`.
```bash
python main.py --path_single_kg "KGs/Family/train.txt" --model Keci --path_to_store_single_run KeciFamilyRun
```
where the data is in the following form:
```bash
$ head -3 KGs/Family/train.txt
_:1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Ontology> .
<http://www.benchmark.org/family#hasChild> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#ObjectProperty> .
<http://www.benchmark.org/family#hasParent> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#ObjectProperty> .
```
**Apart from N-Triples and standard link prediction dataset formats, we also support ["owl", "nt", "turtle", "rdf/xml", "n3"]**.
Moreover, a KGE model can also be trained by providing **an endpoint of a triple store**.
```bash
python main.py --sparql_endpoint "http://localhost:3030/mutagenesis/" --model Keci
```
For more, please refer to `examples`.
</details>


# How to Deploy
<details> <summary> To see a single line of code</summary>

```python
from dicee import KGE
KGE(path='...').deploy(share=True, top_k=10)
```
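
Here `path` is expected to point at the output directory of a previous run (e.g. the `KeciFamilyRun` directory created above). Since `gradio` is among the dependencies and a web interface is shown below, `share=True` presumably follows the usual Gradio convention of also exposing a temporary public link, and `top_k` likely bounds the number of returned predictions.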
</details>

<details> <summary> To see the interface of the webservice</summary>
<img src="dicee/lp.png" alt="Link prediction web interface">
</details>
@@ -172,6 +203,20 @@ Please contact: ```[email protected] ``` or ```[email protected] ``` , i
- [FB15K-237 ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/FB15K-237.zip)
- [WN18RR ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/WN18RR.zip)
- For more, please look at [Hobbit Data](https://hobbitdata.informatik.uni-leipzig.de/KGE/)

## Docker
<details> <summary> Details</summary>
To build the Docker image:
```bash
docker build -t dice-embeddings .
```

To test the Docker image:
```bash
docker run --rm -v ~/.local/share/dicee/KGs:/dicee/KGs dice-embeddings ./main.py --model AConEx --embedding_dim 16
```
</details>

### Documentation
The documents folder explains many details about knowledge graphs, knowledge graph embeddings, training strategies, and other background knowledge.
We continuously work on documenting each step to improve the readability of our code.
6 changes: 3 additions & 3 deletions analyse_experiments.py
@@ -131,8 +131,8 @@ def to_df(self):
test_mrr=self.test_mrr, test_h1=self.test_h1,
test_h3=self.test_h3, test_h10=self.test_h10,
runtime=self.runtime,
- params=self.num_params
- #callbacks=self.callbacks,
+ params=self.num_params,
+ callbacks=self.callbacks,
#normalization=self.normalization,
#embeddingdim=self.embedding_dim
)
@@ -148,6 +148,6 @@ def to_df(self):
df = counter.to_df()
pd.set_option("display.precision", 3)
#print(df)
- #print(df.to_latex(index=False,float_format="%.3f"))
+ print(df.to_latex(index=False,float_format="%.3f"))

print(df.to_markdown(index=False))
3 changes: 2 additions & 1 deletion dicee/__init__.py
@@ -3,4 +3,5 @@
from .trainer import DICE_Trainer # noqa
from .knowledge_graph_embeddings import KGE # noqa
from .executer import Execute # noqa
- __version__ = '0.0.4'
+ from .dataset_classes import * # noqa
+ __version__ = '0.0.5'
4 changes: 2 additions & 2 deletions dicee/abstracts.py
@@ -29,8 +29,8 @@ def __init__(self, args, callbacks):
self.callbacks = callbacks
self.is_global_zero = True
# Set True to use Model summary callback of pl.
- torch.manual_seed(self.attributes.seed_for_computation)
- torch.cuda.manual_seed_all(self.attributes.seed_for_computation)
+ torch.manual_seed(self.attributes.random_seed)
+ torch.cuda.manual_seed_all(self.attributes.random_seed)

def on_fit_start(self, *args, **kwargs):
"""
25 changes: 21 additions & 4 deletions dicee/callbacks.py
@@ -222,16 +222,14 @@ def on_fit_end(self, trainer, model):
"""

def on_train_epoch_end(self, trainer, model):
- self.epoch_counter+=1
+ self.epoch_counter += 1
if self.epoch_counter % self.epoch_ratio == 0:
model.eval()
report = trainer.evaluator.eval(dataset=trainer.dataset, trained_model=model,
form_of_labelling=trainer.form_of_labelling, during_training=True)
model.train()
self.reports.append(report)



def on_train_batch_end(self, *args, **kwargs):
return

@@ -296,11 +294,30 @@ def __init__(self, std: float = 0.1, epoch_ratio: int = None):
self.epoch_ratio = epoch_ratio if epoch_ratio is not None else 1
self.epoch_counter = 0

- def on_train_epoch_end(self, trainer, model):
+ def on_train_epoch_start(self, trainer, model):
if self.epoch_counter % self.epoch_ratio == 0:
with torch.no_grad():
# Access the parameters
for param in model.parameters():
noise_mat = torch.normal(mean=0, std=self.std, size=param.shape, device=model.device)
param.add_(noise_mat)
self.epoch_counter += 1


class RN(AbstractCallback):
""" Adding Uniform at Random Noise into Inputs/Parameters """

def __init__(self, std: float = 0.1, epoch_ratio: int = None):
super().__init__()
self.std = std
self.epoch_ratio = epoch_ratio if epoch_ratio is not None else 1
self.epoch_counter = 0

def on_train_epoch_start(self, trainer, model):
if self.epoch_counter % self.epoch_ratio == 0:
with torch.no_grad():
# Access the parameters
for param in model.parameters():
noise_mat = torch.rand(size=param.shape) * self.std
param.add_(noise_mat)
self.epoch_counter += 1
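
For context, here is a self-contained sketch of the perturbation both callbacks apply at an epoch boundary, run on a stand-in model rather than a real KGE model; it mirrors the `with torch.no_grad()` loop in the diff above.
```python
import torch

# Stand-in for a trained KGE model; any nn.Module behaves the same way.
model = torch.nn.Linear(4, 2)
std = 0.1

with torch.no_grad():  # perturb weights without recording gradients
    for param in model.parameters():
        # GN: add zero-mean Gaussian noise, as in the callback above
        param.add_(torch.normal(mean=0.0, std=std, size=param.shape))
        # RN would instead add uniform noise in [0, std):
        # param.add_(torch.rand(param.shape) * std)
```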
17 changes: 14 additions & 3 deletions dicee/config.py
@@ -16,12 +16,22 @@ def __call__(self, parser, namespace, values, option_string=None):
class Namespace(argparse.Namespace):
def __init__(self, **kwargs):
super().__init__(**kwargs)
"The path of a folder containing train.txt, and/or valid.txt and/or test.txt"
self.path_dataset_folder: str = 'KGs/UMLS'
"A flag for saving embeddings in csv file."
self.save_embeddings_as_csv: bool = False
"A directory named with time of execution under --storage_path that contains related data about embeddings."
self.storage_path: str = 'Experiments'
- self.absolute_path_to_store: str = None
- self.absolute_path_dataset = None
"A single directory created that contains related data about embeddings."
self.path_to_store_single_run: str = None
"Path of a file corresponding to the input knowledge graph"
self.path_single_kg = None
"An endpoint of a triple store."
self.sparql_endpoint = None
"KGE model"
self.model: str = "Keci"
" The ratio of added random triples into training dataset"
self.add_noise_rate: float = None
self.p: int = 0
self.q: int = 1
self.optim: str = 'Adam'
Expand Down Expand Up @@ -50,7 +60,8 @@ def __init__(self, **kwargs):
self.kernel_size: int = 3
self.num_of_output_channels: int = 32
self.num_core: int = 0
- self.seed_for_computation: int = 0
+ "Random Seed"
+ self.random_seed: int = 0
self.sample_triples_ratio = None
self.read_only_few = None
self.pykeen_model_kwargs: ParseDict = dict()
32 changes: 30 additions & 2 deletions dicee/dataset_classes.py
@@ -254,7 +254,7 @@ def __init__(self, train_set: np.ndarray, num_entities, num_relations, neg_sample_ratio,
label_smoothing_rate: float = 0.0):
super().__init__()
assert isinstance(train_set, np.ndarray)
- assert isinstance(neg_sample_ratio,int)
+ assert isinstance(neg_sample_ratio, int)
self.train_data = train_set
self.num_entities = num_entities
self.num_relations = num_relations
@@ -314,6 +314,35 @@ def __getitem__(self, idx):
return x, y_idx, y_vec


class NegSampleDataset(torch.utils.data.Dataset):
def __init__(self, train_set: np.ndarray, num_entities: int, num_relations: int, neg_sample_ratio: int = 1):
assert isinstance(train_set, np.ndarray)
# https://pytorch.org/docs/stable/data.html#multi-process-data-loading
# TLDL; replace Python objects with non-refcounted representations such as Pandas, Numpy or PyArrow objects
self.neg_sample_ratio = torch.tensor(
neg_sample_ratio) # 0 Implies that we do not add negative samples. This is needed during testing and validation
self.train_set = torch.from_numpy(train_set).unsqueeze(1)
# assert num_entities >= max(self.train_set[:, 0]) and num_entities >= max(self.train_set[:, 2])
self.length = len(self.train_set)
self.num_entities = torch.tensor(num_entities)
self.num_relations = torch.tensor(num_relations)

def __len__(self):
return self.length

def __getitem__(self, idx):
# Generate negative sample

triple = self.train_set[idx]

corr_entities = torch.randint(0, high=self.num_entities, size=(1,))
negative_triple = torch.cat((triple[:, 0], triple[:, 1], corr_entities), dim=0).unsqueeze(0)

x = torch.cat((triple, negative_triple), dim=0)
y = torch.tensor([1.0, 0.0])
return x, y


class TriplePredictionDataset(torch.utils.data.Dataset):
"""
Triple Dataset
@@ -426,7 +455,6 @@ def collate_fn(self, batch: List[torch.Tensor]):
return x, label



class CVDataModule(pl.LightningDataModule):
"""
Create a Dataset for cross validation
9 changes: 9 additions & 0 deletions dicee/evaluator.py
@@ -2,6 +2,7 @@
import numpy as np
import json
from .static_funcs import pickle
from .static_funcs_training import evaluate_lp


class Evaluator:
@@ -263,6 +264,14 @@ def evaluate_lp_k_vs_all(self, model, triple_idx, info=None, form_of_labelling=None):

def evaluate_lp(self, model, triple_idx, info):
"""
"""
# @TODO: Document this method
return evaluate_lp(model, triple_idx, num_entities=self.num_entities,
er_vocab=self.er_vocab, re_vocab=self.re_vocab, info=info)

def dept_evaluate_lp(self, model, triple_idx, info):
"""
Evaluate model in a standard link prediction task
for each triple
6 changes: 2 additions & 4 deletions dicee/executer.py
@@ -36,7 +36,7 @@ def __init__(self, args, continuous_training=False):
# (1) Process arguments and sanity checking.
self.args = preprocesses_input_args(args)
# (2) Ensure reproducibility.
- seed_everything(args.seed_for_computation, workers=True)
+ seed_everything(args.random_seed, workers=True)
# (3) Set the continual training flag
self.is_continual_training = continuous_training
# (4) Create an experiment folder or use the previous one
@@ -198,9 +198,7 @@ def start(self) -> dict:
self.trainer = DICE_Trainer(args=self.args,
is_continual_training=self.is_continual_training,
storage_path=self.storage_path,
- evaluator=self.evaluator,
- dataset=self.dataset # only used for Pykeen's models
- )
+ evaluator=self.evaluator)
# (4) Start the training
self.trained_model, form_of_labelling = self.trainer.start(dataset=self.dataset)
return self.end(form_of_labelling)
