diff --git a/.github/workflows/github-actions-python-package.yml b/.github/workflows/github-actions-python-package.yml index 329d0b16..ff653b54 100644 --- a/.github/workflows/github-actions-python-package.yml +++ b/.github/workflows/github-actions-python-package.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.9.17"] steps: - uses: actions/checkout@v3 @@ -19,13 +19,12 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ruff pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install -r requirements.txt - name: Lint with ruff run: | - ruff --format=github --select=F63,F7,F82 --target-version=py310 dicee/. + ruff --format=github --select=F63,F7,F82 --target-version=py39 dicee/. - name: Test with pytest run: | - wget https://hobbitdata.informatik.uni-leipzig.de/KG/KGs.zip + wget https://hobbitdata.informatik.uni-leipzig.de/KG/KGs.zip --no-check-certificate unzip KGs.zip pytest -p no:warnings -x diff --git a/.github/workflows/sphinx.yml b/.github/workflows/sphinx.yml index d6b5f213..f5c23eb3 100644 --- a/.github/workflows/sphinx.yml +++ b/.github/workflows/sphinx.yml @@ -20,18 +20,12 @@ jobs: uses: actions/setup-python@v2 with: python-version: "3.10" - # Runs a single command using the runners shell - - name: Run a one-line script - run: echo Hello, world! - + - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r requirements.txt - - name: Build HTML + - name: Build HTML and import run: | - sphinx-apidoc -o docs dicee/ && make -C docs/ html - - name: Run ghp-import - run: | - mv docs/_build/html docs/ && ghp-import -n -p -f docs/html \ No newline at end of file + sphinx-apidoc -o docs dicee/ && make -C docs/ html && mv docs/_build/html docs/ && ghp-import -n -p -f docs/html \ No newline at end of file diff --git a/README.md b/README.md index 8fdbd888..81c9afb5 100644 --- a/README.md +++ b/README.md @@ -34,33 +34,16 @@ Deploy a pre-trained embedding model without writing a single line of code. ``` bash git clone https://github.com/dice-group/dice-embeddings.git -conda create -n dice python=3.10 --no-default-packages && conda activate dice +conda create -n dice python=3.9 --no-default-packages && conda activate dice pip3 install -r requirements.txt ``` or ```bash pip install dicee ``` -or -```bash -pip3 install "torch>=2.0.0" -pip3 install "pandas>=1.5.1" -pip3 install "polars>=0.16.14" -pip3 install "scikit-learn>=1.2.2" -pip3 install "pyarrow>=11.0.0" -pip3 install "pytorch-lightning==1.6.4" -pip3 install "pykeen==1.10.1" -pip3 install "zstandard>=0.21.0" -pip3 install "pytest>=7.2.2" -pip3 install "psutil>=5.9.4" -pip3 install "ruff>=0.0.284" -pip3 install "gradio>=3.23.0" -pip3 install "rdflib>=7.0.0" -``` - To test the Installation ```bash -wget https://hobbitdata.informatik.uni-leipzig.de/KG/KGs.zip +wget https://hobbitdata.informatik.uni-leipzig.de/KG/KGs.zip --no-check-certificate unzip KGs.zip pytest -p no:warnings -x # it takes circa 15 minutes pytest -p no:warnings --lf # run only the last failed test diff --git a/analyse_experiments.py b/analyse_experiments.py index ed7e5e71..e8e23c4c 100644 --- a/analyse_experiments.py +++ b/analyse_experiments.py @@ -2,59 +2,18 @@ import json import pandas as pd import sys +import argparse -# print('Number of arguments:', len(sys.argv), 'arguments.') -# print('Argument List:', str(sys.argv)) - - -if len(sys.argv) > 1: - input_str_path = sys.argv[1] -else: - # (1) Give a path of Experiments folder - input_str_path = 'Experiments/' - -# (2) Get all subfolders -sub_folder_str_paths = os.listdir(input_str_path) - -results = dict() - -experiments = [] -for path in sub_folder_str_paths: - try: - with open(input_str_path + path + '/configuration.json', 'r') as f: - config = json.load(f) - config = {i: config[i] for i in - ['model', 'full_storage_path', 'embedding_dim', - 'normalization', 'num_epochs', 'batch_size', 'lr', - 'callbacks', - 'scoring_technique', - 'path_dataset_folder', 'p', 'q']} - except FileNotFoundError: - print('Exception occured at reading config') - continue - - try: - with open(input_str_path + path + '/report.json', 'r') as f: - report = json.load(f) - report = {i: report[i] for i in ['Runtime','NumParam']} - except FileNotFoundError: - print('Exception occured at reading report') - continue - - try: - with open(input_str_path + path + '/eval_report.json', 'r') as f: - eval_report = json.load(f) - # print(eval_report) - # exit(1) - # eval_report = {i: str(eval_report[i]) for i in ['Train', 'Val', 'Test']} - except FileNotFoundError: - print('Exception occured at reading eval_report') - continue - - config.update(eval_report) - config.update(report) - experiments.append(config) +def get_default_arguments(description=None): + parser = argparse.ArgumentParser(add_help=False) + # Default Trainer param https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#methods + # Data related arguments + parser.add_argument("--dir", type=str, default="KINSHIP-DistMult-RN/", + help="") + if description is None: + return parser.parse_args() + return parser.parse_args(description) # need a class to hold all params @@ -63,7 +22,7 @@ def __init__(self): self.model_name = [] self.callbacks = [] self.embedding_dim = [] - self.num_params=[] + self.num_params = [] self.num_epochs = [] self.batch_size = [] self.lr = [] @@ -87,6 +46,7 @@ def __init__(self): self.runtime = [] self.normalization = [] + self.scoring_technique = [] def save_experiment(self, x): self.model_name.append(x['model']) @@ -100,6 +60,7 @@ def save_experiment(self, x): self.num_params.append(x['NumParam']) self.normalization.append(x['normalization']) + self.scoring_technique.append(x['scoring_technique']) self.callbacks.append(x['callbacks']) self.train_mrr.append(x['Train']['MRR']) @@ -122,10 +83,10 @@ def save_experiment(self, x): def to_df(self): return pd.DataFrame( - dict(model_name=self.model_name, #pq=self.pq, path_dataset_folder=self.path_dataset_folder, + dict(model_name=self.model_name, # pq=self.pq, path_dataset_folder=self.path_dataset_folder, train_mrr=self.train_mrr, train_h1=self.train_h1, train_h3=self.train_h3, train_h10=self.train_h10, - #full_storage_path=self.full_storage_path, + # full_storage_path=self.full_storage_path, val_mrr=self.val_mrr, val_h1=self.val_h1, val_h3=self.val_h3, val_h10=self.val_h10, test_mrr=self.test_mrr, test_h1=self.test_h1, @@ -133,21 +94,51 @@ def to_df(self): runtime=self.runtime, params=self.num_params, callbacks=self.callbacks, - #normalization=self.normalization, - #embeddingdim=self.embedding_dim + # normalization=self.normalization, + # embeddingdim=self.embedding_dim + scoring_technique=self.scoring_technique ) ) -counter = Experiment() +def analyse(args): + # (2) Get all subfolders + sub_folder_str_paths = os.listdir(args.dir) + experiments = [] + for path in sub_folder_str_paths: + full_path=args.dir +"/"+path + with open(f'{full_path}/configuration.json', 'r') as f: + config = json.load(f) + config = {i: config[i] for i in + ['model', 'full_storage_path', 'embedding_dim', + 'normalization', 'num_epochs', 'batch_size', 'lr', + 'callbacks', + 'scoring_technique', + "scoring_technique", + 'path_dataset_folder', 'p', 'q']} + with open(f'{full_path}/report.json', 'r') as f: + report = json.load(f) + report = {i: report[i] for i in ['Runtime', 'NumParam']} + with open(f'{full_path}/eval_report.json', 'r') as f: + eval_report = json.load(f) + + config.update(eval_report) + config.update(report) + experiments.append(config) + + counter = Experiment() -for i in experiments: - counter.save_experiment(i) + for i in experiments: + counter.save_experiment(i) + df = counter.to_df() + df.sort_values(by=['test_mrr'], ascending=False, inplace=True) + pd.set_option("display.precision", 3) + # print(df) + print(df.to_latex(index=False, float_format="%.3f")) + #print(df.to_markdown(index=False)) + df.to_csv(path_or_buf=args.dir+'/summary.csv') -df = counter.to_df() -pd.set_option("display.precision", 3) -#print(df) -print(df.to_latex(index=False,float_format="%.3f")) -print(df.to_markdown(index=False)) +if __name__ == '__main__': + analyse(get_default_arguments()) diff --git a/dicee/callbacks.py b/dicee/callbacks.py index 6912105d..861692df 100644 --- a/dicee/callbacks.py +++ b/dicee/callbacks.py @@ -271,8 +271,8 @@ def get_kronecker_triple_representation(self, indexed_triple: torch.LongTensor): tail_ent_kron_emb = self.batch_kronecker_product(*torch.hsplit(tail_ent_emb, 2)) return torch.cat((head_ent_emb, head_ent_kron_emb), dim=1), \ - torch.cat((rel_ent_emb, rel_ent_kron_emb), dim=1), \ - torch.cat((tail_ent_emb, tail_ent_kron_emb), dim=1) + torch.cat((rel_ent_emb, rel_ent_kron_emb), dim=1), \ + torch.cat((tail_ent_emb, tail_ent_kron_emb), dim=1) def on_fit_start(self, trainer, model): if isinstance(model.normalize_head_entity_embeddings, dicee.models.base_model.IdentityClass): @@ -283,41 +283,107 @@ def on_fit_start(self, trainer, model): raise NotImplementedError('Normalizer should be reinitialized') -class GN(AbstractCallback): - ''' - Adding Gaussian Noise into Inputs/Parameters - ''' +class Perturb(AbstractCallback): + """ A callback for a three-Level Perturbation - def __init__(self, std: float = 0.1, epoch_ratio: int = None): - super().__init__() - self.std = std - self.epoch_ratio = epoch_ratio if epoch_ratio is not None else 1 - self.epoch_counter = 0 - - def on_train_epoch_start(self, trainer, model): - if self.epoch_counter % self.epoch_ratio == 0: - with torch.no_grad(): - # Access the parameters - for param in model.parameters(): - noise_mat = torch.normal(mean=0, std=self.std, size=param.shape, device=model.device) - param.add_(noise_mat) - self.epoch_counter += 1 + Input Perturbation: During training an input x is perturbed by randomly replacing its element. + In the context of knowledge graph embedding models, x can denote a triple, a tuple of an entity and a relation, + or a tuple of two entities. + A perturbation means that a component of x is randomly replaced by an entity or a relation. + Parameter Perturbation: -class RN(AbstractCallback): - """ Adding Uniform at Random Noise into Inputs/Parameters """ + Output Perturbation: + """ - def __init__(self, std: float = 0.1, epoch_ratio: int = None): + def __init__(self, level: str = "input", ratio: float = 0.0, method: str = None, scaler: float = None, + frequency=None): + """ + level in {input, param, output} + ratio:float btw [0,1] a percentage of mini-batch data point to be perturbed. + method = ? + """ super().__init__() - self.std = std - self.epoch_ratio = epoch_ratio if epoch_ratio is not None else 1 - self.epoch_counter = 0 - def on_train_epoch_start(self, trainer, model): - if self.epoch_counter % self.epoch_ratio == 0: - with torch.no_grad(): - # Access the parameters - for param in model.parameters(): - noise_mat = torch.rand(size=param.shape) * self.std - param.add_(noise_mat) - self.epoch_counter += 1 + assert level in {"input", "param", "out"} + assert ratio >= 0.0 + self.level = level + self.ratio = ratio + self.method = method + self.scaler = scaler + self.frequency = frequency # per epoch, per mini-batch ? + + def on_train_batch_start(self, trainer, model, batch, batch_idx): + # Modifications should be in-place + x, y = batch + n, _ = x.shape + num_of_perturbed_data = int(n * self.ratio) + if num_of_perturbed_data ==0: + return None + assert n > 0 + device = x.get_device() + if device == -1: + device = "cpu" + # Sample random integers from 0 to n without replacement and take k of tem + random_indices = torch.randperm(n, device=device)[:num_of_perturbed_data] + if self.level == "input": + if torch.rand(1) > 0.5: + # Perturb input via heads + perturbation = torch.randint(low=0, high=model.num_entities, size=(num_of_perturbed_data,), + device=device) + x[random_indices] = torch.column_stack( + (perturbation, x[:, 1][random_indices])) + else: + # Perturb input via relations + perturbation = torch.randint(low=0, high=model.num_relations, size=(num_of_perturbed_data,), + device=device) + x[random_indices] = torch.column_stack( + (x[:, 0][random_indices], perturbation)) + elif self.level == "param": + h, r = torch.hsplit(x, 2) + + if self.method == "GN": + if torch.rand(1) > 0.0: + h_selected = h[random_indices] + with torch.no_grad(): + model.entity_embeddings.weight[h_selected] += torch.normal(mean=0, std=self.scaler, + size=model.entity_embeddings.weight[ + h_selected].shape, + device=model.device) + else: + r_selected = r[random_indices] + with (torch.no_grad()): + model.relation_embeddings.weight[r_selected] += torch.normal(mean=0, std=self.scaler, + size= + model.entity_embeddings.weight[ + r_selected].shape, + device=model.device) + elif self.method == "RN": + if torch.rand(1) > 0.0: + h_selected = h[random_indices] + with torch.no_grad(): + model.entity_embeddings.weight[h_selected] += torch.rand( + size=model.entity_embeddings.weight[h_selected].shape, device=model.device) * self.scaler + else: + r_selected = r[random_indices] + with torch.no_grad(): + model.relation_embeddings.weight[r_selected] += torch.rand( + size=model.entity_embeddings.weight[r_selected].shape, device=model.device) * self.scaler + else: + raise RuntimeError(f"--method is given as {self.method}!") + elif self.level == "out": + + if self.method == "RN": + # Soft Perturb ? + perturb = torch.rand(1, device=model.device) * self.scaler + # https://pytorch.org/docs/stable/generated/torch.where.html + # 1.0 => 1.0 - perturb + # 0.0 => perturb + batch[1][random_indices] = torch.where(batch[1][random_indices] == 1.0, 1.0 - perturb, perturb) + elif self.method=="Hard": + # Hard flip all + batch[1][random_indices] = torch.where(batch[1][random_indices] == 1.0, 0.0, 1.0) + else: + raise NotImplementedError(f"{self.level}") + else: + raise RuntimeError(f"--level is given as {self.level}!") diff --git a/dicee/config.py b/dicee/config.py index 8f8db6e3..dc51a339 100644 --- a/dicee/config.py +++ b/dicee/config.py @@ -1,17 +1,6 @@ import argparse from .abstracts import AbstractCallback -class ParseDict(argparse.Action): - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, - dict()) # set each name of the attribute to hold the created object(s) as dictionary - for value in values: - key, value = value.split('=') - if value.isdigit(): - getattr(namespace, self.dest)[key] = int(value) - continue - getattr(namespace, self.dest)[key] = value - class Namespace(argparse.Namespace): def __init__(self, **kwargs): @@ -40,13 +29,13 @@ def __init__(self, **kwargs): self.optim: str = 'Adam' "Optimizer" - self.embedding_dim: int = 32 + self.embedding_dim: int = 64 "Size of continuous vector representation of an entity/relation" - self.num_epochs: int = 100 + self.num_epochs: int = 150 "Number of pass over the training data" - self.batch_size: type[int | None] = 1024 + self.batch_size: int = 1024 "Mini-batch size if it is None, an automatic batch finder technique applied" self.lr: float = 0.1 @@ -97,7 +86,6 @@ def __init__(self, **kwargs): self.init_param: str = None """ xavier_normal or None""" - self.gradient_accumulation_steps: int = 0 """ Not tested e""" @@ -122,9 +110,14 @@ def __init__(self, **kwargs): self.random_seed: int = 0 "Random Seed" - self.sample_triples_ratio = None - self.read_only_few = None - self.pykeen_model_kwargs: ParseDict = dict() + self.sample_triples_ratio: float = None + """Read some triples that are uniformly at random sampled. Ratio being between 0 and 1""" + + self.read_only_few: int = None + """Read only first few triples """ + + self.pykeen_model_kwargs = dict() + """Additional keyword arguments for pykeen models""" def __iter__(self): # Iterate diff --git a/dicee/dataset_classes.py b/dicee/dataset_classes.py index 7095b6f2..7372a993 100644 --- a/dicee/dataset_classes.py +++ b/dicee/dataset_classes.py @@ -82,6 +82,12 @@ def construct_dataset(*, train_set: np.ndarray, entity_idxs=entity_to_idx, relation_idxs=relation_to_idx, form=form_of_labelling, label_smoothing_rate=label_smoothing_rate) + elif scoring_technique == 'AllvsAll': + # Multi-label imbalanced. + train_set = AllvsAll(train_set, + entity_idxs=entity_to_idx, + relation_idxs=relation_to_idx, + label_smoothing_rate=label_smoothing_rate) else: raise ValueError(f'Invalid scoring technique : {scoring_technique}') elif form_of_labelling == 'RelationPrediction': @@ -135,37 +141,44 @@ def __getitem__(self, idx): class KvsAll(torch.utils.data.Dataset): - """ - KvsAll a Dataset: + """ Creates a dataset for KvsAll training by inheriting from torch.utils.data.Dataset. + Let D denote a dataset for KvsAll training and be defined as D:= {(x,y)_i}_i ^N, where + x: (h,r) is an unique tuple of an entity h \in E and a relation r \in R that has been seed in the input graph. + y: denotes a multi-label vector \in [0,1]^{|E|} is a binary label. \forall y_i =1 s.t. (h r E_i) \in KG - D:= {(x,y)_i}_i ^N, where - . x:(h,r) is a unique h \in E and a relation r \in R and - . y \in [0,1]^{|E|} is a binary label. \forall y_i =1 s.t. (h r E_i) \in KG + .. note:: + TODO - Parameters - ---------- - train_set_idx - Indexed triples for the training. - entity_idxs - mapping. - relation_idxs - mapping. - form - ? - store - ? - label_smoothing_rate - ? + Parameters + ---------- + train_set_idx : numpy.ndarray + n by 3 array representing n triples - # @TODO: one can create h,r where labels are all 0s. + entity_idxs : dictonary + string representation of an entity to its integer id - Returns - ------- - torch.utils.data.Dataset - """ + relation_idxs : dictonary + string representation of a relation to its integer id + + Returns + ------- + self : torch.utils.data.Dataset + + See Also + -------- + + Notes + ----- + + Examples + -------- + >>> a = KvsAll() + >>> a + ? array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + """ def __init__(self, train_set_idx: np.ndarray, entity_idxs, relation_idxs, form, store=None, - label_smoothing_rate=0.0): + label_smoothing_rate: float = 0.0): super().__init__() assert len(train_set_idx) > 0 assert isinstance(train_set_idx, np.ndarray) @@ -221,6 +234,91 @@ def __getitem__(self, idx): return self.train_data[idx], y_vec +class AllvsAll(torch.utils.data.Dataset): + """ Creates a dataset for AllvsAll training by inheriting from torch.utils.data.Dataset. + Let D denote a dataset for AllvsAll training and be defined as D:= {(x,y)_i}_i ^N, where + x: (h,r) is a possible unique tuple of an entity h \in E and a relation r \in R. Hence N = |E| x |R| + y: denotes a multi-label vector \in [0,1]^{|E|} is a binary label. \forall y_i =1 s.t. (h r E_i) \in KG + + .. note:: + AllvsAll extends KvsAll via none existing (h,r). Hence, it adds data points that are labelled without 1s, + only with 0s. + + Parameters + ---------- + train_set_idx : numpy.ndarray + n by 3 array representing n triples + + entity_idxs : dictonary + string representation of an entity to its integer id + + relation_idxs : dictonary + string representation of a relation to its integer id + + Returns + ------- + self : torch.utils.data.Dataset + + See Also + -------- + + Notes + ----- + + Examples + -------- + >>> a = AllvsAll() + >>> a + ? array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + """ + + def __init__(self, train_set_idx: np.ndarray, entity_idxs, relation_idxs, + label_smoothing_rate=0.0): + super().__init__() + assert len(train_set_idx) > 0 + assert isinstance(train_set_idx, np.ndarray) + self.train_data = None + self.train_target = None + self.label_smoothing_rate = torch.tensor(label_smoothing_rate) + self.collate_fn = None + # (1) Create a dictionary of training data pints + # Either from tuple of entities or tuple of an entity and a relation + self.target_dim = len(entity_idxs) + # (h,r) => [t] + store = mapping_from_first_two_cols_to_third(train_set_idx) + print("Number of unique pairs:", len(store)) + for i in range(len(entity_idxs)): + for j in range(len(relation_idxs)): + if store.get((i, j), None) is None: + store[(i, j)] = list() + print("Number of unique augmented pairs:", len(store)) + assert len(store) > 0 + self.train_data = torch.LongTensor(list(store.keys())) + + if sum([len(i) for i in store.values()]) == len(store): + self.train_target = np.array(list(store.values())) + assert isinstance(self.train_target[0], np.ndarray) + else: + self.train_target = list(store.values()) + assert isinstance(self.train_target[0], list) + del store + + def __len__(self): + assert len(self.train_data) == len(self.train_target) + return len(self.train_data) + + def __getitem__(self, idx): + # 1. Initialize a vector of output. + y_vec = torch.zeros(self.target_dim) + existing_indices = self.train_target[idx] + if len(existing_indices) > 0: + y_vec[self.train_target[idx]] = 1.0 + + if self.label_smoothing_rate: + y_vec = y_vec * (1 - self.label_smoothing_rate) + (1 / y_vec.size(0)) + return self.train_data[idx], y_vec + + class KvsSampleDataset(torch.utils.data.Dataset): """ KvsSample a Dataset: @@ -320,7 +418,7 @@ def __init__(self, train_set: np.ndarray, num_entities: int, num_relations: int, # https://pytorch.org/docs/stable/data.html#multi-process-data-loading # TLDL; replace Python objects with non-refcounted representations such as Pandas, Numpy or PyArrow objects self.neg_sample_ratio = torch.tensor( - neg_sample_ratio) + neg_sample_ratio) self.train_set = torch.from_numpy(train_set).unsqueeze(1) self.length = len(self.train_set) self.num_entities = torch.tensor(num_entities) diff --git a/dicee/evaluator.py b/dicee/evaluator.py index d878f636..cff762d3 100644 --- a/dicee/evaluator.py +++ b/dicee/evaluator.py @@ -86,7 +86,7 @@ def eval(self, dataset, trained_model, form_of_labelling, during_training=False) valid_set=dataset.valid_set, test_set=dataset.test_set, trained_model=trained_model) - elif self.args.scoring_technique in ['KvsAll', 'KvsSample', '1vsAll', 'PvsAll', 'CCvsAll']: + elif self.args.scoring_technique in ["AllvsAll", "KvsAll", 'KvsSample', "1vsAll"]: self.eval_with_vs_all(train_set=dataset.train_set, valid_set=dataset.valid_set, test_set=dataset.test_set, diff --git a/dicee/helper_classes.py b/dicee/helper_classes.py index 2c0aa03b..6f890bd5 100644 --- a/dicee/helper_classes.py +++ b/dicee/helper_classes.py @@ -1,3 +1,6 @@ + + +""" import torch import torch.nn as nn import torch.nn.functional as F @@ -82,3 +85,4 @@ def forward(self, pred, target): result = torch.where(torch.gt(pred, 1. - self.alpha), torch.zeros_like(divergence), divergence) return torch.mean(result) +""" \ No newline at end of file diff --git a/dicee/knowledge_graph.py b/dicee/knowledge_graph.py index b8b0793d..9a5a35a0 100644 --- a/dicee/knowledge_graph.py +++ b/dicee/knowledge_graph.py @@ -8,9 +8,9 @@ class KG: """ Knowledge Graph """ def __init__(self, data_dir: str = None, - add_noise_rate:float=None, - sparql_endpoint:str=None, - path_single_kg:str=None, + add_noise_rate: float = None, + sparql_endpoint: str = None, + path_single_kg: str = None, path_for_deserialization: str = None, add_reciprical: bool = None, eval_model: str = None, read_only_few: int = None, sample_triples_ratio: float = None, @@ -35,7 +35,7 @@ def __init__(self, data_dir: str = None, self.num_entities = None self.num_relations = None self.data_dir = data_dir - self.path_single_kg=path_single_kg + self.path_single_kg = path_single_kg self.path_for_deserialization = path_for_deserialization self.add_reciprical = add_reciprical self.eval_model = eval_model diff --git a/dicee/knowledge_graph_embeddings.py b/dicee/knowledge_graph_embeddings.py index b6715766..ba0e5ed3 100644 --- a/dicee/knowledge_graph_embeddings.py +++ b/dicee/knowledge_graph_embeddings.py @@ -369,6 +369,91 @@ def predict_conjunctive_query(self, entity: str, relations: List[str], topk: int else: return results + def answer_multi_hop_query(self,query_structure, query, data, tnorm, neg_norm, lambda_, k_:int): + """ + @ TODO: Define types of inputs + @ TODO: Add comments + @ TODO: Add returned type + """ + # Use this + self.predict() + query_name_dict = { + ("e", ("r",)): "1p", + ("e", ("r", "r")): "2p", + ("e", ("r", "r", "r",),): "3p", + (("e", ("r",)), ("e", ("r",))): "2i", + (("e", ("r",)), ("e", ("r",)), ("e", ("r",))): "3i", + ((("e", ("r",)), ("e", ("r",))), ("r",)): "ip", + (("e", ("r", "r")), ("e", ("r",))): "pi", + # negation + (("e", ("r",)), ("e", ("r", "n"))): "2in", + (("e", ("r",)), ("e", ("r",)), ("e", ("r", "n"))): "3in", + ((("e", ("r",)), ("e", ("r", "n"))), ("r",)): "inp", + (("e", ("r", "r")), ("e", ("r", "n"))): "pin", + (("e", ("r", "r", "n")), ("e", ("r",))): "pni", + + # union + (("e", ("r",)), ("e", ("r",)), ("u",)): "2u", + ((("e", ("r",)), ("e", ("r",)), ("u",)), ("r",)): "up", + + } + print(query_name_dict) + if query_structure == (("e", ("r",)), ("e", ("r", "n"))): + # entity_scores = scores_2in(query, tnorm, neg_norm, lambda_) + pass + #3in + elif query_structure == (("e", ("r",)), ("e", ("r",)), ("e", ("r","n"))): + # entity_scores = scores_3in(model, query, tnorm, neg_norm, lambda_) + pass + #pni + elif query_structure == (("e", ("r", "r", "n")), ("e", ("r",))): + # entity_scores = scores_pni(model, query, tnorm, neg_norm, lambda_, k_) + pass + #pin + elif query_structure == (("e", ("r", "r")), ("e", ("r", "n"))): + # entity_scores = scores_pin(model, query, tnorm, neg_norm, lambda_, k_) + pass + #inp + elif query_structure == ((("e", ("r",)), ("e", ("r", "n"))), ("r",)): + # entity_scores = scores_inp(model, query, tnorm, neg_norm, lambda_, k_) + pass + #2p + elif query_structure == ("e", ("r", "r")): + # entity_scores = scores_2p(model, query, tnorm, k_) + pass + #3p + elif query_structure == ("e", ("r", "r", "r",)): + # entity_scores = scores_3p(model, query, tnorm, k_) + pass + #2i + elif query_structure == (("e", ("r",)), ("e", ("r",))): + # entity_scores = scores_2i(model, query, tnorm) + pass + #3i + elif query_structure == (("e", ("r",)), ("e", ("r",)), ("e", ("r",))): + # entity_scores = scores_3i(model, query, tnorm) + pass + #pi + elif query_structure == (("e", ("r", "r")), ("e", ("r",))): + # entity_scores = scores_pi(model, query, tnorm, k_) + pass + #ip + elif query_structure == ((("e", ("r",)), ("e", ("r",))), ("r",)): + # entity_scores = scores_ip(model, query, tnorm, k_) + pass + + #disjunction + #2u + elif query_structure == (("e", ("r",)), ("e", ("r",)), ("u",)): + # entity_scores = scores_2u(model, query, tnorm) + pass + #up + # here the second tnorm is for t-conorm (used in pairs) + elif query_structure == ((("e", ("r",)), ("e", ("r",)), ("u",)), ("r",)): + # entity_scores = scores_up(model, query, tnorm, tnorm, k_) + pass + else: + raise RuntimeError(f"Imncorrect query_structure {query_structure}") def find_missing_triples(self, confidence: float, entities: List[str] = None, relations: List[str] = None, topk: int = 10, at_most: int = sys.maxsize) -> Set: diff --git a/dicee/models/base_model.py b/dicee/models/base_model.py index b478ce9a..35b120de 100644 --- a/dicee/models/base_model.py +++ b/dicee/models/base_model.py @@ -156,16 +156,7 @@ def configure_optimizers(self, parameters=None): else: raise KeyError() return self.selected_optimizer - """ - def get_optimizer_class(self): - # default params in pytorch. - if self.optimizer_name == 'SGD': - return torch.optim.SGD - elif self.optimizer_name == 'Adam': - return torch.optim.Adam - else: - raise KeyError() - """ + def loss_function(self, yhat_batch, y_batch): return self.loss(yhat_batch, y_batch) @@ -183,7 +174,7 @@ def forward(self, x: Union[torch.LongTensor, Tuple[torch.LongTensor, torch.LongT """ :param x: a batch of inputs - :param y_idx: index of selected output labels. + :param y_idx: indices of selected outputs. :return: """ if isinstance(x, tuple): @@ -285,10 +276,9 @@ def predict_dataloader(self) -> None: def train_dataloader(self) -> None: pass - def get_triple_representation(self, indexed_triple): + def get_triple_representation(self, idx_hrt): # (1) Split input into indexes. - idx_head_entity, idx_relation, idx_tail_entity = indexed_triple[:, 0], indexed_triple[:, 1], indexed_triple[:, - 2] + idx_head_entity, idx_relation, idx_tail_entity = idx_hrt[:, 0], idx_hrt[:, 1], idx_hrt[:, 2] # (2) Retrieve embeddings & Apply Dropout & Normalization head_ent_emb = self.normalize_head_entity_embeddings( self.input_dp_ent_real(self.entity_embeddings(idx_head_entity))) diff --git a/dicee/models/clifford.py b/dicee/models/clifford.py index 55e6a2be..6a629ea6 100644 --- a/dicee/models/clifford.py +++ b/dicee/models/clifford.py @@ -554,7 +554,7 @@ def score(self, h, r, t): t0, tp, tq = self.construct_cl_multivector(t, r=self.r, p=self.p, q=self.q) if self.q > 0: - self.q_coefficients = self.q_coefficients.to(h0.device) + self.q_coefficients = self.q_coefficients.to(h0.device, non_blocking=True) h0, hp, hq, h0, rp, rq = self.apply_coefficients(h0, hp, hq, h0, rp, rq) # (4) Compute a triple score based on interactions described by the basis 1. Eq. 20 diff --git a/dicee/models/pykeen_models.py b/dicee/models/pykeen_models.py index 4bf0884e..7aada0be 100644 --- a/dicee/models/pykeen_models.py +++ b/dicee/models/pykeen_models.py @@ -1,24 +1,22 @@ import torch import torch.utils.data -import numpy as np -from typing import Tuple, Union -import pickle from pykeen.models import model_resolver from .base_model import BaseKGE -import collections - - +from collections import namedtuple +""" def load_numpy(path) -> np.ndarray: print('Loading indexed training data...', end='') with open(path, 'rb') as f: data = np.load(f) return data +""" +""" def load_pickle(*, file_path=str): with open(file_path, 'rb') as f: return pickle.load(f) - +""" class PykeenKGE(BaseKGE): """ A class for using knowledge graph embedding models implemented in Pykeen @@ -31,16 +29,12 @@ class PykeenKGE(BaseKGE): Pykeen_CP: Pykeen_HolE: Pykeen_HolE: - - Training Pykeen_QuatE with KvsAll seems to continuously increase the memory usage """ def __init__(self, args: dict): super().__init__(args) self.model_kwargs = {'embedding_dim': args['embedding_dim'], 'entity_initializer': None if args['init_param'] is None else torch.nn.init.xavier_normal_, - #"entity_regularizer": None, - #"relation_regularizer": None, "random_seed": args["random_seed"] } self.model_kwargs.update(args['pykeen_model_kwargs']) @@ -70,12 +64,13 @@ def __init__(self, args: dict): elif self.name == "TransE": self.model_kwargs["regularizer"] = None else: - print("Pykeen model have a memory leak caused by their implementation of requirlizers") - print(f"{self.name} does not seem to have any requirlizer") + print("Pykeen model have a memory leak caused by their implementation of regularizers") + print(f"{self.name} does not seem to have any regularizer") self.model = model_resolver. \ make(self.name, self.model_kwargs, triples_factory= - collections.namedtuple('triples_factory', ['num_entities', 'num_relations', 'create_inverse_triples'])( + namedtuple('triples_factory', + ['num_entities', 'num_relations', 'create_inverse_triples'])( self.num_entities, self.num_relations, False)) self.loss_history = [] self.args = args @@ -90,16 +85,6 @@ def __init__(self, args: dict): self.interaction = v else: pass - """ - if self.entity_embeddings.embedding_dim == 4 * self.embedding_dim: - self.last_dim = 4 - elif self.entity_embeddings.embedding_dim == 2 * self.embedding_dim: - self.last_dim = 2 - elif self.entity_embeddings.embedding_dim == self.embedding_dim: - self.last_dim = 0 - else: - raise NotImplementedError(self.entity_embeddings.embedding_dim) - """ def forward_k_vs_all(self, x: torch.LongTensor): """ @@ -139,20 +124,4 @@ def forward_triples(self, x: torch.LongTensor) -> torch.FloatTensor: return self.model.score_hrt(hrt_batch=x, mode=None).flatten() def forward_k_vs_sample(self, x: torch.LongTensor, target_entity_idx): - raise NotImplementedError() - - def forward(self, x: Union[torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor]], - y_idx: torch.LongTensor = None): - if isinstance(x, tuple): - x, y_idx = x - return self.forward_k_vs_sample(x=x, target_entity_idx=y_idx) - else: - batch_size, dim = x.shape - if dim == 3: - return self.forward_triples(x) - elif dim == 2: - # h, y = x[0], x[1] - # Note that y can be relation or tail entity. - return self.forward_k_vs_all(x=x) - else: - return self.forward_sequence(x=x) + raise NotImplementedError() \ No newline at end of file diff --git a/dicee/sanity_checkers.py b/dicee/sanity_checkers.py index 040e472a..b13feed2 100644 --- a/dicee/sanity_checkers.py +++ b/dicee/sanity_checkers.py @@ -21,7 +21,7 @@ def sanity_checking_with_arguments(args): print(f'embedding_dim must be strictly positive. Currently:{args.embedding_dim}') raise - if args.scoring_technique not in ['KvsSample', 'KvsAll', 'NegSample', '1vsAll', 'Pyke']: + if args.scoring_technique not in ["AllvsAll", "KvsSample", "KvsAll", "NegSample", "1vsAll", "Pyke"]: raise KeyError(f'Invalid training strategy => {args.scoring_technique}.') assert args.learning_rate > 0 diff --git a/dicee/static_funcs.py b/dicee/static_funcs.py index 62c2ce95..67579321 100644 --- a/dicee/static_funcs.py +++ b/dicee/static_funcs.py @@ -101,8 +101,6 @@ def load_model(path_of_experiment_folder: str, model_name='model.pt') -> Tuple[o except FileNotFoundError: print("relation_to_idx.p not found") relation_to_idx=dict() - #assert isinstance(entity_to_idx, dict) - #assert isinstance(relation_to_idx, dict) print(f'Done! It took {time.time() - start_time:.4f}') return model, entity_to_idx, relation_to_idx @@ -360,12 +358,8 @@ def intialize_model(args: dict) -> Tuple[object, str]: def load_json(p: str) -> dict: - try: - with open(p, 'r') as r: + with open(p, 'r') as r: args = json.load(r) - except FileNotFoundError: - print('Config file not found') - exit(1) return args @@ -426,46 +420,6 @@ def deploy_relation_prediction(pre_trained_kge, str_subject, str_object, top_k): return f'( {str_subject}, ?, {str_object} )', pd.DataFrame({'Relations': relations, 'Score': scores}) -def semi_supervised_split(train_set: np.ndarray, train_split_ratio=None, calibration_split_ratio=None): - """ - Split input triples into three splits - 1. split corresponds to the first 10% of the input - 2. split corresponds to the second 10% of the input - 3. split corresponds to the remaining data. - """ - # Divide train_set into - n, d = train_set.shape - assert d == 3 - # (1) Select X % of the first triples for the training. - train = train_set[: int(n * train_split_ratio)] - # (2) Select remaining first Y % of the triples for the calibration. - calibration = train_set[len(train):len(train) + int(n * calibration_split_ratio)] - # (3) Consider remaining triples as unlabelled. - unlabelled = train_set[-len(train) - len(calibration):] - print(f'Shapes:\tTrain{train.shape}\tCalib:{calibration.shape}\tUnlabelled:{unlabelled.shape}') - return train, calibration, unlabelled - - -def p_value(non_conf_scores, act_score): - if len(act_score.shape) < 2: - act_score = act_score.unsqueeze(-1) - - # return (torch.sum(non_conf_scores >= act_score) + 1) / (len(non_conf_scores) + 1) - return (torch.sum(non_conf_scores >= act_score, dim=-1) + 1) / (len(non_conf_scores) + 1) - - -def norm_p_value(p_values, variant): - if len(p_values.shape) < 2: - p_values = p_values.unsqueeze(0) - - if variant == 0: - norm_p_values = p_values / (torch.max(p_values, dim=-1).values.unsqueeze(-1)) - else: - norm_p_values = p_values.scatter_(1, torch.max(p_values, dim=-1).indices.unsqueeze(-1), - torch.ones_like(p_values)) - return norm_p_values - - @timeit def vocab_to_parquet(vocab_to_idx, name, path_for_serialization, print_into): # @TODO: This function should take any DASK/Pandas DataFrame or Series. diff --git a/dicee/static_preprocess_funcs.py b/dicee/static_preprocess_funcs.py index 4da54a46..e26b91ec 100644 --- a/dicee/static_preprocess_funcs.py +++ b/dicee/static_preprocess_funcs.py @@ -6,8 +6,6 @@ from .sanity_checkers import sanity_checking_with_arguments enable_log = False - - def timeit(func): @functools.wraps(func) def timeit_wrapper(*args, **kwargs): @@ -58,7 +56,7 @@ def preprocesses_input_args(args): args.eval_model = None # reciprocal checking - if args.scoring_technique in ['KvsSample', 'KvsAll', '1vsAll', 'Pyke']: + if args.scoring_technique in ["AllvsAll", "KvsSample", "KvsAll", "1vsAll"]: args.apply_reciprical_or_noise = True elif args.scoring_technique == 'NegSample': args.apply_reciprical_or_noise = False diff --git a/dicee/trainer/dice_trainer.py b/dicee/trainer/dice_trainer.py index 0e37778c..e7d6c205 100644 --- a/dicee/trainer/dice_trainer.py +++ b/dicee/trainer/dice_trainer.py @@ -3,7 +3,8 @@ from typing import Union from dicee.models.base_model import BaseKGE from dicee.static_funcs import select_model -from dicee.callbacks import PPE, FPPE, Eval, KronE, PrintCallback, KGESaveCallback, AccumulateEpochLossCallback, GN, RN +from dicee.callbacks import (PPE, FPPE, Eval, KronE, PrintCallback, KGESaveCallback, AccumulateEpochLossCallback, + Perturb) from dicee.dataset_classes import construct_dataset, reload_dataset from .torch_trainer import TorchTrainer from .torch_trainer_ddp import TorchDDPTrainer @@ -49,10 +50,8 @@ def get_callbacks(args): if isinstance(args.callbacks, list): return callbacks for k, v in args.callbacks.items(): - if k == "GN": - callbacks.append(GN(std=v['std'], epoch_ratio=v.get('epoch_ratio'))) - elif k=='RN': - callbacks.append(RN(std=v['std'], epoch_ratio=v.get('epoch_ratio'))) + if k=="Perturb": + callbacks.append(Perturb(**v)) elif k == 'FPP': callbacks.append( FPPE(num_epochs=args.num_epochs, path=args.full_storage_path, @@ -66,10 +65,9 @@ def get_callbacks(args): elif k == 'Eval': callbacks.append(Eval(path=args.full_storage_path, epoch_ratio=v.get('epoch_ratio'))) else: - raise RuntimeError('Incorrect callback') + raise RuntimeError(f'Incorrect callback:{k}') return callbacks - class DICE_Trainer: """ DICE_Trainer implement @@ -123,7 +121,7 @@ def continual_start(self): form_of_labelling: str """ - self.trainer = self.initialize_trainer(callbacks=get_callbacks(self.args), plugins=[]) + self.trainer = self.initialize_trainer(callbacks=get_callbacks(self.args)) model, form_of_labelling = self.initialize_or_load_model() assert form_of_labelling in ['EntityPrediction', 'RelationPrediction', 'Pyke'] assert self.args.scoring_technique in ['KvsSample', '1vsAll', 'KvsAll', 'NegSample'] @@ -136,7 +134,7 @@ def continual_start(self): return model, form_of_labelling @timeit - def initialize_trainer(self, callbacks: List, plugins: List) -> pl.Trainer: + def initialize_trainer(self, callbacks: List) -> pl.Trainer: """ Initialize Trainer from input arguments """ return initialize_trainer(self.args, callbacks) @@ -185,7 +183,7 @@ def start(self, dataset: KG) -> Tuple[BaseKGE, str]: return self.k_fold_cross_validation(dataset) else: self.trainer: Union[TorchTrainer, TorchDDPTrainer, pl.Trainer] - self.trainer = self.initialize_trainer(callbacks=get_callbacks(self.args), plugins=[]) + self.trainer = self.initialize_trainer(callbacks=get_callbacks(self.args)) model, form_of_labelling = self.initialize_or_load_model() self.trainer.evaluator = self.evaluator # @TODO Why do we need to sent the dataset ? @@ -215,6 +213,7 @@ def k_fold_cross_validation(self, dataset) -> Tuple[BaseKGE, str]: kf = KFold(n_splits=self.args.num_folds_for_cv, shuffle=True, random_state=1) model = None eval_folds = [] + form_of_labelling=None # (2) Iterate over (1) for (ith, (train_index, test_index)) in enumerate(kf.split(dataset.train_set)): # (2.1) Create a new copy for the callbacks diff --git a/dicee/trainer/torch_trainer.py b/dicee/trainer/torch_trainer.py index d0fcb77c..59e0c2bb 100644 --- a/dicee/trainer/torch_trainer.py +++ b/dicee/trainer/torch_trainer.py @@ -88,7 +88,7 @@ def _run_epoch(self, epoch: int) -> float: f"| ForwardBackwardUpdate:{(time.time() - start_time):.2f}sec " f"| BatchConst.:{construct_mini_batch_time:.2f}sec " f"| Mem. Usage {self.process.memory_info().rss / 1_000_000: .5}MB " - f"avail. {psutil.virtual_memory().percent} %") + f" ({psutil.virtual_memory().percent} %)") else: print( f"Epoch:{epoch + 1} " diff --git a/main.py b/main.py index 85bc31e2..2332c0d1 100755 --- a/main.py +++ b/main.py @@ -2,7 +2,6 @@ import json from dicee.executer import Execute import pytorch_lightning as pl -from dicee.config import ParseDict import argparse @@ -11,7 +10,7 @@ def get_default_arguments(description=None): parser = pl.Trainer.add_argparse_args(argparse.ArgumentParser(add_help=False)) # Default Trainer param https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#methods # Data related arguments - parser.add_argument("--path_dataset_folder", type=str, default=None, + parser.add_argument("--path_dataset_folder", type=str, default="KGs/UMLS", help="The path of a folder containing train.txt, and/or valid.txt and/or test.txt" ",e.g., KGs/UMLS") parser.add_argument("--sparql_endpoint", type=str, default=None, @@ -30,7 +29,7 @@ def get_default_arguments(description=None): default="Keci", choices=["ConEx", "AConEx", "ConvQ", "AConvQ", "ConvO", "AConvO", "QMult", "OMult", "Shallom", "DistMult", "TransE", "ComplEx", "Keci", - "Pykeen_QuatE", "Pykeen_DistMult", "Pykeen_BoxE", "Pykeen_CP", + "Pykeen_MuRE", "Pykeen_QuatE", "Pykeen_DistMult", "Pykeen_BoxE", "Pykeen_CP", "Pykeen_HolE", "Pykeen_ProjE", "Pykeen_RotatE", "Pykeen_TransE", "Pykeen_TransF", "Pykeen_TransH", "Pykeen_TransR", "Pykeen_TuckER", "Pykeen_ComplEx"], @@ -40,22 +39,25 @@ def get_default_arguments(description=None): parser.add_argument('--optim', type=str, default='Adam', help='An optimizer', choices=['Adam', 'SGD']) - parser.add_argument('--embedding_dim', type=int, default=64, + parser.add_argument('--embedding_dim', type=int, default=32, help='Number of dimensions for an embedding vector. ') - parser.add_argument("--num_epochs", type=int, default=10, help='Number of epochs for training. ') - parser.add_argument('--batch_size', type=int, default=256, help='Mini batch size. If None, automatic batch finder is applied') + parser.add_argument("--num_epochs", type=int, default=50, help='Number of epochs for training. ') + parser.add_argument('--batch_size', type=int, default=1024, + help='Mini batch size. If None, automatic batch finder is applied') parser.add_argument("--lr", type=float, default=0.1) - parser.add_argument('--callbacks', type=json.loads, default={}, - help=' {"PPE":{ "last_percent_to_consider": 10}}, {"GN": {"std":0.1}}') + parser.add_argument('--callbacks', type=json.loads, + default={}, + help='{"PPE":{ "last_percent_to_consider": 10}}' + '"Perturb": {"level": "out", "ratio": 0.2, "method": "RN", "scaler": 0.3}') parser.add_argument("--backend", type=str, default='pandas', choices=["pandas", "polars"], help='Backend for loading, preprocessing, indexing input knowledge graph.') - parser.add_argument("--trainer", type=str, default='torchCPUTrainer', + parser.add_argument("--trainer", type=str, default='PL', choices=['torchCPUTrainer', 'PL', 'torchDDP'], help='PL (pytorch lightning trainer), torchDDP (custom ddp), torchCPUTrainer (custom cpu only)') - parser.add_argument('--scoring_technique', default='KvsAll', + parser.add_argument('--scoring_technique', default="AllvsAll", help="Training technique for knowledge graph embedding model", - choices=["KvsAll", "1vsAll", "NegSample", "KvsSample"]) + choices=["AllvsAll", "KvsAll", "1vsAll", "NegSample", "KvsSample"]) parser.add_argument('--neg_ratio', type=int, default=0, help='The number of negative triples generated per positive triple.') parser.add_argument('--weight_decay', type=float, default=0.0, help='L2 penalty e.g.(0.00001)') @@ -79,7 +81,8 @@ def get_default_arguments(description=None): parser.add_argument("--save_model_at_every_epoch", type=int, default=None, help='At every X number of epochs model will be saved. If None, we save 4 times.') parser.add_argument("--label_smoothing_rate", type=float, default=0.0, help='None for not using it.') - parser.add_argument("--kernel_size", type=int, default=3, help="Square kernel size for convolution based models.") + parser.add_argument("--kernel_size", type=int, default=3, + help="Square kernel size for convolution based models.") parser.add_argument("--num_of_output_channels", type=int, default=2, help="# of output channels in convolution") parser.add_argument("--num_core", type=int, default=1, @@ -95,10 +98,7 @@ def get_default_arguments(description=None): help='P for Clifford Algebra') parser.add_argument('--q', type=int, default=0, help='Q for Clifford Algebra') - parser.add_argument("--pykeen_model_kwargs", nargs='*', action=ParseDict, - default={}, - help='Additional parameters ' - 'to be passed into a knowledge graph embedding model imported from Pykeen') + parser.add_argument('--pykeen_model_kwargs', type=json.loads, default={}) if description is None: return parser.parse_args() return parser.parse_args(description) diff --git a/pyproject.toml b/pyproject.toml index 16bf032e..c12b842e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dicee" -requires-python = ">=3.10" +requires-python = ">=3.9" version="0.0.6" authors = [ { name="Caglar Demir", email="caglardemir8@gmail.com" }, @@ -8,7 +8,7 @@ authors = [ description="Dice embedding is an hardware-agnostic framework for large-scale knowledge graph embedding applications" readme = "README.md" classifiers = [ - "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.9", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] diff --git a/setup.py b/setup.py index 2bfe671f..d5b8fdee 100644 --- a/setup.py +++ b/setup.py @@ -24,9 +24,9 @@ author_email='caglardemir8@gmail.com', url='https://github.com/dice-group/dice-embeddings', classifiers=[ - "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.9", "License :: OSI Approved :: MIT License"], - python_requires='>=3.10', + python_requires='>=3.9', long_description=long_description, long_description_content_type="text/markdown", ) diff --git a/tests/test_pykeen.py b/tests/test_pykeen.py index 835ef0ee..90877b9f 100644 --- a/tests/test_pykeen.py +++ b/tests/test_pykeen.py @@ -41,11 +41,11 @@ def test_defaultParameters_case(self, model_name): elif args.model == "Pykeen_BoxE": assert 0.85 >= result["Train"]["MRR"] >= 0.78 elif args.model == "Pykeen_RotatE": - assert 0.67 >= result["Train"]["MRR"] >= 0.64 + assert 0.67 >= result["Train"]["MRR"] >= 0.60 elif args.model == "Pykeen_CP": # 1.5M params assert 1.00 >= result["Train"]["MRR"] >= 0.99 elif args.model == "Pykeen_HolE": # 14.k params - assert 0.89 >= result["Train"]["MRR"] >= 0.88 + assert 0.95 >= result["Train"]["MRR"] >= 0.88 elif args.model == "Pykeen_ProjE": # 14.k params assert 0.88 >= result["Train"]["MRR"] >= 0.78 elif args.model == "Pykeen_TuckER": # 276.k params @@ -55,13 +55,13 @@ def test_defaultParameters_case(self, model_name): elif args.model == "Pykeen_TransF": # 14.5 k params assert 0.17 >= result["Train"]["MRR"] >= 0.16 elif args.model == "Pykeen_TransH": # 20.4 k params - assert 0.69 >= result["Train"]["MRR"] >= 0.60 + assert 0.69 >= result["Train"]["MRR"] >= 0.58 elif args.model == "Pykeen_TransD": # 29.1 k params assert 0.73 >= result["Train"]["MRR"] >= 0.60 elif args.model == "Pykeen_TransE": # 29.1 k params - assert 0.45 >= result["Train"]["MRR"] >= 0.40 + assert 0.45 >= result["Train"]["MRR"] >= 0.15 - def test_GNCallback_case(self, model_name): + def test_perturb_callback_case(self, model_name): args = template(model_name) - args.callbacks = {'GN': {"std": 0.1}} - Execute(args).start() \ No newline at end of file + args.callbacks = {"Perturb": {"level": "out", "ratio": 0.2, "method": "RN", "scaler": 0.3}} + Execute(args).start() diff --git a/tests/test_regression_all_vs_all.py b/tests/test_regression_all_vs_all.py new file mode 100644 index 00000000..00fe0ceb --- /dev/null +++ b/tests/test_regression_all_vs_all.py @@ -0,0 +1,20 @@ +from dicee.executer import Execute +import pytest +from dicee.config import Namespace + + +class TestRegressionAllvsAll: + @pytest.mark.filterwarnings('ignore::UserWarning') + def test_allvsall_kvsall(self): + args = Namespace() + args.path_dataset_folder = 'KGs/UMLS' + args.scoring_technique = 'KvsAll' + args.eval_model = 'train_val_test' + result1 = Execute(args).start() + + args = Namespace() + args.path_dataset_folder = 'KGs/UMLS' + args.scoring_technique = 'AllvsAll' + args.eval_model = 'train_val_test' + result2 = Execute(args).start() + assert result2['Test']['MRR'] >= result1['Test']['MRR'] \ No newline at end of file