Merge branch 'feature/ScriptsForGridSearch' into develop
Demirrr committed Feb 22, 2022
2 parents 3e37b34 + 827da56 commit e06de9a
Showing 9 changed files with 248 additions and 409 deletions.
42 changes: 27 additions & 15 deletions README.md
@@ -46,21 +46,34 @@ This share link expires in 72 hours. For free permanent hosting, check out Space
```
![Deployment of the QMult model family](core/figures/deploy_qmult_family.png)

### Available Models
1. [DistMult](https://arxiv.org/pdf/1412.6575.pdf)
2. [ComplEx](https://arxiv.org/pdf/1606.06357.pdf)
3. [Shallom](https://arxiv.org/pdf/2101.09090.pdf)
4. [ConEx](https://openreview.net/forum?id=6T45-4TFqaX&invitationId=eswc-conferences.org/ESWC/2021/Conference/Research_Track/Paper49/-/Camera_Ready_Revision&referrer=%5BTasks%5D(%2Ftasks))
5. [QMult](https://proceedings.mlr.press/v157/demir21a.html)
6. [OMult](https://proceedings.mlr.press/v157/demir21a.html)
7. [ConvQ](https://proceedings.mlr.press/v157/demir21a.html)
8. [ConvO](https://proceedings.mlr.press/v157/demir21a.html)
9. Contact us to add your favorite one :)


### Pretrained Models
TODO: Add a script to load and deploy a pretrained model
## Pre-trained Models
Please contact ```[email protected]``` or ```[email protected]``` if you lack the hardware resources to obtain embeddings of a specific knowledge graph.
- [English-French DBpedia Shallom embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/shallom/Shallom_EN_FR_15K_V1.zip)
- [YAGO3-10 ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/YAGO3-10.zip)
- [FB15K-237 ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/FB15K-237.zip)
- [FB15K ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/FB15K.zip)
- [WN18RR ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/WN18RR.zip)
- [WN18 ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/WN18.zip)
- [Forte ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/ConEx_Forte.zip)
- [Hepatitis ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/ConEx_Hepatitis.zip)
- [Lymphography ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/ConEx_Lymphography.zip)
- [Mammographic ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/ConEx_Mammographic.zip)
- [Animals ConEx embeddings](https://hobbitdata.informatik.uni-leipzig.de/KGE/conex/Animals.zip)
- For more, see [Hobbit Data](https://hobbitdata.informatik.uni-leipzig.de/KGE/)
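
A minimal sketch of loading one of these downloads, assuming the archive contains a CSV of entity embeddings indexed by entity name (the file name and the entity label below are placeholders, not files guaranteed to be in every archive):

```python
import pandas as pd

# Placeholder file name: use the CSV found inside the downloaded archive.
entity_emb = pd.read_csv('ConEx_entity_embeddings.csv', index_col=0)

print(entity_emb.shape)  # (number of entities, embedding dimension)
# Look up the vector of a single entity, if that label exists in the index.
vector = entity_emb.loc['http://example.org/Entity']
```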

### Available Models
1. Multiplicative KGE models:
1. [DistMult](https://arxiv.org/pdf/1412.6575.pdf)
2. [ComplEx](https://arxiv.org/pdf/1606.06357.pdf)
3. [QMult](https://proceedings.mlr.press/v157/demir21a.html)
4. [OMult](https://proceedings.mlr.press/v157/demir21a.html)
2. Feed-forward neural models:
1. [Shallom](https://arxiv.org/pdf/2101.09090.pdf)
3. Convolutional neural models:
1. [ConEx](https://openreview.net/forum?id=6T45-4TFqaX&invitationId=eswc-conferences.org/ESWC/2021/Conference/Research_Track/Paper49/-/Camera_Ready_Revision&referrer=%5BTasks%5D(%2Ftasks))
2. [ConvQ](https://proceedings.mlr.press/v157/demir21a.html)
3. [ConvO](https://proceedings.mlr.press/v157/demir21a.html)
4. Contact us to add your favorite one :)


### Training
@@ -146,7 +159,6 @@ url={https://openreview.net/forum?id=6T45-4TFqaX}}
pages={179--182},
year={2021},
organization={IEEE}
```

For any questions or wishes, please contact: ```[email protected]``` or ```[email protected]```
133 changes: 68 additions & 65 deletions core/executer.py
@@ -1,7 +1,7 @@
import warnings
import os
from .models import *
from .helper_classes import LabelRelaxationLoss,LabelSmoothingLossCanonical
from .helper_classes import LabelRelaxationLoss, LabelSmoothingLossCanonical
from .dataset_classes import StandardDataModule, KvsAll, CVDataModule
from .knowledge_graph import KG
import torch
@@ -28,9 +28,8 @@
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
seed_everything(1, workers=True)


# TODO: later, measure the time spent on every 'Done!' operation

# TODO: Execute can inherit from Trainer and Evaluator Classes
# By doing so we can increase the modularity of our code.
class Execute:
def __init__(self, args, continuous_training=False):
# (1) Process arguments and sanity checking
@@ -101,7 +100,7 @@ def start(self) -> dict:
else:
message = f'{total_runtime / (60 ** 2):.3f} hours'
self.report['Runtime'] = message
self.report.update(extract_model_summary(trained_model.summarize()))

print(f'Runtime of {trained_model.name}:', total_runtime)
print(f'NumParam of {trained_model.name}:', self.report["NumParam"])
# print(f'Estimated of {trained_model.name}:', self.report["EstimatedSizeMB"])
@@ -172,6 +171,30 @@ def config_kge_sanity_checking(self):
if self.args.scoring_technique == 'KvsAll':
self.args.neg_ratio = None

def save_embeddings(self, embeddings: np.ndarray, indexes, path: str) -> None:
"""
Save embeddings to disk as CSV, falling back to a Parquet dataset for very large frames.
:param embeddings: 2D array of embedding vectors.
:param indexes: entity or relation labels used as the row index.
:param path: target CSV file path.
:return: None
"""
try:
df = pd.DataFrame(embeddings, index=indexes)
del embeddings
num_mb = df.memory_usage(index=True, deep=True).sum() / (10 ** 6)
if num_mb > 10 ** 6:
df = dd.from_pandas(df, npartitions=max(1, len(df) // 100))
# Parquet requires string column names.
df.columns = df.columns.astype(str)
df.to_parquet(path.replace('.csv', ''))
else:
df.to_csv(path)
except (KeyError, AttributeError) as e:
print('Exception occurred at saving embeddings. Computation will continue')
print(e)
del df

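For orientation, a short sketch of reading the stored embeddings back with pandas and dask, in line with the dd.read_parquet hint left as a comment in store further below (paths are placeholders):

```python
import pandas as pd
import dask.dataframe as dd

# Small embeddings are written as a single CSV file.
entity_emb = pd.read_csv('path/to/QMult_entity_embeddings.csv', index_col=0)

# Very large embeddings are written as a Parquet dataset via dask.
relation_emb = dd.read_parquet('path/to/QMult_relation_embeddings').compute()
```
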
def store(self, trained_model) -> None:
"""
Store the trained model and save its embeddings into csv files.
@@ -186,45 +209,25 @@ def store(self, trained_model) -> None:
with open(self.args.full_storage_path + '/configuration.json', 'w') as file_descriptor:
temp = vars(self.args)
json.dump(temp, file_descriptor)
print('Saving embeddings..')
# TODO: Find a faster way to store embeddings.
if trained_model.name == 'Shallom':
entity_emb = trained_model.get_embeddings()
else:
# See available memory and decide whether embeddings are stored separately or not.
available_memory = [i.split() for i in os.popen('free -h').read().splitlines()][1][-1] # ,e.g., 10Gi
available_memory_mb = float(available_memory[:-2]) * 1000
self.report.update(extract_model_summary(trained_model.summarize()))

if available_memory_mb * .01 > self.report['EstimatedSizeMB']:
""" We have enough space for data conversion"""
print('Saving embeddings..')
entity_emb, relation_ebm = trained_model.get_embeddings()
try:
df = pd.DataFrame(relation_ebm, index=self.dataset.relations_str)
df.columns = df.columns.astype(str)
num_mb = df.memory_usage(index=True, deep=True).sum() / (10 ** 6)
if num_mb > 10 ** 6:
df = dd.from_pandas(df, npartitions=len(df) / 100)
# PARQUET wants columns to be stn
df.columns = df.columns.astype(str)
df.to_parquet(self.args.full_storage_path + '/' + trained_model.name + '_relation_embeddings')
# TO READ PARQUET FILE INTO PANDAS
# m=dd.read_parquet(self.storage_path + '/' + trained_model.name + '_relation_embeddings').compute()
else:
df.to_csv(self.args.full_storage_path + '/' + trained_model.name + '_relation_embeddings.csv')
except KeyError or AttributeError as e:
print('Exception occurred at saving relation embeddings. Computation will continue')
print(e)

# Free mem del
del df
self.save_embeddings(entity_emb, indexes=self.dataset.entities_str,
path=self.args.full_storage_path + '/' + trained_model.name + '_entity_embeddings.csv')
del entity_emb
if relation_ebm is not None:
self.save_embeddings(relation_ebm, indexes=self.dataset.relations_str,
path=self.args.full_storage_path + '/' + trained_model.name + '_relation_embeddings.csv')
del relation_ebm
try:
df = pd.DataFrame(entity_emb, index=self.dataset.entities_str)
num_mb = df.memory_usage(index=True, deep=True).sum() / (10 ** 6)
if num_mb > 10 ** 6:
df = dd.from_pandas(df, npartitions=len(df) / 100)
# PARQUET wants columns to be stn
df.columns = df.columns.astype(str)
df.to_parquet(self.args.full_storage_path + '/' + trained_model.name + '_relation_embeddings')
else:
df.to_csv(self.args.full_storage_path + '/' + trained_model.name + '_entity_embeddings.csv', )
except KeyError or AttributeError as e:
print('Exception occurred at saving entity embeddings.Computation will continue')
print(e)
else:
print('There is not enough memory to store embeddings separately.')

def get_batch_1_to_N(self, input_vocab, triples, idx, output_dim) -> Tuple[np.array, torch.FloatTensor]:
""" A mini-batch for training on multi-labels (x,y) -> [0.,0.,0.,----, 1.,1,]
@@ -244,6 +247,14 @@ def get_batch_1_to_N(self, input_vocab, triples, idx, output_dim) -> Tuple[np.ar
targets[idx, input_vocab[pair]] = 1
return np.array(batch), torch.FloatTensor(targets)

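A toy illustration of the multi-label target built above: for a given (head, relation) pair, every known tail index receives a 1 (the vocabulary and dimensions are made up for the example):

```python
import numpy as np
import torch

# Hypothetical vocabulary: (head, relation) -> list of tail entity indices.
input_vocab = {('paris', 'locatedIn'): [2, 4]}
output_dim = 5  # total number of entities in this toy setting

targets = np.zeros((1, output_dim))
targets[0, input_vocab[('paris', 'locatedIn')]] = 1  # mark every known tail
print(torch.FloatTensor(targets))  # tensor([[0., 0., 1., 0., 1.]])
```
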
@staticmethod
def model_fitting(trainer, model, train_dataloaders) -> None:
print(model)
print(model.summarize())
print("Model fitting...")
trainer.fit(model, train_dataloaders=train_dataloaders)
print("Done!")

def training_kvsall(self):
"""
Train models with KvsAll or NegativeSampling
@@ -263,15 +274,14 @@ def training_kvsall(self):
batch_size=self.args.batch_size,
num_workers=self.args.num_processes,
label_smoothing_rate=self.args.label_smoothing_rate)
# 5. Train model
self.trainer.fit(model, train_dataloaders=dataset.train_dataloader())

# 3. Train model.
self.model_fitting(trainer=self.trainer, model=model, train_dataloaders=dataset.train_dataloader())
# 4. Test model on the training dataset if it is needed.
if self.args.eval_on_train:
res = self.evaluate_lp_k_vs_all(model, self.dataset.train_set,
f'Evaluate {model.name} on Train set', form_of_labelling)
self.report['Train'] = res

# 6. Test model on validation and test sets if possible.
# 5. Test model on the validation and test dataset if it is needed.
if self.args.eval:
if len(self.dataset.valid_set) > 0:
res = self.evaluate_lp_k_vs_all(model, self.dataset.valid_set,
@@ -289,7 +299,6 @@ def training_1vsall(self):
model, form_of_labelling = select_model(self.args)
print(f'1vsAll training starts: {model.name}')
form_of_labelling = '1VsAll'

# 2. Create training data.
dataset = StandardDataModule(train_set_idx=self.dataset.train_set,
valid_set_idx=self.dataset.valid_set,
@@ -301,27 +310,23 @@ def training_1vsall(self):
batch_size=self.args.batch_size,
num_workers=self.args.num_processes
)

# 3. Display the selected model's architecture.
if self.args.label_relaxation_rate:
model.loss=LabelRelaxationLoss(alpha=self.args.label_relaxation_rate)
#model.loss=LabelSmoothingLossCanonical()
model.loss = LabelRelaxationLoss(alpha=self.args.label_relaxation_rate)
# model.loss=LabelSmoothingLossCanonical()

elif self.args.label_smoothing_rate:
model.loss = nn.CrossEntropyLoss(label_smoothing=self.args.label_smoothing_rate)
else:
model.loss = nn.CrossEntropyLoss()

print(model)
print(model.loss)
# 5. Train model
self.trainer.fit(model, train_dataloaders=dataset.train_dataloader())
# 3. Train model
self.model_fitting(trainer=self.trainer, model=model, train_dataloaders=dataset.train_dataloader())
# 4. Test model on the training dataset if it is needed.
if self.args.eval_on_train:
res = self.evaluate_lp_k_vs_all(model, self.dataset.train_set,
f'Evaluate {model.name} on train set', form_of_labelling)
self.report['Train'] = res

# 6. Test model on validation and test sets if possible.
# 5. Test model on the validation and test dataset if it is needed.
if self.args.eval:
if len(self.dataset.valid_set) > 0:
res = self.evaluate_lp_k_vs_all(model, self.dataset.valid_set,
@@ -353,15 +358,13 @@ def training_negative_sampling(self) -> pl.LightningModule:
batch_size=self.args.batch_size,
num_workers=self.args.num_processes
)
print('Done!\n')
print(model)
print('Fitting the model...')
self.trainer.fit(model, train_dataloaders=dataset.train_dataloader())
print('Done!\n')
# 3. Train model
self.model_fitting(trainer=self.trainer, model=model, train_dataloaders=dataset.train_dataloader())
# 4. Test model on the training dataset if it is needed.
if self.args.eval_on_train:
res = self.evaluate_lp(model, self.dataset.train_set, f'Evaluate {model.name} on Train set')
self.report['Train'] = res

# 5. Test model on the validation and test dataset if it is needed.
if self.args.eval:
if len(self.dataset.valid_set) > 0:
self.report['Val'] = self.evaluate_lp(model, self.dataset.valid_set, 'Evaluation of Validation set')
@@ -606,8 +609,8 @@ def k_fold_cross_validation(self) -> pl.LightningModule:
batch_size=self.args.batch_size,
num_workers=self.args.num_processes
)
# 5. Train model
trainer.fit(model, train_dataloaders=dataset.train_dataloader())
# 3. Train model
self.model_fitting(trainer=trainer, model=model, train_dataloaders=dataset.train_dataloader())

# 6. Test model on validation and test sets if possible.
res = self.evaluate_lp_k_vs_all(model, test_set_for_i_th_fold, form_of_labelling=form_of_labelling)
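
For reference, a minimal sketch of the fold split this routine iterates over, assuming scikit-learn's KFold over an integer-encoded triple array (variable names and the number of splits are illustrative):

```python
import numpy as np
from sklearn.model_selection import KFold

train_set = np.random.randint(0, 100, size=(1000, 3))  # toy (head, relation, tail) index triples
kf = KFold(n_splits=10, shuffle=True, random_state=1)
for train_index, test_index in kf.split(train_set):
    train_fold, test_fold = train_set[train_index], train_set[test_index]
    # A model is fitted on train_fold and evaluated on test_fold,
    # mirroring the loop body of k_fold_cross_validation above.
```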
18 changes: 16 additions & 2 deletions core/models/base_model.py
@@ -1,5 +1,6 @@
import pytorch_lightning as pl
import torch
from pytorch_lightning.utilities.types import TRAIN_DATALOADERS, EVAL_DATALOADERS
from torch import nn
from torch.nn import functional as F
from torchmetrics import Accuracy as accuracy
@@ -8,6 +9,7 @@


class BaseKGE(pl.LightningModule):

def __init__(self, learning_rate=.1):
super().__init__()
self.name = 'Not init'
@@ -38,11 +40,11 @@ def forward(self, x):

def training_step(self, batch, batch_idx):
x_batch, y_batch = batch
pred_batch=self.forward(x_batch)
pred_batch = self.forward(x_batch)
train_loss = self.loss_function(pred_batch, y_batch)
return {'loss': train_loss}

#def training_epoch_end(self, outputs) -> None:
# def training_epoch_end(self, outputs) -> None:
# """ DBpedia debugging removed."""
# #avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
# #self.log('avg_loss', avg_loss, on_epoch=False, prog_bar=True)
@@ -78,3 +80,15 @@ def test_step(self, batch, batch_idx):
def test_epoch_end(self, outputs: List[Any]):
avg_test_accuracy = torch.stack([x['test_accuracy'] for x in outputs]).mean()
self.log('avg_test_accuracy', avg_test_accuracy, on_epoch=True, prog_bar=True)

def test_dataloader(self) -> EVAL_DATALOADERS:
pass

def val_dataloader(self) -> EVAL_DATALOADERS:
pass

def predict_dataloader(self) -> EVAL_DATALOADERS:
pass

def train_dataloader(self) -> TRAIN_DATALOADERS:
pass
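
These stubs keep the LightningModule dataloader hooks satisfied while data loading stays in StandardDataModule. A hypothetical subclass that owns its own training data could override train_dataloader as sketched below; the class and variable names are made up for the example and are not part of the repository:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

class ToyKGE(BaseKGE):
    """Illustrative subclass only, not part of the repository."""

    def __init__(self, triples: torch.LongTensor):
        super().__init__()
        self.triples = triples  # shape (num_triples, 3): (head, relation, tail) indices

    def train_dataloader(self) -> TRAIN_DATALOADERS:
        labels = torch.ones(len(self.triples))  # toy positive-only labels
        return DataLoader(TensorDataset(self.triples, labels), batch_size=32, shuffle=True)
```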