
Commit

Merge branch 'release/1.1.0'
lukostaz committed Aug 16, 2019
2 parents ac825df + a0de5fe commit 19c5321
Showing 75 changed files with 11,069 additions and 626,962 deletions.
1 change: 0 additions & 1 deletion MANIFEST.in
@@ -1,3 +1,2 @@
-include ampligraph/latent_features/prime_number_list.txt
include ampligraph/logger.conf

48 changes: 31 additions & 17 deletions README.md
@@ -1,13 +1,13 @@
-# AmpliGraph
+# ![AmpliGraph](docs/img/ampligraph_logo_transparent_300.png)

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2595043.svg)](https://doi.org/10.5281/zenodo.2595043)

[![Documentation Status](https://readthedocs.org/projects/ampligraph/badge/?version=latest)](http://ampligraph.readthedocs.io/?badge=latest)


-**Open source Python library that predicts links between concepts in a knowledge graph.**
+**Open source library based on TensorFlow that predicts links between concepts in a knowledge graph.**

-AmpliGraph is a suite of neural machine learning models for relational Learning, a branch of machine learning
+**AmpliGraph** is a suite of neural machine learning models for relational Learning, a branch of machine learning
that deals with supervised learning on knowledge graphs.


@@ -40,10 +40,10 @@ It then combines embeddings with model-specific scoring functions to predict uns

AmpliGraph includes the following submodules:

-* **KG Loaders**: Helper functions to load datasets (knowledge graphs).
-* **Latent Feature Models**: knowledge graph embedding models. AmpliGraph contains: TransE, DistMult, ComplEx, HolE. (More to come!)
-* **Evaluation**: Metrics and evaluation protocols to assess the predictive power of the models.
+* **Datasets**: helper functions to load datasets (knowledge graphs).
+* **Models**: knowledge graph embedding models. AmpliGraph contains TransE, DistMult, ComplEx, HolE. (More to come!)
+* **Evaluation**: metrics and evaluation protocols to assess the predictive power of the models.
+* **Discovery**: High-level convenience APIs for knowledge discovery (discover new facts, cluster entities, predict near duplicates).
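
To see how these submodules fit together, here is a minimal end-to-end sketch; the hyperparameter values are illustrative only, and the API names (`load_wn18`, `ComplEx`, `evaluate_performance`, `mrr_score`) follow the 1.x documentation.

```python
# Minimal sketch of the 1.x workflow: load a benchmark graph, train an
# embedding model, and compute filtered ranks. Hyperparameters are illustrative.
import numpy as np
from ampligraph.datasets import load_wn18
from ampligraph.latent_features import ComplEx
from ampligraph.evaluation import evaluate_performance, mrr_score

X = load_wn18()                      # dict with 'train', 'valid', 'test' splits

model = ComplEx(batches_count=10, epochs=20, k=150, eta=10, seed=0, verbose=True)
model.fit(X['train'])

# Filter known triples so that corruptions which exist in the graph are not
# counted as errors (the "filtered" protocol).
filter_triples = np.concatenate([X['train'], X['valid'], X['test']])
ranks = evaluate_performance(X['test'], model=model, filter_triples=filter_triples)
print(mrr_score(ranks))
```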


## Installation
@@ -70,7 +70,7 @@ Install from pip or conda:
**CPU-only**

```
-pip install tensorflow==1.13.1
+pip install "tensorflow>=1.13.1,<2.0"
or
@@ -80,7 +80,7 @@ conda install tensorflow=1.13.1
**GPU support**

```
-pip install tensorflow-gpu==1.13.1
+pip install "tensorflow-gpu>=1.13.1,<2.0"
or
@@ -114,19 +114,32 @@ pip install -e .
```python
>> import ampligraph
>> ampligraph.__version__
-'1.0.3'
+'1.1.0'
```


## Predictive Power Evaluation (MRR Filtered)

-| |FB15k |WN18 |WN18RR |FB15K-237|YAGO3-10 |
-|----------|------|-------|-------|---------|---------|
-| TransE | 0.55 | 0.50 | 0.23 | 0.31 | 0.24 |
-| DistMult | 0.79 | 0.83 | 0.44 | 0.29 | 0.49 |
-| ComplEx | 0.79 | 0.94 | 0.44 | 0.30 | 0.50 |
-| HolE | 0.80 | 0.94 | 0.47 | 0.28 | 0.50 |
+AmpliGraph includes implementations of TransE, DistMult, ComplEx and HolE.
+Their predictive power is reported below and compared against the state-of-the-art results in the literature.
+[More details available here](https://docs.ampligraph.org/en/latest/experiments.html).
+
+| |FB15K-237 |WN18RR |YAGO3-10 | FB15k |WN18 |
+|-----------------|----------|---------|-----------|------------|---------------|
+| Literature Best | **0.35***| 0.48* | 0.49* | **0.84**** | **0.95*** |
+| TransE (AmpliGraph) | 0.31 | 0.22 | 0.49 | 0.63 | 0.65 |
+| DistMult (AmpliGraph) | 0.31 | 0.45 | 0.49 | 0.78 | 0.82 |
+| ComplEx (AmpliGraph) | 0.32 | **0.50**| **0.50** | 0.80 | 0.94 |
+| HolE (AmpliGraph) | 0.31 | 0.47 | **0.50** | 0.80 | 0.93 |
+
+
+<sub>
+* Timothee Lacroix, Nicolas Usunier, and Guillaume Obozinski. Canonical tensor decomposition for knowledge base
+completion. In International Conference on Machine Learning, 2869–2878. 2018. <br/>
+** Kadlec, Rudolf, Ondrej Bajgar, and Jan Kleindienst. "Knowledge base completion: Baselines strike back."
+arXiv preprint arXiv:1705.10744 (2017).
+
+</sub>
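
As a small, self-contained illustration of the filtered-MRR protocol behind the table above, the snippet below computes MRR and Hits@10 from a toy rank vector; in practice the ranks come from `evaluate_performance(..., filter_triples=...)`.

```python
import numpy as np
from ampligraph.evaluation import mrr_score, hits_at_n_score

# Toy rank vector standing in for the filtered ranks of four test triples.
ranks = np.array([1, 3, 12, 2])

print("MRR:     %.2f" % mrr_score(ranks))              # mean of the reciprocal ranks
print("Hits@10: %.2f" % hits_at_n_score(ranks, n=10))  # fraction of ranks <= 10
```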

## Documentation

@@ -159,7 +172,8 @@ If you instead use AmpliGraph in an academic publication, cite as:
Sumit Pai and
Chan Le Van and
Rory McGrath and
-Nicholas McCarthy},
+Nicholas McCarthy and
+Pedro Tabacof},
title = {{AmpliGraph: a Library for Representation Learning on Knowledge Graphs}},
month = mar,
year = 2019,
4 changes: 2 additions & 2 deletions ampligraph/__init__.py
@@ -9,7 +9,7 @@
import logging.config
import pkg_resources

-__version__ = '1.0.3'
-__all__ = ['datasets', 'latent_features', 'evaluation']
+__version__ = '1.1.0'
+__all__ = ['datasets', 'latent_features', 'discovery', 'evaluation', 'utils']

logging.config.fileConfig(pkg_resources.resource_filename(__name__, 'logger.conf'), disable_existing_loggers=False)
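
The two new entries in `__all__` correspond to the 1.1.0 submodules `ampligraph.discovery` and `ampligraph.utils`. Below is a hedged sketch of the latter, assuming the `save_model` / `restore_model` helpers documented for the 1.x API; hyperparameters and the file name are illustrative.

```python
from ampligraph.datasets import load_wn18
from ampligraph.latent_features import ComplEx
from ampligraph.utils import save_model, restore_model

# Train a small model, persist it, and reload it for later scoring.
X = load_wn18()
model = ComplEx(batches_count=10, epochs=5, k=50, eta=5, seed=0)
model.fit(X['train'])

save_model(model, 'complex_wn18.pkl')               # serialise the trained embeddings
restored_model = restore_model('complex_wn18.pkl')  # reload without retraining
```
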
8 changes: 7 additions & 1 deletion ampligraph/datasets/__init__.py
@@ -10,5 +10,11 @@
from .datasets import load_from_csv, load_from_rdf, load_fb15k, load_wn18, load_fb15k_237, load_from_ntriples, \
load_yago3_10, load_wn18rr

+from .abstract_dataset_adapter import AmpligraphDatasetAdapter
+from .sqlite_adapter import SQLiteAdapter
+from .numpy_adapter import NumpyDatasetAdapter


__all__ = ['load_from_csv', 'load_from_rdf', 'load_from_ntriples', 'load_wn18', 'load_fb15k',
-'load_fb15k_237', 'load_yago3_10', 'load_wn18rr']
+'load_fb15k_237', 'load_yago3_10', 'load_wn18rr',
+'AmpligraphDatasetAdapter', 'NumpyDatasetAdapter', 'SQLiteAdapter']
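
A brief usage sketch for the newly exported adapters, assuming `NumpyDatasetAdapter` implements the abstract interface defined in `abstract_dataset_adapter.py` below; the toy triples are invented for illustration.

```python
import numpy as np
from ampligraph.datasets import NumpyDatasetAdapter

# Feed an in-memory array of triples through the adapter API.
X = np.array([['a', 'likes', 'b'],
              ['b', 'likes', 'c'],
              ['a', 'knows', 'c']])

adapter = NumpyDatasetAdapter()
adapter.set_data(X, dataset_type='train')   # register the triples under the 'train' split
adapter.generate_mappings()                 # build ent_to_idx / rel_to_idx dictionaries
adapter.map_data()                          # convert string triples to integer indices
print(adapter.get_size('train'))            # 3
```
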
147 changes: 147 additions & 0 deletions ampligraph/datasets/abstract_dataset_adapter.py
@@ -0,0 +1,147 @@
import abc


class AmpligraphDatasetAdapter(abc.ABC):
    """Abstract class for dataset adapters.

    Developers can subclass it in a similar format to adapt data from different sources and feed it to AmpliGraph.
    """
    def __init__(self):
        """Initialize the class variables."""
        self.dataset = {}

        # relation to idx mappings
        self.rel_to_idx = {}
        # entity to idx mappings
        self.ent_to_idx = {}
        # mapped status of each dataset
        self.mapped_status = {}

    def use_mappings(self, rel_to_idx, ent_to_idx):
        """Use an existing mapping with the datasource."""
        self.rel_to_idx = rel_to_idx
        self.ent_to_idx = ent_to_idx
        # set the mapped status to False, since we are changing the dictionaries
        for key in self.dataset.keys():
            self.mapped_status[key] = False

    def generate_mappings(self, use_all=False):
        """Generate the mappings from the training set, or from all of the data if use_all is True.

        Parameters
        ----------
        use_all : boolean
            If True, generate the mappings from all the data. If False, use only the training set.

        Returns
        -------
        rel_to_idx : dictionary
            Relation to idx mapping dictionary
        ent_to_idx : dictionary
            Entity to idx mapping dictionary
        """
        raise NotImplementedError('Abstract Method not implemented!')

    def get_size(self, dataset_type="train"):
        """Returns the size of the specified dataset.

        Parameters
        ----------
        dataset_type : string
            Type of the dataset

        Returns
        -------
        size : int
            Size of the specified dataset
        """
        raise NotImplementedError('Abstract Method not implemented!')

    def set_data(self, dataset, dataset_type=None, mapped_status=False):
        """Set the dataset based on the type.

        Parameters
        ----------
        dataset : nd-array or dictionary
            Dataset of triples
        dataset_type : string
            If the dataset parameter is an nd-array, this indicates the type of the data being passed
        mapped_status : bool
            Indicates whether the data has already been mapped to the indices
        """
        raise NotImplementedError('Abstract Method not implemented!')

    def map_data(self, remap=False):
        """Map the data to the mappings of ent_to_idx and rel_to_idx.

        Parameters
        ----------
        remap : boolean
            Remap the data even if already mapped. One would do this if the dictionaries are updated.
        """
        raise NotImplementedError('Abstract Method not implemented!')

    def set_filter(self, filter_triples):
        """Sets the filter to be used while generating evaluation batches.

        Parameters
        ----------
        filter_triples : nd-array
            Triples to be used as the filter
        """
        raise NotImplementedError('Abstract Method not implemented!')

    def get_next_train_batch(self, batch_size=1, dataset_type="train"):
        """Generator that returns the next batch of data.

        Parameters
        ----------
        batch_size : int
            Data size that needs to be returned
        dataset_type : string
            Indicates which dataset to use

        Returns
        -------
        batch_output : nd-array
            Yields a batch of triples from the dataset type specified
        """
        raise NotImplementedError('Abstract Method not implemented!')

    def get_next_eval_batch(self, batch_size=1, dataset_type="test"):
        """Generator that returns the next batch of data.

        Parameters
        ----------
        batch_size : int
            Data size that needs to be returned
        dataset_type : string
            Indicates which dataset to use

        Returns
        -------
        batch_output : nd-array
            Yields a batch of triples from the dataset type specified
        """
        raise NotImplementedError('Abstract Method not implemented!')

    def get_next_batch_with_filter(self, batch_size=1, dataset_type="test"):
        """Generator that returns the next batch of data along with the filter.

        Parameters
        ----------
        batch_size : int
            Data size that needs to be returned
        dataset_type : string
            Indicates which dataset to use

        Returns
        -------
        batch_output : nd-array [n,3]
            Yields a batch of triples from the dataset type specified
        participating_objects : nd-array [n,1]
            All objects that were involved in the s-p-? relation
        participating_subjects : nd-array [n,1]
            All subjects that were involved in the ?-p-o relation
        """
        raise NotImplementedError('Abstract Method not implemented!')

    def cleanup(self):
        """Cleans up the internal state."""
        raise NotImplementedError('Abstract Method not implemented!')
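
To make the contract above concrete, here is a hedged sketch of a minimal in-memory adapter; the class name `InMemoryAdapter`, the toy triples, and the naive batching are invented for illustration, and only a subset of the abstract methods is overridden.

```python
import numpy as np
from ampligraph.datasets import AmpligraphDatasetAdapter


class InMemoryAdapter(AmpligraphDatasetAdapter):
    """Toy adapter keeping numpy triple arrays in memory (illustrative only)."""

    def generate_mappings(self, use_all=False):
        # Build ent_to_idx / rel_to_idx from the training split (or all splits).
        sources = self.dataset.values() if use_all else [self.dataset['train']]
        triples = np.concatenate(list(sources))
        entities = np.unique(np.concatenate([triples[:, 0], triples[:, 2]]))
        relations = np.unique(triples[:, 1])
        self.ent_to_idx = {e: i for i, e in enumerate(entities)}
        self.rel_to_idx = {r: i for i, r in enumerate(relations)}
        return self.rel_to_idx, self.ent_to_idx

    def set_data(self, dataset, dataset_type=None, mapped_status=False):
        # Register a (n, 3) array of triples under the given split name.
        self.dataset[dataset_type] = np.asarray(dataset)
        self.mapped_status[dataset_type] = mapped_status

    def get_size(self, dataset_type="train"):
        return self.dataset[dataset_type].shape[0]

    def get_next_train_batch(self, batch_size=1, dataset_type="train"):
        # Naive sequential batching over the stored triples.
        data = self.dataset[dataset_type]
        for start in range(0, data.shape[0], batch_size):
            yield data[start:start + batch_size]


# Toy usage (triples invented for illustration):
X = np.array([['a', 'likes', 'b'], ['b', 'likes', 'c'], ['a', 'knows', 'c']])
adapter = InMemoryAdapter()
adapter.set_data(X, dataset_type='train')
adapter.generate_mappings()
print(adapter.get_size('train'))   # 3
```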