From c4a5b632bb5dbbb8a93502837a987983d5985d2d Mon Sep 17 00:00:00 2001 From: MartinBoeckling075 Date: Thu, 1 Sep 2022 11:09:48 +0200 Subject: [PATCH 1/3] igraph-based random walker --- pyrdf2vec/walkers/igraph.py | 198 ++++++++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 pyrdf2vec/walkers/igraph.py diff --git a/pyrdf2vec/walkers/igraph.py b/pyrdf2vec/walkers/igraph.py new file mode 100644 index 00000000..c27a13d6 --- /dev/null +++ b/pyrdf2vec/walkers/igraph.py @@ -0,0 +1,198 @@ +from hashlib import md5 +from typing import List, Optional, Set +import random + +import attr + +import numpy as np +import pandas as pd + +from igraph import Graph + +from itertools import groupby + +from pyrdf2vec.graphs import KG, Vertex +from pyrdf2vec.typings import EntityWalks, SWalk, Walk +from pyrdf2vec.walkers import Walker + +@attr.s +class RandomWalker(Walker): + """Random walking strategy which extracts walks from a root node using the + Depth First Search (DFS) algorithm if a maximum number of walks is + specified, otherwise the Breadth First Search (BFS) algorithm is used. + + Attributes: + _is_support_remote: True if the walking strategy can be used with a + remote Knowledge Graph, False Otherwise + Defaults to True. + kg: The global KG used later on for the worker process. + Defaults to None. + max_depth: The maximum depth of one walk. + max_walks: The maximum number of walks per entity. + Defaults to None. + md5_bytes: The number of bytes to keep after hashing objects in + MD5. Hasher allows to reduce the memory occupied by a long + text. If md5_bytes is None, no hash is applied. + Defaults to 8. + random_state: The random state to use to keep random determinism with + the walking strategy. + Defaults to None. + sampler: The sampling strategy. + Defaults to UniformSampler. + with_reverse: True to extracts parents and children hops from an + entity, creating (max_walks * max_walks) walks of 2 * depth, + allowing also to centralize this entity in the walks. False + otherwise. + Defaults to False. + + """ + + md5_bytes = attr.ib( + kw_only=True, + type=Optional[int], + default=8, + repr=False, + ) + + def __init__(self, kg: KG): + """ + Transform each pyRDF2Vec KG object into igraph graph + + Args: + kg: Knowledge Graph of PYRDF2Vec object + """ + # extract nodes and edges from KG storing them into tuples + nodeTuple = tuple((vertex for vertex in kg._vertices if not vertex.predicate)) + predicateTuple = tuple((vertex for vertex in kg._vertices if vertex.predicate)) + # merge node and edge tuples into one + tupleValue = nodeTuple + predicateTuple + # transform tuple into graph and store into class variable + self.graph = Graph.TupleList(tupleValue, directed=True, edge_attrs='description') + print(self.graph.summary()) + + def predicateGeneration(self, pathList): + """Generate path sequence for a list of single paths based on graph object + using shortest path algorithm + + Args: + pathList: List of paths for one vertex ID + + Returns: + List of path sequences in a tuple + """ + graph = self.graph + predValues = np.array([e.attributes()['description'] for e in graph.es(pathList)]) + nodeSequence = np.array([graph.vs().select(e.tuple).get_attribute_values('name') for e in graph.es(pathList)]).flatten() + nodeSequence = np.array([key for key, _group in groupby(nodeSequence)]) + pathSequence = np.insert(predValues, np.arange(len(nodeSequence)), nodeSequence) + pathSequence = tuple(pathSequence) + return pathSequence + + + def _bfs(self, is_reverse:bool, idNumber: int): + """Extracts random walks for an entity based on Knowledge Graph using + the Depth First Search (DFS) algorithm. + + Args: + is_reverse: True to get the parent neighbors instead of the child + neighbors, False otherwise. + Defaults to False + idNumber: ID number of a node within a graph + Returns: + nodeIndex: Index of node in graph + dfsList: List of unique walks for the provided entitiy + """ + # extract node index for vertices + nodeIndex = self.graph.vs.find(idNumber).index + # define orientation of graph + orient = 'out' if is_reverse else'all' + # perform breadth-first search extraction + bfsList = self.graph.bfsiter(nodeIndex, orient, advanced=True) + return nodeIndex, bfsList + + def _dfs(self, is_reverse:bool, idNumber): + """Extracts random walks for an entity based on Knowledge Graph using + the Depth First Search (DFS) algorithm. + + Args: + is_reverse: True to get the parent neighbors instead of the child + neighbors, False otherwise. + Defaults to False + idNumber: ID number of a node within a graph + Returns: + nodeIndex: Index of node in graph + dfsList: List of unique walks for the provided entitiy + """ + assert self.max_walks is not None + nodeIndex = self.graph.vs.find(idNumber).index + orient = 'out' if is_reverse else'all' + dfsList = self.graph.dfsiter(nodeIndex, orient, advanced=True) + return nodeIndex, dfsList + + def extract_walks(self, entity: Vertex): + """Extracts random walks for an entity based on Knowledge Graph using + the Depth First Search (DFS) algorithm if a maximum number of walks is + specified, otherwise the Breadth First Search (BFS) algorithm is used. + + Args: + entity: The root node to extract walks. + Returns: + The list of unique walks for the provided entity. + """ + fct_search = self._bfs if self.max_walks is None else self._dfs + nodeIndex, fctList = fct_search(self.with_reverse, entity) + distanceList = tuple((nodePath for nodePath in fctList if nodePath[1] <= self.distance)) + vertexList = tuple((vertexElement[0] for vertexElement in distanceList)) + # limit maximum walks to maximum length of walkSequence length + maxWalks = len(vertexList) if len(vertexList) < self.max_walks else self.max_walks + # random sample defined maximumWalk from vertexList list + random.seed(15) + vertexList = random.sample(vertexList, maxWalks) + shortestPathList = self.graph.get_shortest_paths(v=nodeIndex, to=vertexList, output='epath') + pathSequence = list(map(self.predicateGeneration, shortestPathList)) + return pathSequence + + + def _map_vertex(self, entity: Vertex, pos: int) -> str: + """Maps certain vertices to MD5 hashes to save memory. For entities of + interest (provided by the user to the extract function) and predicates, + the string representation is kept. + + Args: + entity: The entity to be mapped. + pos: The position of the entity in the walk. + + Returns: + A hash (string) or original string representation. + + """ + if ( + entity.name in self._entities + or pos % 2 == 1 + or self.md5_bytes is None + ): + return entity.name + else: + ent_hash = md5(entity.name.encode()).digest() + return str(ent_hash[: self.md5_bytes]) + + + def _extract(self, kg: KG, entity: Vertex) -> EntityWalks: + """Extracts random walks for an entity based on a Knowledge Graph. + + Args: + kg: The Knowledge Graph. + entity: The root node to extract walks. + + Returns: + A dictionary having the entity as key and a list of tuples as value + corresponding to the extracted walks. + + """ + canonical_walks: Set[SWalk] = set() + for walk in self.extract_walks(kg, entity): + canonical_walk: List[str] = [ + self._map_vertex(vertex, i) for i, vertex in enumerate(walk) + ] + canonical_walks.add(tuple(canonical_walk)) + return {entity.name: list(canonical_walks)} \ No newline at end of file From 63442d8eada692e1577bc1527100a5b84f9af7ad Mon Sep 17 00:00:00 2001 From: MartinBoeckling075 Date: Thu, 1 Sep 2022 14:30:03 +0200 Subject: [PATCH 2/3] Initial Documentation --- docs/api/pyrdf2vec.walkers.igraph.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/api/pyrdf2vec.walkers.igraph.rst diff --git a/docs/api/pyrdf2vec.walkers.igraph.rst b/docs/api/pyrdf2vec.walkers.igraph.rst new file mode 100644 index 00000000..23330a17 --- /dev/null +++ b/docs/api/pyrdf2vec.walkers.igraph.rst @@ -0,0 +1,6 @@ +pyrdf2vec.walkers.igraph module +=============================== +.. automodule:: pyrdf2vec.walkers.igraph + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file From abfe43fcd1a45084f39171a59be832d8fdcb4620 Mon Sep 17 00:00:00 2001 From: MartinBoeckling075 Date: Thu, 1 Sep 2022 14:39:40 +0200 Subject: [PATCH 3/3] Updated Igraph module --- pyrdf2vec/walkers/igraph.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyrdf2vec/walkers/igraph.py b/pyrdf2vec/walkers/igraph.py index c27a13d6..2f0ef0c9 100644 --- a/pyrdf2vec/walkers/igraph.py +++ b/pyrdf2vec/walkers/igraph.py @@ -54,7 +54,7 @@ class RandomWalker(Walker): repr=False, ) - def __init__(self, kg: KG): + def transformKG(self, kg: KG): """ Transform each pyRDF2Vec KG object into igraph graph @@ -68,7 +68,6 @@ def __init__(self, kg: KG): tupleValue = nodeTuple + predicateTuple # transform tuple into graph and store into class variable self.graph = Graph.TupleList(tupleValue, directed=True, edge_attrs='description') - print(self.graph.summary()) def predicateGeneration(self, pathList): """Generate path sequence for a list of single paths based on graph object @@ -89,7 +88,7 @@ def predicateGeneration(self, pathList): return pathSequence - def _bfs(self, is_reverse:bool, idNumber: int): + def _bfs(self, kg: KG , idNumber: int, is_reverse:bool = False): """Extracts random walks for an entity based on Knowledge Graph using the Depth First Search (DFS) algorithm. @@ -102,8 +101,9 @@ def _bfs(self, is_reverse:bool, idNumber: int): nodeIndex: Index of node in graph dfsList: List of unique walks for the provided entitiy """ + graph = self.transformKG() # extract node index for vertices - nodeIndex = self.graph.vs.find(idNumber).index + nodeIndex = graph.vs.find(idNumber).index # define orientation of graph orient = 'out' if is_reverse else'all' # perform breadth-first search extraction @@ -129,7 +129,7 @@ def _dfs(self, is_reverse:bool, idNumber): dfsList = self.graph.dfsiter(nodeIndex, orient, advanced=True) return nodeIndex, dfsList - def extract_walks(self, entity: Vertex): + def extract_walks(self, entity: Vertex) -> List[Walk]: """Extracts random walks for an entity based on Knowledge Graph using the Depth First Search (DFS) algorithm if a maximum number of walks is specified, otherwise the Breadth First Search (BFS) algorithm is used.