From c4a5b632bb5dbbb8a93502837a987983d5985d2d Mon Sep 17 00:00:00 2001
From: MartinBoeckling075 <m_boeck@outlook.de>
Date: Thu, 1 Sep 2022 11:09:48 +0200
Subject: [PATCH 1/3] igraph-based random walker

---
 pyrdf2vec/walkers/igraph.py | 198 ++++++++++++++++++++++++++++++++++++
 1 file changed, 198 insertions(+)
 create mode 100644 pyrdf2vec/walkers/igraph.py

diff --git a/pyrdf2vec/walkers/igraph.py b/pyrdf2vec/walkers/igraph.py
new file mode 100644
index 00000000..c27a13d6
--- /dev/null
+++ b/pyrdf2vec/walkers/igraph.py
@@ -0,0 +1,198 @@
+from hashlib import md5
+from typing import List, Optional, Set
+import random
+
+import attr
+
+import numpy as np
+import pandas as pd
+
+from igraph import Graph
+
+from itertools import groupby
+
+from pyrdf2vec.graphs import KG, Vertex
+from pyrdf2vec.typings import EntityWalks, SWalk, Walk
+from pyrdf2vec.walkers import Walker
+
+@attr.s
+class RandomWalker(Walker):
+    """Random walking strategy which extracts walks from a root node using the
+    Depth First Search (DFS) algorithm if a maximum number of walks is
+    specified, otherwise the Breadth First Search (BFS) algorithm is used.
+
+    Attributes:
+        _is_support_remote: True if the walking strategy can be used with a
+            remote Knowledge Graph, False otherwise.
+            Defaults to True.
+        kg: The global KG used later on for the worker process.
+            Defaults to None.
+        max_depth: The maximum depth of one walk.
+        max_walks: The maximum number of walks per entity.
+            Defaults to None.
+        md5_bytes: The number of bytes to keep after hashing objects in
+            MD5. Hasher allows to reduce the memory occupied by a long
+            text. If md5_bytes is None, no hash is applied.
+            Defaults to 8.
+        random_state: The random state to use to keep random determinism with
+            the walking strategy.
+            Defaults to None.
+        sampler: The sampling strategy.
+            Defaults to UniformSampler.
+        with_reverse: True to extract parents and children hops from an
+            entity, creating (max_walks * max_walks) walks of 2 * depth,
+            allowing also to centralize this entity in the walks. False
+            otherwise.
+            Defaults to False.
+
+    """
+
+    md5_bytes = attr.ib(
+        kw_only=True,
+        type=Optional[int],
+        default=8,
+        repr=False,
+    )
+
+    def __init__(self, kg: KG):
+        """
+        Transform each pyRDF2Vec KG object into igraph graph
+
+        Args:
+            kg: The pyRDF2Vec Knowledge Graph to convert.
+        """
+        # extract nodes and edges from KG storing them into tuples
+        nodeTuple = tuple((vertex for vertex in kg._vertices if not vertex.predicate))
+        predicateTuple = tuple((vertex for vertex in kg._vertices if vertex.predicate))
+        # merge node and edge tuples into one
+        tupleValue = nodeTuple + predicateTuple
+        # transform tuple into graph and store into class variable
+        self.graph = Graph.TupleList(tupleValue, directed=True, edge_attrs='description')
+        print(self.graph.summary())
+
+    def predicateGeneration(self, pathList):
+        """Generate path sequence for a list of single paths based on graph object
+        by interleaving vertex names with edge descriptions.
+
+        Args:
+            pathList: Edge ids of one path, in path order (extract_walks passes
+                the 'epath' output of ``Graph.get_shortest_paths``).
+        Returns:
+            Tuple alternating vertex names and edge 'description' values.
+        """
+        graph = self.graph
+        predValues = np.array([e.attributes()['description'] for e in graph.es(pathList)])
+        nodeSequence = np.array([graph.vs().select(e.tuple).get_attribute_values('name') for e in graph.es(pathList)]).flatten()
+        nodeSequence = np.array([key for key, _group in groupby(nodeSequence)])
+        pathSequence = np.insert(predValues, np.arange(len(nodeSequence)), nodeSequence)
+        pathSequence = tuple(pathSequence)
+        return pathSequence
+
+
+    def _bfs(self, is_reverse:bool, idNumber: int):
+        """Extracts random walks for an entity based on Knowledge Graph using
+        the Depth First Search (DFS) algorithm.
+        
+        Args:
+            is_reverse: True to traverse with igraph mode 'out', False to
+                traverse with mode 'all' (edge direction is ignored).
+                Defaults to False
+            idNumber: ID number of a node within a graph
+        Returns: 
+            nodeIndex: Index of node in graph
+            dfsList: List of unique walks for the provided entitiy
+        """
+        # extract node index for vertices
+        nodeIndex = self.graph.vs.find(idNumber).index
+        # define orientation of graph
+        orient = 'out' if is_reverse else'all'
+        # perform breadth-first search extraction
+        bfsList = self.graph.bfsiter(nodeIndex, orient, advanced=True)
+        return nodeIndex, bfsList
+    
+    def _dfs(self, is_reverse:bool, idNumber):
+        """Starts a Depth First Search (DFS) traversal of the igraph graph from
+        a root vertex. ``max_walks`` must be set when this method is used.
+
+        Args:
+            is_reverse: True to traverse with igraph mode 'out', False to
+                traverse with mode 'all' (edge direction is ignored).
+                No default value is defined for this parameter.
+            idNumber: Value used to look up the root vertex in ``graph.vs``.
+        Returns:
+            nodeIndex: Index of the root vertex in the graph.
+            dfsList: The object returned by ``Graph.dfsiter`` (advanced mode).
+        """
+        assert self.max_walks is not None
+        nodeIndex = self.graph.vs.find(idNumber).index
+        orient = 'out' if is_reverse else'all'  # NOTE(review): 'in' intended for reverse? confirm
+        dfsList = self.graph.dfsiter(nodeIndex, orient, advanced=True)
+        return nodeIndex, dfsList
+    
+    def extract_walks(self, entity: Vertex):
+        """Extracts random walks for an entity based on Knowledge Graph using
+        the Depth First Search (DFS) algorithm if a maximum number of walks is
+        specified, otherwise the Breadth First Search (BFS) algorithm is used.
+
+        Args:
+            entity: The root node to extract walks.
+        Returns:
+            The list of unique walks for the provided entity.
+        """
+        fct_search = self._bfs if self.max_walks is None else self._dfs
+        nodeIndex, fctList = fct_search(self.with_reverse, entity)
+        # advanced iterators yield (vertex, depth, parent); keep depth <= max_depth
+        vertexList = [path[0] for path in fctList if path[1] <= self.max_depth]
+        # without a walk limit (BFS case) every vertex in range is a target
+        maxWalks = len(vertexList)
+        if self.max_walks is not None:
+            maxWalks = min(maxWalks, self.max_walks)
+        # local RNG with the fixed seed: same sample, global RNG left untouched
+        vertexList = random.Random(15).sample(vertexList, maxWalks)
+        shortestPathList = self.graph.get_shortest_paths(v=nodeIndex, to=vertexList, output='epath')
+        return list(map(self.predicateGeneration, shortestPathList))
+    
+
+    def _map_vertex(self, entity: Vertex, pos: int) -> str:
+        """Returns the token that represents a vertex inside a canonical walk.
+
+        The name is kept unchanged for entities of interest (provided by the
+        user to the extract function), for odd walk positions (predicates) and
+        when ``md5_bytes`` is None. Otherwise a truncated MD5 digest is used.
+
+        Args:
+            entity: The entity to be mapped.
+            pos: The position of the entity in the walk.
+
+        Returns:
+            The original name, or the string form of the first ``md5_bytes``
+            bytes of the MD5 digest of the name.
+
+        """
+        name = entity.name
+        keeps_name = name in self._entities or pos % 2 == 1
+        if not (keeps_name or self.md5_bytes is None):
+            digest = md5(name.encode()).digest()
+            return str(digest[: self.md5_bytes])
+        return name
+
+
+    def _extract(self, kg: KG, entity: Vertex) -> EntityWalks:
+        """Extracts random walks for an entity based on a Knowledge Graph.
+
+        Args:
+            kg: The Knowledge Graph. Not used here: walks are read from the
+                igraph graph held by the walker.
+            entity: The root node to extract walks.
+
+        Returns:
+            A dictionary having the entity as key and a list of tuples as value
+            corresponding to the extracted walks.
+        """
+        # extract_walks() only accepts the entity; also passing kg raised a
+        # TypeError before any walk was produced.
+        canonical_walks: Set[SWalk] = {
+            tuple(self._map_vertex(vertex, i) for i, vertex in enumerate(walk))
+            for walk in self.extract_walks(entity)
+        }
+        return {entity.name: list(canonical_walks)}
\ No newline at end of file

From 63442d8eada692e1577bc1527100a5b84f9af7ad Mon Sep 17 00:00:00 2001
From: MartinBoeckling075 <m_boeck@outlook.de>
Date: Thu, 1 Sep 2022 14:30:03 +0200
Subject: [PATCH 2/3] Initial Documentation

---
 docs/api/pyrdf2vec.walkers.igraph.rst | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 docs/api/pyrdf2vec.walkers.igraph.rst

diff --git a/docs/api/pyrdf2vec.walkers.igraph.rst b/docs/api/pyrdf2vec.walkers.igraph.rst
new file mode 100644
index 00000000..23330a17
--- /dev/null
+++ b/docs/api/pyrdf2vec.walkers.igraph.rst
@@ -0,0 +1,6 @@
+pyrdf2vec.walkers.igraph module
+===============================
+.. automodule:: pyrdf2vec.walkers.igraph
+    :members:
+    :undoc-members:
+    :show-inheritance:
\ No newline at end of file

From abfe43fcd1a45084f39171a59be832d8fdcb4620 Mon Sep 17 00:00:00 2001
From: MartinBoeckling075 <m_boeck@outlook.de>
Date: Thu, 1 Sep 2022 14:39:40 +0200
Subject: [PATCH 3/3] Updated Igraph module

---
 pyrdf2vec/walkers/igraph.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pyrdf2vec/walkers/igraph.py b/pyrdf2vec/walkers/igraph.py
index c27a13d6..2f0ef0c9 100644
--- a/pyrdf2vec/walkers/igraph.py
+++ b/pyrdf2vec/walkers/igraph.py
@@ -54,7 +54,7 @@ class RandomWalker(Walker):
         repr=False,
     )
 
-    def __init__(self, kg: KG):
+    def transformKG(self, kg: KG):
         """
         Transform each pyRDF2Vec KG object into igraph graph
 
@@ -68,7 +68,6 @@ def __init__(self, kg: KG):
         tupleValue = nodeTuple + predicateTuple
         # transform tuple into graph and store into class variable
         self.graph = Graph.TupleList(tupleValue, directed=True, edge_attrs='description')
-        print(self.graph.summary())
 
     def predicateGeneration(self, pathList):
         """Generate path sequence for a list of single paths based on graph object
@@ -89,7 +88,7 @@ def predicateGeneration(self, pathList):
         return pathSequence
 
 
-    def _bfs(self, is_reverse:bool, idNumber: int):
+    def _bfs(self, kg: KG , idNumber: int, is_reverse:bool = False):
         """Extracts random walks for an entity based on Knowledge Graph using
         the Depth First Search (DFS) algorithm.
         
@@ -102,8 +101,9 @@ def _bfs(self, is_reverse:bool, idNumber: int):
             nodeIndex: Index of node in graph
             dfsList: List of unique walks for the provided entitiy
         """
+        graph = self.transformKG()
         # extract node index for vertices
-        nodeIndex = self.graph.vs.find(idNumber).index
+        nodeIndex = graph.vs.find(idNumber).index
         # define orientation of graph
         orient = 'out' if is_reverse else'all'
         # perform breadth-first search extraction
@@ -129,7 +129,7 @@ def _dfs(self, is_reverse:bool, idNumber):
         dfsList = self.graph.dfsiter(nodeIndex, orient, advanced=True)
         return nodeIndex, dfsList
     
-    def extract_walks(self, entity: Vertex):
+    def extract_walks(self, entity: Vertex) -> List[Walk]:
         """Extracts random walks for an entity based on Knowledge Graph using
         the Depth First Search (DFS) algorithm if a maximum number of walks is
         specified, otherwise the Breadth First Search (BFS) algorithm is used.