From 5c4ed2261440588667b464abdaebbd3792eb7d61 Mon Sep 17 00:00:00 2001 From: max-laptop Date: Tue, 2 Jul 2024 11:23:02 +0200 Subject: [PATCH 1/6] added new mode for datastructures.Graph.rdflib_to_graph(..., relation_tail_merging: bool) The relation_tail_merging mode applies the relation-tail merging method, as introduced in our new paper. --- mindwalc/datastructures.py | 129 ++++++++++++++++++++++++++++++------- 1 file changed, 106 insertions(+), 23 deletions(-) diff --git a/mindwalc/datastructures.py b/mindwalc/datastructures.py index a69f0fb..7eb8a49 100644 --- a/mindwalc/datastructures.py +++ b/mindwalc/datastructures.py @@ -22,9 +22,10 @@ class Vertex(object): - def __init__(self, name, predicate=False, _from=None, _to=None): + def __init__(self, name, predicate=False, _from=None, _to=None, relation_modified=False): self.name = name self.predicate = predicate + self.relation_modified = relation_modified self._from = _from self._to = _to @@ -115,39 +116,64 @@ def extract_neighborhood(self, instance, depth=8): for neighbor in self.get_neighbors(v): new_explore.add(neighbor) to_explore = new_explore - + return neighborhood @staticmethod - def rdflib_to_graph(rdflib_g, label_predicates=[]): + def rdflib_to_graph(rdflib_g, label_predicates=[], relation_tail_merging=True): + ''' + Converts an rdflib graph to a Graph object. + During the conversion, a multi-relation graph (head)-[relation]->(tail) (aka subject, predicate, object)is converted to a non-relational graph. + e.g. converting it to (head)-->(relation)-->(tail), or, if apply_relation_tail_merging is True, to (head)-->(relation_tail). + + :param rdflib_g: An rdflib graph, e.g. loaded with rdflib.Graph().parse('file.n3') + :param label_predicates: a list of predicates that are used as labels, and should not be converted to edges? 
+ :param relation_tail_merging: If true, relation-tail-merging is applioed, as described in the paper + "Investigating and Optimizing MINDWALC Node Classification to Extract Interpretable DTs from KGs": + The process of relation-tail merging works as follows: First, a specific tail node is + selected, t, as well as a set of nr relations of identical type, r, where the topological + form (*)-r->(t) is given. The process of relation-tail merging then involves inserting + a new node, rt, so that (*)-r->(t) turns into (*)-->(rt)-->(t). The new directional + edges, -->, are now typeless, and the new inserted node, rt, represents a relationmodified node and is + named accordingly in the form _. + :return: A Graph object of type datastructures::Graph + ''' + kg = Graph() - for (s, p, o) in rdflib_g: + for (s, p, o) in rdflib_g: if p not in label_predicates: + + # Literals are attribute values in RDF, for instance, a person’s name, the date of birth, height, etc. + if isinstance(s, rdflib.term.Literal) and not str(s): + s = "EmptyLiteral" + if isinstance(p, rdflib.term.Literal) and not str(p): + p = "EmptyLiteral" + if isinstance(o, rdflib.term.Literal) and not str(o): + o = "EmptyLiteral" + s = str(s) p = str(p) o = str(o) - if isinstance(s, rdflib.term.BNode): - s_v = Vertex(str(s), wildcard=True) - elif isinstance(s, rdflib.term.Literal): - s_v = Vertex(str(s), literal=True) - else: - s_v = Vertex(str(s)) - - if isinstance(o, rdflib.term.BNode): - o_v = Vertex(str(o), wildcard=True) - elif isinstance(s, rdflib.term.Literal): - o_v = Vertex(str(o), literal=True) + s_v = Vertex(s) + + if relation_tail_merging: + o_v_relation_mod = Vertex(f"{p}_MODIFIED_{o}", relation_modified=True) + o_v = Vertex(o) + kg.add_vertex(s_v) + kg.add_vertex(o_v_relation_mod) + kg.add_vertex(o_v) + kg.add_edge(s_v, o_v_relation_mod) + kg.add_edge(o_v_relation_mod, o_v) else: - o_v = Vertex(str(o)) - - p_v = Vertex(str(p), predicate=True, _from=s_v, _to=o_v) - kg.add_vertex(s_v) - 
kg.add_vertex(p_v) - kg.add_vertex(o_v) - kg.add_edge(s_v, p_v) - kg.add_edge(p_v, o_v) + o_v = Vertex(o) + p_v = Vertex(p, predicate=True, _from=s_v, _to=o_v) + kg.add_vertex(s_v) + kg.add_vertex(p_v) + kg.add_vertex(o_v) + kg.add_edge(s_v, p_v) + kg.add_edge(p_v, o_v) return kg @@ -326,3 +352,60 @@ def _convert_node_to_dot(self, node_vis_props): s += 'Node' + str(num) + ' -> ' + 'Node' + str(num + amount_subnodes_left + 1) + ' [label="true"];\n' return s + +if __name__ == "__main__": + from tree_builder import MINDWALCTree, MINDWALCForest, MINDWALCTransform + import pandas as pd + from sklearn.metrics import accuracy_score, confusion_matrix + + # load graph: + rdf_file = 'data/AIFB/aifb.n3' + _format = 'n3' + label_predicates = [ # these predicates will be deleted, otherwise clf task would get to easy? + rdflib.URIRef('http://swrc.ontoware.org/ontology#affiliation'), + rdflib.URIRef('http://swrc.ontoware.org/ontology#employs'), + rdflib.URIRef('http://swrc.ontoware.org/ontology#carriedOutBy') + ] + g = rdflib.Graph() + g.parse(rdf_file, format=_format) + + # load train data: + train_file = 'data/AIFB/AIFB_test.tsv' + test_file = 'data/AIFB/AIFB_train.tsv' + entity_col = 'person' + label_col = 'label_affiliation' + test_data = pd.read_csv(train_file, sep='\t') + train_data = pd.read_csv(test_file, sep='\t') + + train_entities = [rdflib.URIRef(x) for x in train_data[entity_col]] + train_labels = train_data[label_col] + + test_entities = [rdflib.URIRef(x) for x in test_data[entity_col]] + test_labels = test_data[label_col] + + + # convert to non relational graphs using relation-to-node convertion: + kg = Graph.rdflib_to_graph(g, label_predicates=label_predicates, relation_tail_merging=False) + verts_a = len(kg.vertices) + print(f"generated graph using relation-to-node-convertion has {str(float(verts_a)/1000).replace('.', ',')} vertices") + clf = MINDWALCTree(path_max_depth=8, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) + clf.fit(kg, train_entities, 
train_labels) + preds = clf.predict(kg, test_entities) + print(f"accuracy: {accuracy_score(test_labels, preds)}") + + print() + + # convert to non relational graphs using relation-tail-merging: + kg = Graph.rdflib_to_graph(g, label_predicates=label_predicates, relation_tail_merging=True) + verts_b = len(kg.vertices) + print( + f"generated graph using relation_tail_merging has {str(float(verts_b)/1000).replace('.', ',')} vertices") + clf = MINDWALCTree(path_max_depth=8, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) + clf.fit(kg, train_entities, train_labels) + preds = clf.predict(kg, test_entities) + print(f"accuracy: {accuracy_score(test_labels, preds)}") + + print(f"\nrelation_tail_merging reduced the number of vertices by {verts_a - verts_b} ({round((verts_a - verts_b)/verts_a *100, 0)} %)") + + + From 7af83926f16ae1bb585fdb388e64e53aadbe4599 Mon Sep 17 00:00:00 2001 From: max-laptop Date: Tue, 2 Jul 2024 11:39:53 +0200 Subject: [PATCH 2/6] added new function "graph_to_neo4j()" to class datastructures.Graph (is very useful for exploring and debugging our processed graph) --- mindwalc/datastructures.py | 102 ++++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 2 deletions(-) diff --git a/mindwalc/datastructures.py b/mindwalc/datastructures.py index 7eb8a49..bccacb9 100644 --- a/mindwalc/datastructures.py +++ b/mindwalc/datastructures.py @@ -176,6 +176,102 @@ def rdflib_to_graph(rdflib_g, label_predicates=[], relation_tail_merging=True): kg.add_edge(p_v, o_v) return kg + def graph_to_neo4j(self, uri='bolt://localhost', user='neo4j', password='password'): + ''' + Converts the graph to a neo4j database. Needs an empty running neo4j db. 
+ :param uri: address where neo4j db is running + :param user: username of neo4j db + :param password: password of neo4j db + :return: None + ''' + + try: + from neo4j import GraphDatabase + except ImportError: + raise ImportError("Please install the neo4j-driver package to use this function.") + from tqdm import tqdm + + use_nodes_for_predicates = True # if false, the predicates are used as edges. Otherwise as nodes. + relation_name = 'R' + + driver = GraphDatabase.driver(uri, auth=(user, password)) + with driver.session() as session: + # check if db is empty: + node_count = session.run("MATCH (n) return count(n)").single().value() + if node_count > 0: + print("Neo4j database is not empty, aborting graph to neo4h db convertion to avoid data loss.") + return + + for v in self.vertices: + if not v.predicate: + # name = v.name.split('/')[-1] + name = v.name.replace("'", "") + session.run(f"CREATE (a:Node" + (":RelationModified" if v.relation_modified else "") + + " {name: '" + name + "'})") # .split(' ')[0] + '_' + vertex.__hash__() + + for v in tqdm(self.vertices): + if not v.predicate: + # v_name = v.name.split('/')[-1] + v_name = v.name.replace("'", "") + + node_type = "Node" + (":RelationModified" if v.relation_modified else "") + + ids_v = [r["id(v)"] for r in + session.run( + "MATCH (v:" + node_type + " {name: '" + v_name + "'}) where not (v:Predicate) RETURN id(v)")] + if len(ids_v) == 0: + raise Exception(f"no id found for {v_name}") + elif len(ids_v) == 1: + id_v = ids_v[0] + else: + raise Exception(f"multiple ids found for {v_name}: {ids_v}") + + for pred in self.get_neighbors(v): + + if pred.predicate: + pred_name = "".join( + [c for c in pred.name.split('/')[-1].replace("#", "_").replace('-', '_') if + not c.isdigit()]) + pred_name = pred_name[1:] if pred_name[0] in ["_", "-"] else pred_name + + for obj in self.get_neighbors(pred): + # obj_name = obj.name.split('/')[-1] + obj_name = obj.name.replace("'", "") + + ids_obj = [r["id(obj)"] for r in + 
session.run( + "MATCH (obj:Node {name: '" + obj_name + "'}) where not (obj:Predicate) RETURN id(obj)")] + if len(ids_obj) == 0: + raise Exception(f"no id found for {obj_name}") + elif len(ids_obj) == 1: + id_obj = ids_obj[0] + else: + raise Exception(f"multiple ids found for {obj_name}: {ids_obj}") + + if use_nodes_for_predicates: + q = (f"MATCH (a), (b) WHERE ID(a)={id_v} AND ID(b)={id_obj} " + "MERGE (a)-[:") + relation_name + "]->(c:Predicate {name: '" + pred_name + "'})-[:" + relation_name + "]->(b)" + else: + q = f"MATCH (a), (b) WHERE ID(a)={id_v} AND ID(b)={id_obj} MERGE (a)-[:" + pred_name + "]->(b)" + session.run(q) + + else: + obj_name = pred.name.replace("'", "") + + ids_obj = [r["id(obj)"] for r in + session.run( + "MATCH (obj:Node {name: '" + obj_name + "'}) RETURN id(obj)")] + if len(ids_obj) == 0: + raise Exception(f"no id found for {obj_name}") + elif len(ids_obj) == 1: + id_obj = ids_obj[0] + else: + raise Exception(f"multiple ids found for {obj_name}: {ids_obj}") + + q = f"MATCH (a), (b) WHERE ID(a)={id_v} AND ID(b)={id_obj} MERGE (a)-[:" + relation_name + "]->(b)" + session.run(q) + + driver.close() class Neighborhood(object): def __init__(self): @@ -357,6 +453,7 @@ def _convert_node_to_dot(self, node_vis_props): from tree_builder import MINDWALCTree, MINDWALCForest, MINDWALCTransform import pandas as pd from sklearn.metrics import accuracy_score, confusion_matrix + import sys # load graph: rdf_file = 'data/AIFB/aifb.n3' @@ -386,9 +483,10 @@ def _convert_node_to_dot(self, node_vis_props): # convert to non relational graphs using relation-to-node convertion: kg = Graph.rdflib_to_graph(g, label_predicates=label_predicates, relation_tail_merging=False) + #kg.graph_to_neo4j(password=sys.argv[1]) verts_a = len(kg.vertices) print(f"generated graph using relation-to-node-convertion has {str(float(verts_a)/1000).replace('.', ',')} vertices") - clf = MINDWALCTree(path_max_depth=8, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) + clf = 
MINDWALCTree(path_max_depth=6, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) clf.fit(kg, train_entities, train_labels) preds = clf.predict(kg, test_entities) print(f"accuracy: {accuracy_score(test_labels, preds)}") @@ -400,7 +498,7 @@ def _convert_node_to_dot(self, node_vis_props): verts_b = len(kg.vertices) print( f"generated graph using relation_tail_merging has {str(float(verts_b)/1000).replace('.', ',')} vertices") - clf = MINDWALCTree(path_max_depth=8, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) + clf = MINDWALCTree(path_max_depth=6, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) clf.fit(kg, train_entities, train_labels) preds = clf.predict(kg, test_entities) print(f"accuracy: {accuracy_score(test_labels, preds)}") From 02edb63dd2b2e061ea5540a0e2376ff3f4b3788f Mon Sep 17 00:00:00 2001 From: max-laptop Date: Tue, 2 Jul 2024 12:43:44 +0200 Subject: [PATCH 3/6] changed some default values --- mindwalc/datastructures.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/mindwalc/datastructures.py b/mindwalc/datastructures.py index bccacb9..1d69edb 100644 --- a/mindwalc/datastructures.py +++ b/mindwalc/datastructures.py @@ -120,7 +120,7 @@ def extract_neighborhood(self, instance, depth=8): return neighborhood @staticmethod - def rdflib_to_graph(rdflib_g, label_predicates=[], relation_tail_merging=True): + def rdflib_to_graph(rdflib_g, label_predicates=[], relation_tail_merging=False): ''' Converts an rdflib graph to a Graph object. During the conversion, a multi-relation graph (head)-[relation]->(tail) (aka subject, predicate, object)is converted to a non-relational graph. 
@@ -485,8 +485,12 @@ def _convert_node_to_dot(self, node_vis_props): kg = Graph.rdflib_to_graph(g, label_predicates=label_predicates, relation_tail_merging=False) #kg.graph_to_neo4j(password=sys.argv[1]) verts_a = len(kg.vertices) - print(f"generated graph using relation-to-node-convertion has {str(float(verts_a)/1000).replace('.', ',')} vertices") - clf = MINDWALCTree(path_max_depth=6, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) + rels_a = len(kg.transition_matrix) + + print(f"generated graph using relation-to-node-convertion has " + f"{str(float(verts_a)/1000).replace('.', ',')} vertices") + print(f"and {str(float(rels_a)/1000).replace('.', ',')} relations") + clf = MINDWALCTree(path_max_depth=8, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) clf.fit(kg, train_entities, train_labels) preds = clf.predict(kg, test_entities) print(f"accuracy: {accuracy_score(test_labels, preds)}") @@ -496,14 +500,17 @@ def _convert_node_to_dot(self, node_vis_props): # convert to non relational graphs using relation-tail-merging: kg = Graph.rdflib_to_graph(g, label_predicates=label_predicates, relation_tail_merging=True) verts_b = len(kg.vertices) - print( - f"generated graph using relation_tail_merging has {str(float(verts_b)/1000).replace('.', ',')} vertices") - clf = MINDWALCTree(path_max_depth=6, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) + rels_b = len(kg.transition_matrix) + print(f"generated graph using relation_tail_merging has " + f"{str(float(verts_b)/1000).replace('.', ',')} vertices") + print(f"and {str(float(rels_b)/1000).replace('.', ',')} relations") + clf = MINDWALCTree(path_max_depth=8, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) clf.fit(kg, train_entities, train_labels) preds = clf.predict(kg, test_entities) print(f"accuracy: {accuracy_score(test_labels, preds)}") - print(f"\nrelation_tail_merging reduced the number of vertices by {verts_a - verts_b} ({round((verts_a - verts_b)/verts_a *100, 0)} %)") + print(f"\nrelation-tail merging 
reduced the number of vertices by {verts_a - verts_b} ({round((verts_a - verts_b)/verts_a *100, 2)} %)") + print(f"relation-tail merging reduced the number of relations by {rels_a - rels_b} ({round((rels_a - rels_b)/rels_a *100, 2)} %)") From 3e7c8d6516e63c2791ecdc1cbb6f05ff9f88cd92 Mon Sep 17 00:00:00 2001 From: max-laptop Date: Tue, 2 Jul 2024 13:11:42 +0200 Subject: [PATCH 4/6] added new mode to function datastructures.Graph.rdflib_to_graph(..., skip_literals: bool). If True, literals (=node properties/attributes) are skipped during the conversion. Otherwise, they are converted to nodes, so that a node (n: {name: 'John'}) becomes (n)-->(name)-->(john). --- mindwalc/datastructures.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mindwalc/datastructures.py b/mindwalc/datastructures.py index 1d69edb..5bc6d7e 100644 --- a/mindwalc/datastructures.py +++ b/mindwalc/datastructures.py @@ -120,7 +120,7 @@ def extract_neighborhood(self, instance, depth=8): return neighborhood @staticmethod - def rdflib_to_graph(rdflib_g, label_predicates=[], relation_tail_merging=False): + def rdflib_to_graph(rdflib_g, label_predicates=[], relation_tail_merging=False, skip_literals=False): ''' Converts an rdflib graph to a Graph object. During the conversion, a multi-relation graph (head)-[relation]->(tail) (aka subject, predicate, object)is converted to a non-relational graph. @@ -136,6 +136,8 @@ def rdflib_to_graph(rdflib_g, label_predicates=[], relation_tail_merging=False): a new node, rt, so that (*)-r->(t) turns into (*)-->(rt)-->(t). The new directional edges, -->, are now typeless, and the new inserted node, rt, represents a relationmodified node and is named accordingly in the form _. + :param skip_literals: If True, literals (=node properties/attributes) are skipped during the conversion. + Otherwise, they are converted to nodes. so that a node (n: {name: 'John'}) becomes (n)-->(name)-->(john). 
:return: A Graph object of type datastructures::Graph ''' @@ -144,6 +146,9 @@ def rdflib_to_graph(rdflib_g, label_predicates=[], relation_tail_merging=False): for (s, p, o) in rdflib_g: if p not in label_predicates: + if skip_literals and isinstance(o, rdflib.term.Literal): + continue + # Literals are attribute values in RDF, for instance, a person’s name, the date of birth, height, etc. if isinstance(s, rdflib.term.Literal) and not str(s): s = "EmptyLiteral" @@ -465,6 +470,7 @@ def _convert_node_to_dot(self, node_vis_props): ] g = rdflib.Graph() g.parse(rdf_file, format=_format) + skip_literals = True # load train data: train_file = 'data/AIFB/AIFB_test.tsv' @@ -482,7 +488,8 @@ def _convert_node_to_dot(self, node_vis_props): # convert to non relational graphs using relation-to-node convertion: - kg = Graph.rdflib_to_graph(g, label_predicates=label_predicates, relation_tail_merging=False) + kg = Graph.rdflib_to_graph(g, label_predicates=label_predicates, relation_tail_merging=False, + skip_literals=skip_literals) #kg.graph_to_neo4j(password=sys.argv[1]) verts_a = len(kg.vertices) rels_a = len(kg.transition_matrix) @@ -498,7 +505,8 @@ def _convert_node_to_dot(self, node_vis_props): print() # convert to non relational graphs using relation-tail-merging: - kg = Graph.rdflib_to_graph(g, label_predicates=label_predicates, relation_tail_merging=True) + kg = Graph.rdflib_to_graph(g, label_predicates=label_predicates, relation_tail_merging=True, + skip_literals=skip_literals) verts_b = len(kg.vertices) rels_b = len(kg.transition_matrix) print(f"generated graph using relation_tail_merging has " From ab7b5254455bee6e44a204a5b2144cf29931bc06 Mon Sep 17 00:00:00 2001 From: max-laptop Date: Wed, 3 Jul 2024 12:04:04 +0200 Subject: [PATCH 5/6] Fixed an issue in tree_builder.py: Function MINDWALCMixin._generate_candidates(): during walk candidate collection, only even walking depths (2, 4, 6...) were collected. 
But if we process a relation-tail merged graph, we also need to consider odd walking depths! --- mindwalc/datastructures.py | 9 +++++++-- mindwalc/tree_builder.py | 5 +++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/mindwalc/datastructures.py b/mindwalc/datastructures.py index 5bc6d7e..58190ec 100644 --- a/mindwalc/datastructures.py +++ b/mindwalc/datastructures.py @@ -471,6 +471,7 @@ def _convert_node_to_dot(self, node_vis_props): g = rdflib.Graph() g.parse(rdf_file, format=_format) skip_literals = True + path_max_depth = 8 # load train data: train_file = 'data/AIFB/AIFB_test.tsv' @@ -497,8 +498,10 @@ def _convert_node_to_dot(self, node_vis_props): print(f"generated graph using relation-to-node-convertion has " f"{str(float(verts_a)/1000).replace('.', ',')} vertices") print(f"and {str(float(rels_a)/1000).replace('.', ',')} relations") - clf = MINDWALCTree(path_max_depth=8, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) + clf = MINDWALCTree(path_max_depth=path_max_depth, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) clf.fit(kg, train_entities, train_labels) + clf.tree_.visualize("./data/AIFB/aifb_MINDWALCtree1", _view=False, + meta_infos="Training method: MINDWALCTree, relation-to-node-converted graph") preds = clf.predict(kg, test_entities) print(f"accuracy: {accuracy_score(test_labels, preds)}") @@ -512,8 +515,10 @@ def _convert_node_to_dot(self, node_vis_props): print(f"generated graph using relation_tail_merging has " f"{str(float(verts_b)/1000).replace('.', ',')} vertices") print(f"and {str(float(rels_b)/1000).replace('.', ',')} relations") - clf = MINDWALCTree(path_max_depth=8, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) + clf = MINDWALCTree(path_max_depth=path_max_depth, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) clf.fit(kg, train_entities, train_labels) + clf.tree_.visualize("./data/AIFB/aifb_MINDWALCtree2", _view=False, + meta_infos="Training method: MINDWALCTree, relation-tail merged graph") preds = 
clf.predict(kg, test_entities) print(f"accuracy: {accuracy_score(test_labels, preds)}") diff --git a/mindwalc/tree_builder.py b/mindwalc/tree_builder.py index 5dead8d..b159602 100644 --- a/mindwalc/tree_builder.py +++ b/mindwalc/tree_builder.py @@ -47,8 +47,9 @@ def _generate_candidates(self, neighborhoods, sample_frac=None, """Generates an iterable with all possible walk candidates.""" # Generate a set of all possible (vertex, depth) combinations walks = set() - for d in range(2, self.path_max_depth + 1, 2): - for neighborhood in neighborhoods: + #for d in range(2, self.path_max_depth + 1, 2): + for neighborhood in neighborhoods: + for d in neighborhood.depth_map.keys(): for vertex in neighborhood.depth_map[d]: walks.add((vertex, d)) From ae4acf2d9eb076285a1a96890b55d6179943c5ce Mon Sep 17 00:00:00 2001 From: max-laptop Date: Thu, 4 Jul 2024 12:37:45 +0200 Subject: [PATCH 6/6] fixed the edge-counter in the test script in datastructures.py --- mindwalc/datastructures.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mindwalc/datastructures.py b/mindwalc/datastructures.py index 58190ec..30cb0fe 100644 --- a/mindwalc/datastructures.py +++ b/mindwalc/datastructures.py @@ -471,7 +471,7 @@ def _convert_node_to_dot(self, node_vis_props): g = rdflib.Graph() g.parse(rdf_file, format=_format) skip_literals = True - path_max_depth = 8 + path_max_depth = 10 # load train data: train_file = 'data/AIFB/AIFB_test.tsv' @@ -493,11 +493,10 @@ def _convert_node_to_dot(self, node_vis_props): skip_literals=skip_literals) #kg.graph_to_neo4j(password=sys.argv[1]) verts_a = len(kg.vertices) - rels_a = len(kg.transition_matrix) - + edges_a = sum([len(x) for x in kg.transition_matrix.values()]) print(f"generated graph using relation-to-node-convertion has " f"{str(float(verts_a)/1000).replace('.', ',')} vertices") - print(f"and {str(float(rels_a)/1000).replace('.', ',')} relations") + print(f"and {str(float(edges_a) / 1000).replace('.', ',')} edges") clf = 
MINDWALCTree(path_max_depth=path_max_depth, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) clf.fit(kg, train_entities, train_labels) clf.tree_.visualize("./data/AIFB/aifb_MINDWALCtree1", _view=False, @@ -511,10 +510,10 @@ def _convert_node_to_dot(self, node_vis_props): kg = Graph.rdflib_to_graph(g, label_predicates=label_predicates, relation_tail_merging=True, skip_literals=skip_literals) verts_b = len(kg.vertices) - rels_b = len(kg.transition_matrix) + edges_b = sum([len(x) for x in kg.transition_matrix.values()]) print(f"generated graph using relation_tail_merging has " f"{str(float(verts_b)/1000).replace('.', ',')} vertices") - print(f"and {str(float(rels_b)/1000).replace('.', ',')} relations") + print(f"and {str(float(edges_b) / 1000).replace('.', ',')} edges") clf = MINDWALCTree(path_max_depth=path_max_depth, min_samples_leaf=1, max_tree_depth=None, n_jobs=1) clf.fit(kg, train_entities, train_labels) clf.tree_.visualize("./data/AIFB/aifb_MINDWALCtree2", _view=False, @@ -523,7 +522,5 @@ def _convert_node_to_dot(self, node_vis_props): print(f"accuracy: {accuracy_score(test_labels, preds)}") print(f"\nrelation-tail merging reduced the number of vertices by {verts_a - verts_b} ({round((verts_a - verts_b)/verts_a *100, 2)} %)") - print(f"relation-tail merging reduced the number of relations by {rels_a - rels_b} ({round((rels_a - rels_b)/rels_a *100, 2)} %)") - - + print(f"relation-tail merging reduced the number of edges by {edges_a - edges_b} ({round((edges_a - edges_b) / edges_a * 100, 2)} %)")