diff --git a/examples/index_all.py b/examples/index_all.py
index 9c60836..ea431fb 100644
--- a/examples/index_all.py
+++ b/examples/index_all.py
@@ -29,6 +29,9 @@ def main():
     """Main method"""
     args = parse_args()
     driller = Driller(config_path=args.config)
+    # this method should be called only once, when initializing
+    # a database for the first time
+    driller.init_db()
     driller.drill_batch()
diff --git a/graphrepo/drillers/batch_utils.py b/graphrepo/drillers/batch_utils.py
index 42bbe97..f0f9fd6 100644
--- a/graphrepo/drillers/batch_utils.py
+++ b/graphrepo/drillers/batch_utils.py
@@ -11,24 +11,17 @@ def batch(iterable, n=1):
         yield iterable[ndx:min(ndx + n, l)]


-def index_commits(graph, commits, batch_size=100, index=True):
+def index_commits(graph, commits, batch_size=100):
     query = """
     UNWIND {commits} AS c
-    MERGE (:Commit { hash: c.hash,
-                     commit_hash: c.commit_hash,
-                     timestamp: c.timestamp,
-                     is_merge: c.is_merge,
-                     project_id: c.project_id,
-                     message: c.message,
-                     dmm_unit_complexity: c.dmm_unit_complexity,
-                     dmm_unit_interfacing: c.dmm_unit_interfacing,
-                     dmm_unit_size: c.dmm_unit_size
-                     })
+    MERGE (nc :Commit { hash: c.hash})
+    ON CREATE SET
+      nc = c
+    ON MATCH SET
+      nc = c
     """
     for b in batch(commits, batch_size):
         graph.run(query, commits=b)
-    if index:
-        create_index_commits(graph)


 def index_parent_commits(graph, parents, batch_size=100):
@@ -42,31 +35,26 @@ def index_parent_commits(graph, parents, batch_size=100):
         graph.run(query, ac=b)


-def index_authors(graph, authors, batch_size=100, index=True):
+def index_authors(graph, authors, batch_size=100):
     query = """
     UNWIND {authors} AS a
-    MERGE (:Developer { hash: a.hash,
-                        email: a.email,
-                        name: a.name
-                        })
+    MERGE (nd:Developer { hash: a.hash})
+    ON CREATE SET nd = a
+    ON MATCH SET nd = a
     """
     for b in batch(authors, batch_size):
         graph.run(query, authors=b)
-    if index:
-        create_index_authors(graph)


-def index_branches(graph, branches, batch_size=100, index=True):
+def index_branches(graph, branches, batch_size=100):
     query = """
     UNWIND {branches} AS a
-    MERGE (:Branch { hash: a.hash,
-                     name:a.name,
-                     project_id: a.project_id})
+    MERGE (nb:Branch { hash: a.hash})
+    ON CREATE SET nb = a
+    ON MATCH SET nb = a
     """
     for b in batch(branches, batch_size):
         graph.run(query, branches=b)
-    if index:
-        create_index_branches(graph)


 def index_branch_commits(graph, bc, batch_size=100):
@@ -80,32 +68,27 @@ def index_branch_commits(graph, bc, batch_size=100):
         graph.run(query, ac=b)


-def index_files(graph, files, batch_size=100, index=True):
+def index_files(graph, files, batch_size=100):
     query = """
     UNWIND {files} AS f
-    MERGE (:File { hash: f.hash,
-                   project_id: f.project_id,
-                   type:f.type, name: f.name})
+    MERGE (nf:File { hash: f.hash})
+    ON CREATE SET nf = f
+    ON MATCH SET nf = f
     """
     for b in batch(files, batch_size):
         graph.run(query, files=b)
-    if index:
-        create_index_files(graph)


-def index_methods(graph, methods, batch_size=100, index=True):
+def index_methods(graph, methods, batch_size=100):
     query = """
     UNWIND {methods} AS f
-    MERGE (:Method { hash: f.hash,
-                     project_id: f.project_id,
-                     name: f.name,
-                     file_name: f.file_name})
+    MERGE (nm:Method { hash: f.hash})
+    ON CREATE SET nm = f
+    ON MATCH SET nm = f
     """
     for b in batch(methods, batch_size):
         graph.run(query, methods=b)
-    if index:
-        create_index_methods(graph)


 def index_author_commits(graph, ac, batch_size=100):
@@ -161,85 +144,94 @@ def create_index_authors(graph):
     graph.run(query)


-def create_index_commits(graph):
-    hash_q = """
-    CREATE INDEX ON :Commit(hash)
-    """
+def create_index_commits(graph, hash=True):
+    if hash:
+        hash_q = """
+        CREATE INDEX ON :Commit(hash)
+        """
+        graph.run(hash_q)
+
     pid_q = """
     CREATE INDEX ON :Commit(project_id)
     """
-    graph.run(hash_q)
+    graph.run(pid_q)


-def create_index_branches(graph):
-    hash_q = """
-    CREATE INDEX ON :Branch(hash)
-    """
+def create_index_branches(graph, hash=True):
+    if hash:
+        hash_q = """
+        CREATE INDEX ON :Branch(hash)
+        """
+        graph.run(hash_q)
+
     pid_q = """
     CREATE INDEX ON :Branch(project_id)
     """
-    graph.run(hash_q)
     graph.run(pid_q)


-def create_index_files(graph):
-    hash_q = """
-    CREATE INDEX ON :File(hash)
-    """
+def create_index_files(graph, hash=True):
+    if hash:
+        hash_q = """
+        CREATE INDEX ON :File(hash)
+        """
+        graph.run(hash_q)
+
     pid_q = """
     CREATE INDEX ON :File(project_id)
     """
-    graph.run(hash_q)
     graph.run(pid_q)


-def create_index_methods(graph):
-    hash_q = """
-    CREATE INDEX ON :Method(hash)
-    """
+def create_index_methods(graph, hash=True):
+    if hash:
+        hash_q = """
+        CREATE INDEX ON :Method(hash)
+        """
+        graph.run(hash_q)
+
     pid_q = """
     CREATE INDEX ON :Method(project_id)
     """
-    graph.run(hash_q)
     graph.run(pid_q)


 def index_all(graph, developers, commits, parents, dev_commits, branches,
               branches_commits, files, commit_files, methods, file_methods,
-              commit_methods, batch_size=100, index=True):
+              commit_methods, batch_size=100):
     total = datetime.now()

     developers = list({v['hash']: v for v in developers}.values())
     print('Indexing ', len(developers), ' authors')
     start = datetime.now()
-    index_authors(graph, developers, batch_size, index)
+    index_authors(graph, developers, batch_size)
     print('Indexed authors in: \t', datetime.now()-start)

     print('Indexing ', len(commits), ' commits')
     start = datetime.now()
-    index_commits(graph, commits, batch_size, index)
+    index_commits(graph, commits, batch_size)
     print('Indexed commits in: \t', datetime.now()-start)

     branches = list({v['hash']: v for v in branches}.values())
     branches_commits = list({str(i): i for i in branches_commits}.values())
     print('Indexing ', len(branches), ' branches')
     start = datetime.now()
-    index_branches(graph, branches, batch_size, index)
+    index_branches(graph, branches, batch_size)
     index_branch_commits(graph, branches_commits, batch_size)
     print('Indexed branches in: \t', datetime.now()-start)

     files = list({v['hash']: v for v in files}.values())
     print('Indexing ', len(files), ' files')
     start = datetime.now()
-    index_files(graph, files, batch_size, index)
+    index_files(graph, files, batch_size)
     print('Indexed files in: \t', datetime.now()-start)

     methods = list({v['hash']: v for v in methods}.values())
     print('Indexing ', len(methods), ' methods')
     start = datetime.now()
-    index_methods(graph, methods, batch_size, index)
+    index_methods(graph, methods, batch_size)
     print('Indexed methods in: \t', datetime.now()-start)

     parents = list({str(i): i for i in parents}.values())
@@ -272,19 +264,19 @@ def index_all(graph, developers, commits, parents, dev_commits, branches,
     print('Indexing took: \t', datetime.now()-total)


-def index_cache(graph, cache, batch_size=100, index=True):
+def index_cache(graph, cache, batch_size=100):
     total = datetime.now()
     index_authors(graph, list(
-        {v['hash']: v for v in cache.data['developers']}.values()), batch_size, index)
-    index_commits(graph, cache.data['commits'], batch_size, index)
+        {v['hash']: v for v in cache.data['developers']}.values()), batch_size)
+    index_commits(graph, cache.data['commits'], batch_size)
     index_branches(graph, list(
-        {v['hash']: v for v in cache.data['branches']}.values()), batch_size, index)
+        {v['hash']: v for v in cache.data['branches']}.values()), batch_size)
     index_branch_commits(graph, list(
         {str(i): i for i in cache.data['branches_commits']}.values()), batch_size)
     index_files(graph, list(
-        {v['hash']: v for v in cache.data['files']}.values()), batch_size, index)
+        {v['hash']: v for v in cache.data['files']}.values()), batch_size)
     index_methods(graph, list(
-        {v['hash']: v for v in cache.data['methods']}.values()), batch_size, index)
+        {v['hash']: v for v in cache.data['methods']}.values()), batch_size)
     index_parent_commits(graph, list(
         {str(i): i for i in cache.data['parents']}.values()), batch_size)
     index_author_commits(graph, cache.data['dev_commits'], batch_size)
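
Note on the upsert pattern introduced above: MERGE now matches only on the hash key, and ON CREATE SET / ON MATCH SET nc = c replaces the node's properties with the full UNWIND map, so re-running the indexer updates an existing node instead of creating a duplicate when any non-hash property changed. A minimal sketch of that behaviour, assuming a local Neo4j instance reachable through py2neo (the connection details and sample data are illustrative, not part of this patch):

    # Illustrative only: connection settings and sample commits are assumptions.
    from py2neo import Graph

    graph = Graph("bolt://localhost:7687", auth=("neo4j", "neo4j"))  # assumed local instance

    # Same query shape as the new index_commits(); the {commits} placeholder
    # follows the parameter style already used throughout batch_utils.py.
    query = """
    UNWIND {commits} AS c
    MERGE (nc :Commit { hash: c.hash})
    ON CREATE SET nc = c
    ON MATCH SET nc = c
    """

    first_run = [{'hash': 'abc123', 'message': 'initial import', 'is_merge': 0}]
    second_run = [{'hash': 'abc123', 'message': 'initial import', 'is_merge': 0,
                   'project_id': 'demo'}]

    graph.run(query, commits=first_run)   # creates the node; all map keys become properties
    graph.run(query, commits=second_run)  # matches on hash only and overwrites the properties

With the old query, a commit whose non-hash properties differed between runs would have produced a second :Commit node, because MERGE treated the whole property map as the match key.
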
diff --git a/graphrepo/drillers/db_init.py b/graphrepo/drillers/db_init.py
index 0c863ee..0c0a0d4 100644
--- a/graphrepo/drillers/db_init.py
+++ b/graphrepo/drillers/db_init.py
@@ -10,10 +10,11 @@ def create_hash_constraints(graph):
         graph.run(q)


-def create_indices(graph):
+def create_indices(graph, hash_index=True):
     """Initializes all indexes for database"""
-    utils.create_index_authors(graph)
-    utils.create_index_branches(graph)
-    utils.create_index_commits(graph)
-    utils.create_index_files(graph)
-    utils.create_index_methods(graph)
+    if hash_index:
+        utils.create_index_authors(graph)
+    utils.create_index_branches(graph, hash_index)
+    utils.create_index_commits(graph, hash_index)
+    utils.create_index_files(graph, hash_index)
+    utils.create_index_methods(graph, hash_index)
diff --git a/graphrepo/drillers/default.py b/graphrepo/drillers/default.py
index 01e1dd5..f0b3663 100644
--- a/graphrepo/drillers/default.py
+++ b/graphrepo/drillers/default.py
@@ -76,7 +76,7 @@ def init_db(self):
         try:
             self._check_connection()
             db_init.create_hash_constraints(self.graph)
-            db_init.create_indices(self.graph)
+            db_init.create_indices(self.graph, hash_index=False)
         except Exception as e:
             raise e
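
init_db() now calls create_indices(self.graph, hash_index=False), presumably because a Neo4j uniqueness constraint is index-backed, so the per-label hash indexes are redundant once create_hash_constraints() has run; only the project_id indexes still need to be created explicitly. A rough sketch of the intended end state for one label (the constraint statement below is an assumption about what create_hash_constraints issues, since that function is not shown in this diff):

    # Illustrative only: connection settings and the exact constraint query are assumed.
    from py2neo import Graph

    graph = Graph("bolt://localhost:7687", auth=("neo4j", "neo4j"))  # assumed local instance

    # Presumably what create_hash_constraints() does per label; the unique
    # constraint also backs :Commit(hash) with an index.
    graph.run("CREATE CONSTRAINT ON (c:Commit) ASSERT c.hash IS UNIQUE")

    # What create_indices(graph, hash_index=False) still adds explicitly,
    # since project_id is not covered by any constraint.
    graph.run("CREATE INDEX ON :Commit(project_id)")
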
diff --git a/graphrepo/utils.py b/graphrepo/utils.py
index 6c29a69..b89eaa0 100644
--- a/graphrepo/utils.py
+++ b/graphrepo/utils.py
@@ -52,12 +52,19 @@ def load_json(path):
         return json.load(json_file)


-def get_file_hash(file, project_id=None):
+def get_file_hash(file, project_id=None, use_new_path=False):
     name = ''
-    if file.old_path and not file.new_path:
-        name = name+file.old_path
-    else:
+    if not file.old_path and file.new_path:
         name = name+file.new_path
+    elif file.old_path and not file.new_path:
+        name = name+file.old_path
+    elif file.old_path and file.new_path:
+        if file.old_path != file.new_path:
+            print(file.old_path, file.new_path)
+        if use_new_path:
+            name = name+file.new_path
+        else:
+            name = name + file.old_path
     name = name+file.filename

     name = project_id + name if project_id else name
@@ -141,6 +148,7 @@ def format_branch_commit(bhash, chash):
 def format_file(file, project_id):
     return {
         'hash': get_file_hash(file, project_id),
+        'new_hash': get_file_hash(file, project_id, use_new_path=True),
         'name': file.filename,
         'project_id': project_id,
         'type': '.' + file.filename.split('.')[-1:][0]
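
The new branching in get_file_hash() reduces to: use whichever of old_path / new_path exists, and when both exist (ordinary modifications and renames) use old_path by default and new_path only when use_new_path=True, which is what the added 'new_hash' field in format_file() relies on. A small standalone sketch of that selection logic, assuming a pydriller-style modification object with old_path, new_path and filename attributes (SimpleNamespace and the paths below are made up for illustration):

    # Illustrative only: mirrors the path selection added to get_file_hash().
    from types import SimpleNamespace


    def pick_path(mod, use_new_path=False):
        if not mod.old_path and mod.new_path:    # added file: only new_path is set
            return mod.new_path
        if mod.old_path and not mod.new_path:    # deleted file: only old_path is set
            return mod.old_path
        if use_new_path:                         # both set (modification or rename)
            return mod.new_path
        return mod.old_path


    renamed = SimpleNamespace(old_path='src/old_name.py',
                              new_path='src/new_name.py',
                              filename='new_name.py')

    print(pick_path(renamed))                     # src/old_name.py -> feeds the existing 'hash' field
    print(pick_path(renamed, use_new_path=True))  # src/new_name.py -> feeds the new 'new_hash' field
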