Skip to content

Commit

Permalink
Merge pull request #40 from NullConvergence/feature/db_init
Browse files Browse the repository at this point in the history
Unique
  • Loading branch information
xserban authored Oct 7, 2020
2 parents 249cbf3 + 9c27e00 commit dca3466
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 82 deletions.
3 changes: 3 additions & 0 deletions examples/index_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ def main():
"""Main method"""
args = parse_args()
driller = Driller(config_path=args.config)
# this method should be called only once, when initializing
# a database for the first time
driller.init_db()
driller.drill_batch()


Expand Down
134 changes: 63 additions & 71 deletions graphrepo/drillers/batch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,17 @@ def batch(iterable, n=1):
yield iterable[ndx:min(ndx + n, l)]


def index_commits(graph, commits, batch_size=100):
    """Insert or update :Commit nodes in batches.

    MERGE matches on the commit ``hash`` alone, then ``ON CREATE SET`` /
    ``ON MATCH SET`` copies every property from the map, so re-running the
    indexer updates existing nodes instead of creating duplicates.

    :param graph: graph connection exposing ``run(query, **params)``
                  (presumably a py2neo Graph — TODO confirm)
    :param commits: list of dicts, each containing at least a ``hash`` key
    :param batch_size: number of commit records sent per query
    """
    query = """
    UNWIND {commits} AS c
    MERGE (nc :Commit { hash: c.hash})
    ON CREATE SET
      nc = c
    ON MATCH SET
      nc = c
    """
    # Send the commits in chunks to keep each transaction bounded.
    for b in batch(commits, batch_size):
        graph.run(query, commits=b)


def index_parent_commits(graph, parents, batch_size=100):
Expand All @@ -42,31 +35,26 @@ def index_parent_commits(graph, parents, batch_size=100):
graph.run(query, ac=b)


def index_authors(graph, authors, batch_size=100):
    """Insert or update :Developer nodes in batches.

    MERGE matches on ``hash`` only; ``ON CREATE SET`` / ``ON MATCH SET``
    copy all properties, making the operation idempotent across runs.

    :param graph: graph connection exposing ``run(query, **params)``
    :param authors: list of dicts, each containing at least a ``hash`` key
    :param batch_size: number of author records sent per query
    """
    query = """
    UNWIND {authors} AS a
    MERGE (nd:Developer { hash: a.hash})
    ON CREATE SET nd = a
    ON MATCH SET nd = a
    """
    for b in batch(authors, batch_size):
        graph.run(query, authors=b)


def index_branches(graph, branches, batch_size=100):
    """Insert or update :Branch nodes in batches.

    MERGE matches on ``hash`` only; ``ON CREATE SET`` / ``ON MATCH SET``
    copy all properties, so repeated indexing updates rather than duplicates.

    :param graph: graph connection exposing ``run(query, **params)``
    :param branches: list of dicts, each containing at least a ``hash`` key
    :param batch_size: number of branch records sent per query
    """
    query = """
    UNWIND {branches} AS a
    MERGE (nb:Branch { hash: a.hash})
    ON CREATE SET nb = a
    ON MATCH SET nb = a
    """
    for b in batch(branches, batch_size):
        graph.run(query, branches=b)


def index_branch_commits(graph, bc, batch_size=100):
Expand All @@ -80,32 +68,27 @@ def index_branch_commits(graph, bc, batch_size=100):
graph.run(query, ac=b)


def index_files(graph, files, batch_size=100):
    """Insert or update :File nodes in batches.

    MERGE matches on ``hash`` only; ``ON CREATE SET`` / ``ON MATCH SET``
    copy all properties, so repeated indexing updates rather than duplicates.

    :param graph: graph connection exposing ``run(query, **params)``
    :param files: list of dicts, each containing at least a ``hash`` key
    :param batch_size: number of file records sent per query
    """
    query = """
    UNWIND {files} AS f
    MERGE (nf:File { hash: f.hash})
    ON CREATE SET nf = f
    ON MATCH SET nf = f
    """
    for b in batch(files, batch_size):
        graph.run(query, files=b)


def index_methods(graph, methods, batch_size=100):
    """Insert or update :Method nodes in batches.

    MERGE matches on ``hash`` only; ``ON CREATE SET`` / ``ON MATCH SET``
    copy all properties, so repeated indexing updates rather than duplicates.

    :param graph: graph connection exposing ``run(query, **params)``
    :param methods: list of dicts, each containing at least a ``hash`` key
    :param batch_size: number of method records sent per query
    """
    query = """
    UNWIND {methods} AS f
    MERGE (nm:Method { hash: f.hash})
    ON CREATE SET nm = f
    ON MATCH SET nm = f
    """
    for b in batch(methods, batch_size):
        graph.run(query, methods=b)


def index_author_commits(graph, ac, batch_size=100):
Expand Down Expand Up @@ -161,85 +144,94 @@ def create_index_authors(graph):
graph.run(query)


def create_index_commits(graph, hash=True):
    """Create lookup indexes for :Commit nodes.

    Always creates the ``project_id`` index; the ``hash`` index is
    optional so callers that already declared a uniqueness constraint on
    ``hash`` (which implies an index) can skip it.

    :param graph: graph connection exposing ``run(query)``
    :param hash: when True, also create the index on :Commit(hash)
        NOTE(review): the name shadows the ``hash`` builtin; kept for
        backward compatibility with existing callers.
    """
    if hash:
        hash_q = """
        CREATE INDEX ON :Commit(hash)
        """
        graph.run(hash_q)

    pid_q = """
    CREATE INDEX ON :Commit(project_id)
    """
    graph.run(pid_q)


def create_index_branches(graph, hash=True):
    """Create lookup indexes for :Branch nodes.

    Always creates the ``project_id`` index; the ``hash`` index is
    optional so callers that already declared a uniqueness constraint on
    ``hash`` can skip it.

    :param graph: graph connection exposing ``run(query)``
    :param hash: when True, also create the index on :Branch(hash)
        NOTE(review): shadows the ``hash`` builtin; kept for API compat.
    """
    if hash:
        hash_q = """
        CREATE INDEX ON :Branch(hash)
        """
        graph.run(hash_q)

    pid_q = """
    CREATE INDEX ON :Branch(project_id)
    """
    graph.run(pid_q)


def create_index_files(graph, hash=True):
    """Create lookup indexes for :File nodes.

    Always creates the ``project_id`` index; the ``hash`` index is
    optional so callers that already declared a uniqueness constraint on
    ``hash`` can skip it.

    :param graph: graph connection exposing ``run(query)``
    :param hash: when True, also create the index on :File(hash)
        NOTE(review): shadows the ``hash`` builtin; kept for API compat.
    """
    if hash:
        hash_q = """
        CREATE INDEX ON :File(hash)
        """
        graph.run(hash_q)

    pid_q = """
    CREATE INDEX ON :File(project_id)
    """
    graph.run(pid_q)


def create_index_methods(graph, hash=True):
    """Create lookup indexes for :Method nodes.

    Always creates the ``project_id`` index; the ``hash`` index is
    optional so callers that already declared a uniqueness constraint on
    ``hash`` can skip it.

    :param graph: graph connection exposing ``run(query)``
    :param hash: when True, also create the index on :Method(hash)
        NOTE(review): shadows the ``hash`` builtin; kept for API compat.
    """
    if hash:
        hash_q = """
        CREATE INDEX ON :Method(hash)
        """
        graph.run(hash_q)

    pid_q = """
    CREATE INDEX ON :Method(project_id)
    """
    graph.run(pid_q)


def index_all(graph, developers, commits, parents, dev_commits, branches,
branches_commits, files, commit_files, methods, file_methods,
commit_methods, batch_size=100, index=True):
commit_methods, batch_size=100):

total = datetime.now()

developers = list({v['hash']: v for v in developers}.values())
print('Indexing ', len(developers), ' authors')
start = datetime.now()
index_authors(graph, developers, batch_size, index)
index_authors(graph, developers, batch_size)
print('Indexed authors in: \t', datetime.now()-start)

print('Indexing ', len(commits), ' commits')
start = datetime.now()
index_commits(graph, commits, batch_size, index)
index_commits(graph, commits, batch_size)
print('Indexed commits in: \t', datetime.now()-start)

branches = list({v['hash']: v for v in branches}.values())
branches_commits = list({str(i): i for i in branches_commits}.values())
print('Indexing ', len(branches), ' branches')
start = datetime.now()
index_branches(graph, branches, batch_size, index)
index_branches(graph, branches, batch_size)
index_branch_commits(graph, branches_commits, batch_size)
print('Indexed branches in: \t', datetime.now()-start)

files = list({v['hash']: v for v in files}.values())
print('Indexing ', len(files), ' files')
start = datetime.now()
index_files(graph, files, batch_size, index)
index_files(graph, files, batch_size)
print('Indexed files in: \t', datetime.now()-start)

methods = list({v['hash']: v for v in methods}.values())
print('Indexing ', len(methods), ' methods')
start = datetime.now()
index_methods(graph, methods, batch_size, index)
index_methods(graph, methods, batch_size)
print('Indexed methods in: \t', datetime.now()-start)

parents = list({str(i): i for i in parents}.values())
Expand Down Expand Up @@ -272,19 +264,19 @@ def index_all(graph, developers, commits, parents, dev_commits, branches,
print('Indexing took: \t', datetime.now()-total)


def index_cache(graph, cache, batch_size=100, index=True):
def index_cache(graph, cache, batch_size=100):
total = datetime.now()
index_authors(graph, list(
{v['hash']: v for v in cache.data['developers']}.values()), batch_size, index)
index_commits(graph, cache.data['commits'], batch_size, index)
{v['hash']: v for v in cache.data['developers']}.values()), batch_size)
index_commits(graph, cache.data['commits'], batch_size)
index_branches(graph, list(
{v['hash']: v for v in cache.data['branches']}.values()), batch_size, index)
{v['hash']: v for v in cache.data['branches']}.values()), batch_size)
index_branch_commits(graph, list(
{str(i): i for i in cache.data['branches_commits']}.values()), batch_size)
index_files(graph, list(
{v['hash']: v for v in cache.data['files']}.values()), batch_size, index)
{v['hash']: v for v in cache.data['files']}.values()), batch_size)
index_methods(graph, list(
{v['hash']: v for v in cache.data['methods']}.values()), batch_size, index)
{v['hash']: v for v in cache.data['methods']}.values()), batch_size)
index_parent_commits(graph, list(
{str(i): i for i in cache.data['parents']}.values()), batch_size)
index_author_commits(graph, cache.data['dev_commits'], batch_size)
Expand Down
13 changes: 7 additions & 6 deletions graphrepo/drillers/db_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ def create_hash_constraints(graph):
graph.run(q)


def create_indices(graph, hash_index=True):
    """Initializes all indexes for database.

    :param graph: graph connection passed through to the index helpers
    :param hash_index: when True, also create the hash indexes; callers
        that create uniqueness constraints on ``hash`` beforehand pass
        False, since the hash index would be redundant.  The author index
        only covers ``hash`` (no project_id index exists for it — see
        ``create_index_authors``), so it is skipped entirely in that case.
    """
    if hash_index:
        utils.create_index_authors(graph)
    utils.create_index_branches(graph, hash_index)
    utils.create_index_commits(graph, hash_index)
    utils.create_index_files(graph, hash_index)
    utils.create_index_methods(graph, hash_index)
2 changes: 1 addition & 1 deletion graphrepo/drillers/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def init_db(self):
try:
self._check_connection()
db_init.create_hash_constraints(self.graph)
db_init.create_indices(self.graph)
db_init.create_indices(self.graph, hash_index=False)
except Exception as e:
raise e

Expand Down
16 changes: 12 additions & 4 deletions graphrepo/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,19 @@ def load_json(path):
return json.load(json_file)


def get_file_hash(file, project_id=None):
def get_file_hash(file, project_id=None, use_new_path=False):
name = ''
if file.old_path and not file.new_path:
name = name+file.old_path
else:
if not file.old_path and file.new_path:
name = name+file.new_path
elif file.old_path and not file.new_path:
name = name+file.old_path
elif file.old_path and file.new_path:
if file.old_path != file.new_path:
print(file.old_path, file.new_path)
if use_new_path:
name = name+file.new_path
else:
name = name + file.old_path

name = name+file.filename
name = project_id + name if project_id else name
Expand Down Expand Up @@ -141,6 +148,7 @@ def format_branch_commit(bhash, chash):
def format_file(file, project_id):
return {
'hash': get_file_hash(file, project_id),
'new_hash': get_file_hash(file, project_id, use_new_path=True),
'name': file.filename,
'project_id': project_id,
'type': '.' + file.filename.split('.')[-1:][0]
Expand Down

0 comments on commit dca3466

Please sign in to comment.