Skip to content

Commit

Permalink
Add dependency parser task for maven ecosystem
Browse files Browse the repository at this point in the history
GithubDependencyTreeTask performs dependency parsing, currently only for the Maven ecosystem, by running the `tree` goal of the Maven dependency plugin (`mvn dependency:tree`).
  • Loading branch information
abs51295 committed Mar 1, 2018
1 parent 740c74e commit 5fe8bf0
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 2 deletions.
8 changes: 8 additions & 0 deletions f8a_worker/dispatcher/flows/osioAnalysisFlow.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
flow-definitions:
- name: 'osioAnalysisFlow'
queue: '{DEPLOYMENT_PREFIX}_api_osioAnalysisFlow_v0'
edges:
- from:
to:
- 'dependency_tree'
7 changes: 7 additions & 0 deletions f8a_worker/dispatcher/nodes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,12 @@
max_retry: 0
queue: '{DEPLOYMENT_PREFIX}_{WORKER_ADMINISTRATION_REGION}_KeywordsSummaryTask_tagging_v0'
storage: 'S3KeywordsSummary'
- name: 'dependency_tree'
classname: 'GithubDependencyTreeTask'
import: 'f8a_worker.workers'
max_retry: 0
queue: '{DEPLOYMENT_PREFIX}_{WORKER_ADMINISTRATION_REGION}_GithubDependencyTreeTask_v0'
storage: 'BayesianPostgres'

flows:
- 'bayesianFlow'
Expand All @@ -301,6 +307,7 @@
- 'keywordsSummaryFlow'
- 'keywordsApiSummaryFlow'
- 'keywordsPrioritySummaryFlow'
- 'osioAnalysisFlow'

storages:
- name: 'BayesianPostgres'
Expand Down
5 changes: 3 additions & 2 deletions f8a_worker/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,14 @@ def config():
"/usr/bin/true"])

@classmethod
def clone(cls, url, path, depth=None, branch=None, single_branch=False):
def clone(cls, url, path, timeout=300, depth=None, branch=None, single_branch=False):
"""Clone repository provided as url to specific path.
:param url: str
:param path: str
:param depth: str
:param branch: str
:param timeout: int
:return: instance of Git()
"""
orig_url = url
Expand All @@ -63,7 +64,7 @@ def clone(cls, url, path, depth=None, branch=None, single_branch=False):
if single_branch:
cmd.extend(["--single-branch"])
try:
TimedCommand.get_command_output(cmd, graceful=False)
TimedCommand.get_command_output(cmd, graceful=False, timeout=timeout)
except TaskError as exc:
raise TaskError("Unable to clone: %s" % orig_url) from exc
return cls(path=path)
Expand Down
9 changes: 9 additions & 0 deletions f8a_worker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,3 +648,12 @@ def get_response(url, headers=None, sleep_time=2, retry_count=10):
message = "Failed to get results from {url} with {err}".format(url=url, err=err)
logger.error(message)
raise TaskError(message) from err


def add_maven_coords_to_set(coordinates_str, gav_set):
    """Add the ``groupId:artifactId:version`` form of Maven coordinates to a set.

    :param coordinates_str: str, coordinates parsed with MavenCoordinates.from_str
    :param gav_set: set, updated in place with the formatted string
    """
    gav_template = "{group_id}:{artifact_id}:{version}"
    parsed = MavenCoordinates.from_str(coordinates_str)
    gav = gav_template.format(group_id=parsed.groupId,
                              artifact_id=parsed.artifactId,
                              version=parsed.version)
    gav_set.add(gav)
1 change: 1 addition & 0 deletions f8a_worker/workers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@
from f8a_worker.workers.result_collector import ResultCollector, PackageResultCollector
from f8a_worker.workers.stackaggregator import StackAggregatorTask
from f8a_worker.workers.stackaggregator_v2 import StackAggregatorV2Task
from f8a_worker.workers.dependency_parser import GithubDependencyTreeTask
77 changes: 77 additions & 0 deletions f8a_worker/workers/dependency_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
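Find the dependencies of a GitHub repository (currently Maven projects only).

Output: {"dependencies": [<"groupId:artifactId:version" strings>]}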
"""

from f8a_worker.base import BaseTask
from f8a_worker.errors import TaskError
from f8a_worker.utils import TimedCommand, cwd, MavenCoordinates, add_maven_coords_to_set
from f8a_worker.process import Git
from tempfile import TemporaryDirectory
from pathlib import Path
import re


class GithubDependencyTreeTask(BaseTask):
    """Collect the dependencies of a GitHub repository at a given commit."""

    def execute(self, arguments=None):
        """Validate the task arguments and return the repository's dependencies.

        :param arguments: dict, must carry truthy 'github_repo', 'github_sha'
                          and 'email_ids' entries
        :return: dict with a single 'dependencies' key holding a list
        """
        # All three arguments are required; 'email_ids' is only validated here.
        for required_key in ('github_repo', 'github_sha', 'email_ids'):
            self._strict_assert(arguments.get(required_key))

        repo_url = arguments.get('github_repo')
        commit_sha = arguments.get('github_sha')
        found = GithubDependencyTreeTask.extract_dependencies(repo_url, commit_sha)
        return {"dependencies": list(found)}

    @staticmethod
    def extract_dependencies(github_repo, github_sha):
        """Clone the repository, run the Maven dependency tree plugin and parse its output.

        The repository is currently assumed to be a Maven project.

        :param github_repo: repository URL handed to Git.clone
        :param github_sha: revision the clone is hard-reset to
        :return: whatever parse_maven_dependency_tree returns for the generated file
        :raises TaskError: if mvn exits with a non-zero status or the output
                           file was not created
        """
        tree_file_name = "dependency-tree.txt"
        with TemporaryDirectory() as checkout_dir:
            git_repo = Git.clone(url=github_repo, path=checkout_dir, timeout=3600)
            git_repo.reset(revision=github_sha, hard=True)
            with cwd(git_repo.repo_path):
                # The plugin is given an absolute path inside the checkout and
                # is run with -DappendOutput=true.
                output_file_arg = "-DoutputFile={filename}".format(
                    filename=Path.cwd().joinpath(tree_file_name))
                mvn_cmd = [
                    "mvn",
                    "org.apache.maven.plugins:maven-dependency-plugin:3.0.2:tree",
                    "-DoutputType=dot",
                    output_file_arg,
                    "-DappendOutput=true",
                ]
                status, _, error = TimedCommand(mvn_cmd).run(timeout=3600)
                succeeded = status == 0 and Path(tree_file_name).is_file()
                if not succeeded:
                    raise TaskError(error)
                with open(tree_file_name) as tree_file:
                    tree_lines = tree_file.readlines()
                    return GithubDependencyTreeTask.parse_maven_dependency_tree(tree_lines)

    @staticmethod
    def parse_maven_dependency_tree(dependency_tree):
        """Parse the dot representation of a Maven dependency tree.

        For the available representations of the dependency tree see
        http://maven.apache.org/plugins/maven-dependency-plugin/tree-mojo.html#outputType

        :param dependency_tree: iterable of lines of the dot output
        :return: set of package names found in the tree, minus the pom names
        """
        quoted_name_regex = re.compile('"(.*?)"')
        pom_names = set()
        package_names = set()
        for tree_line in dependency_tree:
            quoted_names = quoted_name_regex.findall(tree_line)
            # A line holding exactly one quoted string is taken to name a pom;
            # any other line contributes its quoted strings as package names.
            target_set = pom_names if len(quoted_names) == 1 else package_names
            for quoted_name in quoted_names:
                # Drop the last colon-separated field (intended to be the scope).
                # Names are expected to look like:
                # <group-id>:<artifact-id>:<packaging>:<?classifier>:<version>:<scope>
                without_last_field = quoted_name.rsplit(':', 1)[0]
                add_maven_coords_to_set(without_last_field, target_set)

        # Pom names are not dependencies themselves, so they are excluded.
        return package_names - pom_names

0 comments on commit 5fe8bf0

Please sign in to comment.