---
  # Flow consisting of a single step: the 'dependency_tree' node
  # (declared in nodes.yml as GithubDependencyTreeTask).
  flow-definitions:
    - name: 'osioAnalysisFlow'
      queue: '{DEPLOYMENT_PREFIX}_api_osioAnalysisFlow_v0'
      edges:
        # An empty `from` marks the starting edge of the flow, i.e. the
        # listed nodes are scheduled as soon as the flow itself is started.
        - from:
          to:
            - 'dependency_tree'
def add_maven_coords_to_set(coordinates_str, gav_set):
    """Parse Maven coordinates and store them in a set as 'groupId:artifactId:version'.

    :param coordinates_str: str, coordinates in a form accepted by MavenCoordinates.from_str()
    :param gav_set: set, receives the resulting string (modified in place)
    :return: None
    """
    coords = MavenCoordinates.from_str(coordinates_str)
    gav_template = "{group_id}:{artifact_id}:{version}"
    gav = gav_template.format(group_id=coords.groupId,
                              artifact_id=coords.artifactId,
                              version=coords.version)
    gav_set.add(gav)
class GithubDependencyTreeTask(BaseTask):
    """Finds out direct and indirect dependencies from a given github repository."""

    # File (inside the cloned repository) the Maven dependency plugin writes its output to.
    _TREE_FILE_NAME = "dependency-tree.txt"
    # Captures the content of every double-quoted token on a line of the "dot" output.
    _QUOTED_TOKEN_REGEX = re.compile('"(.*?)"')

    def execute(self, arguments=None):
        """Collect the dependencies of the repository described by `arguments`.

        :param arguments: dict, has to carry truthy 'github_repo', 'github_sha' and
                          'email_ids' entries ('email_ids' is only validated here)
        :return: dict, {"dependencies": [...]} where the list holds sorted
                 'groupId:artifactId:version' strings
        """
        self._strict_assert(arguments.get('github_repo'))
        self._strict_assert(arguments.get('github_sha'))
        self._strict_assert(arguments.get('email_ids'))
        github_repo = arguments.get('github_repo')
        github_sha = arguments.get('github_sha')
        found = GithubDependencyTreeTask.extract_dependencies(github_repo, github_sha)
        # A set has no stable iteration order; sort it so that the same repository
        # state always yields the same result.
        return {"dependencies": sorted(found)}

    @staticmethod
    def extract_dependencies(github_repo, github_sha):
        """Extract the dependencies information.

        Clones the repository into a temporary directory, resets it to the requested
        revision, runs the Maven dependency plugin there and parses what it wrote.

        Currently assuming repository is maven repository.

        :param github_repo: str, URL of the repository to clone
        :param github_sha: str, revision the clone is hard-reset to
        :return: set of 'groupId:artifactId:version' strings
        :raises TaskError: if mvn exits with non-zero status or produces no output file
        """
        with TemporaryDirectory() as workdir:
            repo = Git.clone(url=github_repo, path=workdir, timeout=3600)
            repo.reset(revision=github_sha, hard=True)
            with cwd(repo.repo_path):
                tree_file = Path.cwd().joinpath(GithubDependencyTreeTask._TREE_FILE_NAME)
                # An absolute output path plus appendOutput=true makes every plugin run of
                # the build write to this one file (presumably relevant for multi-module
                # projects - see the plugin documentation).
                cmd = ["mvn", "org.apache.maven.plugins:maven-dependency-plugin:3.0.2:tree",
                       "-DoutputType=dot",
                       "-DoutputFile={filename}".format(filename=tree_file),
                       "-DappendOutput=true"]
                status, output, error = TimedCommand(cmd).run(timeout=3600)
                if status != 0:
                    raise TaskError(error)
                if not tree_file.is_file():
                    # mvn reported success, so `error` may well be empty; say what is wrong.
                    raise TaskError("mvn did not produce {name}: {err}".format(
                        name=GithubDependencyTreeTask._TREE_FILE_NAME, err=error))
                # Parse while still inside the temporary directory - it is removed on exit.
                with tree_file.open() as f:
                    return GithubDependencyTreeTask.parse_maven_dependency_tree(f.readlines())

    @staticmethod
    def parse_maven_dependency_tree(dependency_tree):
        """Parses the dot representation of maven dependency tree.

        For available representations of dependency tree see
        http://maven.apache.org/plugins/maven-dependency-plugin/tree-mojo.html#outputType

        :param dependency_tree: iterable of str, lines of the plugin's dot output
        :return: set of 'groupId:artifactId:version' strings of all packages found,
                 without the entries collected as pom names
        """
        pom_names = set()
        package_names = set()
        for line in dependency_tree:
            tokens = GithubDependencyTreeTask._QUOTED_TOKEN_REGEX.findall(line)
            # If there's only one quoted string, it means this is a pom name;
            # otherwise every quoted string on the line is a package.
            target = pom_names if len(tokens) == 1 else package_names
            for token in tokens:
                # Drop the last ':'-separated field (the scope, for package entries).
                add_maven_coords_to_set(token.rsplit(':', 1)[0], target)

        # Remove pom names from actual package names.
        return package_names.difference(pom_names)