diff --git a/README.md b/README.md index b7b3eff5..8dacd801 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,8 @@ Run the tool using the following command structure: ### Arguments: ``` -usage: main.py [-h] -p PROJECT_REPO_NAME -v RELEASE_VERSION_OLD [-vn RELEASE_VERSION_NEW] -s [-d] [-n] -pm {yarn-classic,yarn-berry,pnpm,npm,maven} [--pnpm-scope] +usage: main.py [-h] -p PROJECT_REPO_NAME -v RELEASE_VERSION_OLD [-vn RELEASE_VERSION_NEW] -s [-d] [-n] -pm {yarn-classic,yarn-berry,pnpm,npm,maven} [--pnpm-scope] [--debug] [--check-source-code] + [--check-release-tags] [--check-deprecated] [--check-forks] [--check-provenance] [--check-code-signature] options: -h, --help show this help message and exit @@ -67,12 +68,20 @@ options: Run static analysis and generate a markdown report of the project -d, --differential-analysis Run differential analysis and generate a markdown report of the project - -n, --name-match Compare the package names with the name in the in the package.json file. This option will slow down the execution time due to the API rate limit of - code search. + -n, --name-match Compare the package names with the name in the in the package.json file. This option will slow down the execution time due to the API rate limit of code search. -pm {yarn-classic,yarn-berry,pnpm,npm,maven}, --package-manager {yarn-classic,yarn-berry,pnpm,npm,maven} The package manager used in the project. - --pnpm-scope Extract dependencies from pnpm with a specific scope using 'pnpm list --filter --depth Infinity' command. Configure the scope in tool_config.py - file. + --pnpm-scope Extract dependencies from pnpm with a specific scope using 'pnpm list --filter --depth Infinity' command. Configure the scope in tool_config.py file. + --debug Enable debug mode. + +smell checks: + --check-source-code Check for dependencies with no link to source code repositories + --check-release-tags Check for dependencies with no tag/commit sha for release + --check-deprecated Check for deprecated dependencies + --check-forks Check for dependencies that are forks + --check-provenance Check for dependencies with no build attestation + --check-code-signature + Check for dependencies with missing/invalid code signature ``` ### Example usage: @@ -147,6 +156,8 @@ specified in the lockfile/pom/similar is not found. They come from a combination work and our own research on this subject. These formats are the following: +
Tag formats
+
 - `<tag>`
 - `v<tag>`
 - `r-<tag>`
@@ -157,11 +168,32 @@ These formats are the following:
 - `<package_name>_v<tag>`
 - `<package_name>-<tag>`
 - `<package_name>_<tag>`
+- `<repo_name>@<tag>`
+- `<repo_name>-v<tag>`
+- `<repo_name>_v<tag>`
+- `<repo_name>-<tag>`
+- `<repo_name>_<tag>`
+- `<project_name>@<tag>`
+- `<project_name>-v<tag>`
+- `<project_name>_v<tag>`
+- `<project_name>-<tag>`
+- `<project_name>_<tag>`
 - `release/<tag>`
 - `<tag>-release`
 - `v.<tag>`
 - `p1-p2-p3<tag>`
+
+As examples of what `package_name`, `repo_name`, and `project_name` could be, `maven-surefire`
+is an interesting dependency:
+
+- `maven-surefire-common` is the package name
+- `maven-surefire` is the repo name (we remove the owner prefix)
+- `surefire` is the project name
+
+In particular, there are many `maven-*` dependencies whose tags follow these last conventions.
+
+
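+For illustration, here is a minimal sketch (not part of the tool itself) of how the candidate
+tag set would be built for `maven-surefire-common`, mirroring the `tag_format` helper in
+`tool/compare_commits.py`; the version `3.2.5` is only a made-up example:
+
+```python
+package_name = "maven-surefire-common"
+repo_name = "apache/maven-surefire".split("/")[1]  # "maven-surefire" (owner prefix removed)
+project_name = repo_name.split("-")[-1]            # "surefire"
+tag = "3.2.5"
+
+# Generic formats plus <name><suffix> combinations for all three name variants
+candidates = {tag, f"v{tag}", f"v_{tag}", f"r{tag}", f"release-{tag}", f"parent-{tag}",
+              f"release/{tag}", f"{tag}-release", f"v.{tag}"}
+candidates |= {f"{name}{suffix}"
+               for name in (package_name, repo_name, project_name)
+               for suffix in (f"@{tag}", f"-v{tag}", f"_v{tag}", f"-{tag}", f"_{tag}")}
+# e.g. "v3.2.5", "surefire-3.2.5", "maven-surefire-common@3.2.5"
+```
+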
+ Note than this does not mean that if `dirty-waters` does not find a tag, it doesn't exist: it means that it either doesn't exist, or that its format is not one of the above. diff --git a/flake.lock b/flake.lock index 15f7fcbc..847e6174 100644 --- a/flake.lock +++ b/flake.lock @@ -2,11 +2,11 @@ "nodes": { "nixpkgs": { "locked": { - "lastModified": 1736867362, - "narHash": "sha256-i/UJ5I7HoqmFMwZEH6vAvBxOrjjOJNU739lnZnhUln8=", + "lastModified": 1737885640, + "narHash": "sha256-GFzPxJzTd1rPIVD4IW+GwJlyGwBDV1Tj5FLYwDQQ9sM=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "9c6b49aeac36e2ed73a8c472f1546f6d9cf1addc", + "rev": "4e96537f163fad24ed9eb317798a79afc85b51b7", "type": "github" }, "original": { @@ -18,11 +18,11 @@ }, "nixpkgs_2": { "locked": { - "lastModified": 1735488052, - "narHash": "sha256-EYaJtjLImMXzYEf9h52hkuPolXqKG/cAJlCOba6emL0=", + "lastModified": 1737917096, + "narHash": "sha256-wOo5jWu88VRbm0TTNl9KxE4nIkfnXVKxLvZwpTn75wk=", "owner": "nixos", "repo": "nixpkgs", - "rev": "adaa9f280329b5f814e8dc83eceddd42b20f72f4", + "rev": "a47cb26bbe26d63321cbb96de6d1981d790d9748", "type": "github" }, "original": { @@ -37,11 +37,11 @@ "nixpkgs": "nixpkgs_2" }, "locked": { - "lastModified": 1736836246, - "narHash": "sha256-bFvBMziYvFtB/Hly+O4WtBGeiDoz7eb2dVQbOvIrHHM=", + "lastModified": 1737933698, + "narHash": "sha256-MpPHyTCrI7dpiRgzZTH7PEMLZCvI2Dc3iOr4GSaQ/II=", "owner": "nix-community", "repo": "pyproject.nix", - "rev": "3db43c7414fce4ce94ca67545233d251d306385a", + "rev": "78ea10a115be7b7ae45b241d06392e014988a162", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index b434b70c..bca0d021 100644 --- a/flake.nix +++ b/flake.nix @@ -22,6 +22,6 @@ in { - devShells.x86_64-linux.default = pkgs.mkShell { packages = [ pythonEnv pkgs.maven ]; }; + devShells.x86_64-linux.default = pkgs.mkShell { packages = [ pythonEnv pkgs.maven pkgs.yarn pkgs.pnpm pkgs.act ]; }; }; } \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 3ac1e0d0..78114369 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ readme = "README.md" requires-python = ">=3.12" keywords = [ "software supply chain", "ssc", "dependencies", "npm",] classifiers = [ "Intended Audience :: Developers", "Topic :: Software Development :: Build Tools", "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent",] -dependencies = [ "attrs == 24.2.0", "cattrs == 24.1.2", "certifi == 2024.8.30", "charset-normalizer == 3.4.0", "exceptiongroup == 1.2.2", "GitPython == 3.1.43", "idna == 3.10", "numpy == 2.1.2", "pandas == 2.2.3", "platformdirs == 4.3.6", "python-dateutil == 2.9.0.post0", "pytz == 2024.2", "requests == 2.32.3", "requests-cache == 1.2.1", "six == 1.16.0", "tabulate == 0.9.0", "tqdm == 4.66.5", "typing_extensions == 4.12.2", "tzdata == 2024.2", "url-normalize == 1.4.3", "urllib3 == 2.2.3",] +dependencies = [ "attrs == 24.2.0", "cattrs == 24.1.2", "certifi == 2024.8.30", "charset-normalizer == 3.4.0", "exceptiongroup == 1.2.2", "GitPython == 3.1.43", "idna == 3.10", "numpy == 2.1.2", "pandas == 2.2.3", "platformdirs == 4.3.6", "python-dateutil == 2.9.0.post0", "pytz == 2024.2", "PyYAML == 6.0.2", "requests == 2.32.3", "requests-cache == 1.2.1", "six == 1.16.0", "tabulate == 0.9.0", "tqdm == 4.66.5", "typing_extensions == 4.12.2", "tzdata == 2024.2", "url-normalize == 1.4.3", "urllib3 == 2.2.3", "xmltodict == 0.14.2",] [[project.authors]] name = "Raphina Liu" email = "raphina@kth.se" diff --git a/requirements.txt 
b/requirements.txt index 05ef5995..38435594 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ pandas==2.2.3 platformdirs==4.3.6 python-dateutil==2.9.0.post0 pytz==2024.2 +PyYAML==6.0.2 requests==2.32.3 requests-cache==1.2.1 six==1.16.0 @@ -19,3 +20,4 @@ typing_extensions==4.12.2 tzdata==2024.2 url-normalize==1.4.3 urllib3==2.2.3 +xmltodict==0.14.2 diff --git a/tool/compare_commits.py b/tool/compare_commits.py index 9a58dd7a..9eb28a14 100644 --- a/tool/compare_commits.py +++ b/tool/compare_commits.py @@ -1,33 +1,33 @@ import requests import logging import os -from tool_config import setup_cache - -github_token = os.getenv("GITHUB_API_TOKEN") - -headers = { - "Authorization": f"Bearer {github_token}", - "Accept": "application/vnd.github.v3+json", -} - - -def tag_format(tag, package_name): - tag_formats = [ - f"{tag}", - f"v{tag}", - f"r{tag}", - f"release-{tag}", - f"parent-{tag}", - f"{package_name}@{tag}", - f"{package_name}-v{tag}", - f"{package_name}_v{tag}", - f"{package_name}-{tag}", - f"{package_name}_{tag}", - # Below: further tag formats found in the AROMA paper, table 3: https://dl.acm.org/doi/pdf/10.1145/3643764 - f"release/{tag}", - f"{tag}-release", - f"v.{tag}", - ] +from tool_config import get_cache_manager, make_github_request + +cache_manager = get_cache_manager() + + +def tag_format(tag, package_name, repo_name): + _, repo_name = repo_name.split("/") # splits owner and repo name + project_name = repo_name.split("-")[-1] # deals with lots of maven- repos (e.g., surefire, etc) + tag_formats = set( + [ + f"{tag}", + f"v{tag}", + f"v_{tag}", + f"r{tag}", + f"release-{tag}", + f"parent-{tag}", + # Below: further tag formats found in the AROMA paper, table 3: https://dl.acm.org/doi/pdf/10.1145/3643764 + f"release/{tag}", + f"{tag}-release", + f"v.{tag}", + ] + + [ + f"{name}{suffix}" + for name in [package_name, repo_name, project_name] + for suffix in [f"@{tag}", f"-v{tag}", f"_v{tag}", f"-{tag}", f"_{tag}"] + ] + ) only_package_name, artifact_id_parts = None, None if "/" in package_name: # NPM-based @@ -38,169 +38,228 @@ def tag_format(tag, package_name): artifact_id_parts = only_package_name.split("-") if only_package_name: - tag_formats.append(f"{only_package_name}@{tag}") - tag_formats.append(f"{only_package_name}-v{tag}") - tag_formats.append(f"{only_package_name}-{tag}") - tag_formats.append(f"{only_package_name}_{tag}") + tag_formats.add(f"{only_package_name}@{tag}") + tag_formats.add(f"{only_package_name}-v{tag}") + tag_formats.add(f"{only_package_name}-{tag}") + tag_formats.add(f"{only_package_name}_{tag}") if artifact_id_parts and len(artifact_id_parts) > 1: # p1, p2, p3 from AROMA # needs to be reversed with [::-1] because p1 is actually the last element, p2 the 2nd to last, etc - tag_formats.extend(["-".join(artifact_id_parts[::-1][: i + 1]) + tag for i in range(len(artifact_id_parts))]) + tag_formats.update(["-".join(artifact_id_parts[::-1][: i + 1]) + tag for i in range(len(artifact_id_parts))]) return tag_formats -def get_commit_authors(headers, packages_data): - logging.info("Getting commits...") +def find_existing_tags(tag_formats, repo_name): + for tag_format in tag_formats: + tag_url = f"https://api.github.com/repos/{repo_name}/git/ref/tags/{tag_format}" + response = make_github_request(tag_url, silent=True) + if response: + return tag_format + return None + + +def get_commit_info(commit): + if commit.get("committer") is None: + committer_login = "No committer info" + return None + + sha = commit.get("sha") + node_id = 
commit.get("node_id") + commit_url = commit.get("url") + author_data = commit.get("commit").get("author") + author_name = author_data.get("name") + author_email = author_data.get("email") + author_info = commit.get("author") + + if author_info is None: + author_login = "No author info" + author_type = "No author info" + author_id = "No author info" + else: + author_login = commit.get("author").get("login", "No_author_login") + author_id = commit.get("author").get("id", "No_author_id") + author_type = commit.get("author").get("type", "No_author_type") + + return { + "sha": sha, + "node_id": node_id, + "commit_url": commit_url, + "name": author_name, + "email": author_email, + "login": author_login, + "a_type": author_type, + "id": author_id, + } + + +def get_authors_from_response(url, data, package_info): + result = { + "repo": package_info.get("repo_pure"), + "repo_name": package_info.get("repo_name"), + "category": package_info.get("message"), + "compare_url": url, + } + + authors_info = [] + commits = data.get("commits") + if commits: + for commit in commits: + # Retrieve commit info from cache + commit_info = cache_manager.commit_comparison_cache.get_authors_from_url(commit.get("url")) + if not commit_info: + commit_info = get_commit_info(commit) + cache_manager.commit_comparison_cache.cache_authors_from_url(commit.get("url"), commit_info) + + if commit_info: + authors_info.append(commit_info) + result.update( + { + "authors": authors_info, + "tag1": package_info.get("chosen_v1"), + "tag2": package_info.get("chosen_v2"), + } + ) + else: + result.update( + { + "tag1": package_info.get("version1"), + "tag2": package_info.get("version2"), + "commits_info_message": "No commits found", + "status_code": 200, + } + ) + + return result + + +def get_authors_from_tags(tag1, tag2, package, package_info): + repo_name = package_info.get("repo_name") + tag_formats_old = tag_format(tag1, package, repo_name) + existing_tag_format_old = find_existing_tags(tag_formats_old, repo_name) + tag_formats_new = tag_format(tag2, package, repo_name) + existing_tag_format_new = find_existing_tags(tag_formats_new, repo_name) + category = package_info.get("message") + + compare_url = ( + f"https://api.github.com/repos/{repo_name}/compare/{existing_tag_format_old}...{existing_tag_format_new}" + ) + response = make_github_request(compare_url, max_retries=2) + + if not response: + status_old = "GitHub old tag not found" + status_new = "GitHub new tag not found" + old_tag_found, new_tag_found = False, False + if existing_tag_format_old: + status_old = existing_tag_format_old + old_tag_found = True + for tag_old in tag_formats_old: + old_tag_url = f"https://api.github.com/repos/{repo_name}/git/ref/tags/{tag_old}" + response = requests.get(old_tag_url) + if response.status_code == 200: + status_old = tag_old + old_tag_found = True + break + + if not old_tag_found: + for tag_new in tag_formats_new: + new_tag_url = f"https://api.github.com/repos/{repo_name}/git/ref/tags/{tag_new}" + response = requests.get(new_tag_url) + if response.status_code == 200: + status_new = tag_new + new_tag_found = True + break + + return { + "tag1": existing_tag_format_old if existing_tag_format_old else tag_formats_old[-1], + "tag2": existing_tag_format_new if existing_tag_format_new else tag_formats_new[-1], + "status_old": status_old, + "status_new": status_new, + "category": "Upgraded package", + "repo_name": package_info.get("repo_name"), + } + + return get_authors_from_response(compare_url, response, package_info) + + +def 
get_patch_authors(repo_name, patch_name, path, release_version_sha, headers): + url = f"https://api.github.com/repos/{repo_name}/commits?path=.yarn/patches/{path}&sha={release_version_sha}" + patch_info = { + "patch_name": patch_name, + "repo_name": repo_name, + "commit_url": url, + } + + response = make_github_request(url, headers=headers) + authors_info = [] + if response: + for commit in response: + sha = commit.get("sha") + node_id = commit.get("node_id") + commit_url = commit.get("url") + author_data = commit.get("commit").get("author") + author_name = author_data.get("name") + author_email = author_data.get("email") + author_info = commit.get("author") + author_type = author_data.get("type") + if author_info is None: + author_login = "null" + else: + author_login = commit.get("author").get("login") + author_id = commit.get("author").get("id") + if commit.get("committer") is None: + committer_login = "null" + else: + committer_login = commit.get("committer").get("login") + committer_id = commit.get("committer").get("id") + committer_type = commit.get("committer").get("type") + + authors_info.append( + { + "sha": sha, + "node_id": node_id, + "commit_url": commit_url, + "name": author_name, + "email": author_email, + "login": author_login, + "a_type": author_type, + "id": author_id, + "committer_login": committer_login, + "committer_id": committer_id, + "c_type": committer_type, + } + ) + patch_info.update( + { + "category": "patch", + "authors": authors_info, + } + ) + else: + patch_info.update( + { + "authors": None, + "error": True, + "error_message": response.status_code, + } + ) + + return patch_info + +def get_commit_authors(packages_data): + logging.info("Getting commits for packages...") authors_per_package = {} for package, package_info in packages_data.items(): if package_info.get("compare_message") == "COMPARE": - print(f"Getting commits of {package}...") - repo = package_info.get("repo_pure") - repo_name = package_info.get("repo_name") - category = package_info.get("message") - - tag1 = package_info.get("version1") - tag2 = package_info.get("version2") - tag1_chosen = package_info.get("chosen_v1") tag2_chosen = package_info.get("chosen_v2") - - authors_info = [] - - comparison_found = False - compare_urls = [] - - tag_formats_new = tag_format(tag2_chosen, package) - tag_formats_old = tag_format(tag1_chosen, package) - - for tag_format_old, tag_format_new in zip(tag_formats_old, tag_formats_new): - compare_urls.append( - f"https://api.github.com/repos/{repo_name}/compare/{tag_format_old}...{tag_format_new}" - ) - - for compare_url in compare_urls: - # try: - response = requests.get(compare_url, headers=headers) - if response.status_code == 200: - comparison_found = True - break - - old_tag_urls = [] - new_tag_urls = [] - - if comparison_found is False: - for tag_old in tag_formats_old: - old_tag_urls.append(f"https://api.github.com/repos/{repo_name}/git/ref/tags/{tag_old}") - for tag_new in tag_formats_new: - new_tag_urls.append(f"https://api.github.com/repos/{repo_name}/git/ref/tags/{tag_new}") - - for old_tag_url in old_tag_urls: - try: - response = requests.get(old_tag_url, headers=headers) - if response.status_code == 200: - status_old = tag_old - break - else: - status_old = "GitHub old tag not found" - category = "Upgraded package" - except (ValueError, KeyError) as e: - logging.error("Error: %s", str(e)) - print(f"Error: {e}") - # Error_old = f"{e}" - continue - - for new_tag_url in new_tag_urls: - try: - response = requests.get(new_tag_url, headers=headers) - if 
response.status_code == 200: - status_new = tag_new - break - else: - status_new = "GitHub new tag not found" - category = "Upgraded package" - except (ValueError, KeyError) as e: - logging.error("Error: %s", str(e)) - print(f"Error: {e}") - continue - - authors_per_package[package] = { - "repo_name": repo_name, - "tag1": tag_old, - "status_old": status_old, - "tag2": tag_new, - "status_new": status_new, - "category": category, - } - - else: - response_json = response.json() - commits = response_json.get("commits") - - if commits: - for commit in commits: - sha = commit.get("sha") - node_id = commit.get("node_id") - commit_url = commit.get("url") - author_data = commit.get("commit").get("author") - author_name = author_data.get("name") - author_email = author_data.get("email") - author_info = commit.get("author") - - if author_info is None: - author_login = "No author info" - author_type = "No author info" - author_id = "No author info" - else: - author_login = commit.get("author").get("login", "No_author_login") - author_id = commit.get("author").get("id", "No_author_id") - author_type = commit.get("author").get("type", "No_author_type") - - if commit.get("committer") is None: - committer_login = "No committer info" - else: - committer_login = commit.get("committer").get("login", None) - committer_id = commit.get("committer").get("id", None) - committer_type = commit.get("committer").get("type", None) - - authors_info.append( - { - "sha": sha, - "node_id": node_id, - "commit_url": commit_url, - "name": author_name, - "email": author_email, - "login": author_login, - "a_type": author_type, - "id": author_id, - "committer_login": committer_login, - "committer_id": committer_id, - "c_type": committer_type, - } - ) - - authors_per_package[package] = { - "repo": repo, - "repo_name": repo_name, - "tag1": tag1_chosen, - "tag2": tag2_chosen, - "category": category, - "compare_url": compare_url, - "authors": authors_info, - } - - else: - authors_per_package[package] = { - "repo": repo, - "repo_name": repo_name, - "tag1": tag1, - "tag2": tag2, - "category": category, - "compare_url": compare_url, - "status_code": response.status_code, - "commits_info_message": "No commits found", - } + data = cache_manager.commit_comparison_cache.get_authors_from_tags(package, tag1_chosen, tag2_chosen) + if not data: + # Cache miss, get authors from GitHub + data = get_authors_from_tags(tag1_chosen, tag2_chosen, package, package_info) + cache_manager.commit_comparison_cache.cache_authors_from_tags(package, tag1_chosen, tag2_chosen, data) + authors_per_package[package] = data else: authors_per_package[package] = { @@ -220,30 +279,30 @@ def get_commit_authors(headers, packages_data): def get_patch_commits(headers, repo_name, release_version, patch_data): logging.info("Getting commits for patches...") - - get_release_v_api = f"https://api.github.com/repos/{repo_name}/tags?per_page=100" - - grv_response = requests.get(get_release_v_api, headers=headers) - grv_response_json = grv_response.json() - - for release in grv_response_json: - if release.get("name") == release_version: - release_version_sha = release.get("commit").get("sha") - break + release_version_sha = cache_manager.github_cache.get_tag_to_sha(repo_name, release_version) + if not release_version_sha: + get_release_v_api = f"https://api.github.com/repos/{repo_name}/git/ref/tags/{release_version}" + response = requests.get(get_release_v_api, headers=headers) + if response.status_code == 200: + response_json = response.json() + release_version_sha = 
response_json.get("object").get("sha") else: release_version_sha = None + cache_manager.github_cache.cache_tag_to_sha( + repo_name, release_version, "No release found" if release_version_sha is None else release_version_sha + ) + elif release_version_sha == "No release found": + release_version_sha = None authors_per_patches = {} - for changed_patch, details in patch_data.items(): authors_info = [] path = details.get("patch_file_path") if path is None: - api = None authors_per_patches[changed_patch] = { "patch_file_path": path, "repo_name": repo_name, - "api": api, + "api": None, "error": True, "error_message": "No patch file path found", } @@ -259,72 +318,20 @@ def get_patch_commits(headers, repo_name, release_version, patch_data): } continue - api = f"https://api.github.com/repos/{repo_name}/commits?path=.yarn/patches/{path}&sha={release_version_sha}" - response = requests.get(api, headers=headers) - - if response.status_code == 200: - response_json = response.json() - for commit in response_json: - sha = commit.get("sha") - node_id = commit.get("node_id") - commit_url = commit.get("url") - author_data = commit.get("commit").get("author") - author_name = author_data.get("name") - author_email = author_data.get("email") - author_info = commit.get("author") - author_type = author_data.get("type") - if author_info is None: - author_login = "null" - else: - author_login = commit.get("author").get("login") - author_id = commit.get("author").get("id") - if commit.get("committer") is None: - committer_login = "null" - else: - committer_login = commit.get("committer").get("login") - committer_id = commit.get("committer").get("id") - committer_type = commit.get("committer").get("type") - - authors_info.append( - { - "sha": sha, - "node_id": node_id, - "commit_url": commit_url, - "name": author_name, - "email": author_email, - "login": author_login, - "a_type": author_type, - "id": author_id, - "committer_login": committer_login, - "committer_id": committer_id, - "c_type": committer_type, - } - ) - else: - authors_per_patches[changed_patch] = { - "patch_name": changed_patch, - "repo_name": repo_name, - "commit_url": api, - "authors": None, - "error": True, - "error_message": response.status_code, - } - - authors_per_patches[changed_patch] = { - "patch_name": changed_patch, - "repo_name": repo_name, - "category": "patch", - "commit_url": api, - "authors": authors_info, - } + data = cache_manager.commit_comparison_cache.get_patch_authors(repo_name, path, release_version_sha) + if not data: + # Cache miss, get authors from GitHub + data = get_patch_authors(repo_name, changed_patch, path, release_version_sha, headers) + cache_manager.commit_comparison_cache.cache_patch_authors(repo_name, path, release_version_sha, data) + authors_per_patches[changed_patch] = data return authors_per_patches def get_commit_results(api_headers, repo_name, release_version, patch_data, packages_data): - setup_cache("package_commits") + cache_manager._setup_requests_cache(cache_name="compare_commits") authors_per_patches_result = get_patch_commits(api_headers, repo_name, release_version, patch_data) - authors_per_package_result = get_commit_authors(headers, packages_data) + authors_per_package_result = get_commit_authors(packages_data) commit_results = {**authors_per_patches_result, **authors_per_package_result} return commit_results diff --git a/tool/compare_packages.py b/tool/compare_packages.py index 130c40cc..04329769 100644 --- a/tool/compare_packages.py +++ b/tool/compare_packages.py @@ -1,6 +1,14 @@ import re import 
logging +MESSAGE_TO_VERSIONS_MAPPING = { + "newly_added": "Newly added package", + "deleted": "Deleted package", + "upgraded": "Upgraded package", + "downgraded": "Downgraded package", + "no_change": "No change", +} + def parse_dependencies(file_path): dependencies = {} @@ -96,63 +104,48 @@ def is_version_greater(v1, v2): def category_dependencies(dep_file_1, dep_file_2): - differences = choose_compare_version(dep_file_1, dep_file_2) - newly_added_pkg = {} - deleted_pkg = {} - upgraded_pkg = {} - downgraded_pkg = {} - no_change_pkg = {} - - for dep, versions in differences.items(): - v1, v2 = versions["chosen_v1"], versions["chosen_v2"] - allv1, allv2 = versions["version1"], versions["version2"] - + def categorize_dependency(v1, v2, allv1, allv2): if v1 is None or v2 is None: if not allv1: - differences[dep]["message"] = "Newly added package" - newly_added_pkg[dep] = { - "version1": allv1, - "version2": allv2, - "message": "Newly added package", - } + return "newly_added", allv1, allv2 elif not allv2: - differences[dep]["message"] = "Deleted package" - deleted_pkg[dep] = { - "version1": allv1, - "version2": allv2, - "message": "Deleted package", - } - + return "deleted", allv1, allv2 else: if is_version_greater(v2, v1): - differences[dep]["message"] = "Upgraded package" - upgraded_pkg[dep] = { - "version1": v1, - "version2": v2, - "message": "Upgraded package", - } + return "upgraded", v1, v2 elif is_version_greater(v1, v2): - differences[dep]["message"] = "Downgraded package" - downgraded_pkg[dep] = { - "version1": v1, - "version2": v2, - "message": "Downgraded package", - } + return "downgraded", v1, v2 else: - differences[dep]["message"] = "No change" - no_change_pkg[dep] = { - "version1": v1, - "version2": v2, - "message": "No change", - } + return "no_change", v1, v2 + return None + + differences = choose_compare_version(dep_file_1, dep_file_2) + gathered_categories = { + "newly_added": {}, + "deleted": {}, + "upgraded": {}, + "downgraded": {}, + "no_change": {}, + } + + for dep, versions in differences.items(): + v1, v2 = versions["chosen_v1"], versions["chosen_v2"] + allv1, allv2 = versions["version1"], versions["version2"] + + categorized_dependency = categorize_dependency(v1, v2, allv1, allv2) + if categorized_dependency: + category, v1, v2 = categorized_dependency + message = MESSAGE_TO_VERSIONS_MAPPING[category] + differences[dep]["message"] = message + gathered_categories[category][dep] = { + "version1": v1, + "version2": v2, + "message": message, + } return ( differences, - newly_added_pkg, - deleted_pkg, - upgraded_pkg, - downgraded_pkg, - no_change_pkg, + *gathered_categories.values(), ) @@ -301,6 +294,7 @@ def get_repo_from_SA(dep_file_1, dep_file_2, SA_old, SA_new): def changed_patch(package_data_old, package_data_new): + logging.info("Comparing patches...") patches_change = {} no_change_patches = {} if package_data_old and package_data_new: diff --git a/tool/database/github_commit.db b/tool/database/github_commit.db deleted file mode 100644 index c7e00006..00000000 Binary files a/tool/database/github_commit.db and /dev/null differ diff --git a/tool/database/github_pr_data.db b/tool/database/github_pr_data.db deleted file mode 100644 index ea3be274..00000000 Binary files a/tool/database/github_pr_data.db and /dev/null differ diff --git a/tool/database/github_prr_data_new.db b/tool/database/github_prr_data_new.db deleted file mode 100644 index 7f0798b5..00000000 Binary files a/tool/database/github_prr_data_new.db and /dev/null differ diff --git 
a/tool/database/github_repo_info_all.db b/tool/database/github_repo_info_all.db deleted file mode 100644 index 95eb3ebf..00000000 Binary files a/tool/database/github_repo_info_all.db and /dev/null differ diff --git a/tool/extract_deps.py b/tool/extract_deps.py index aed07810..b097160d 100644 --- a/tool/extract_deps.py +++ b/tool/extract_deps.py @@ -10,10 +10,15 @@ import sys import shutil from collections import defaultdict +import json +import hashlib +from pathlib import Path +import yaml -from tool_config import PNPM_LIST_COMMAND +from tool_config import PNPM_LIST_COMMAND, get_cache_manager logger = logging.getLogger(__name__) +cache_manager = get_cache_manager() MVN_DEPENDENCY_PLUGIN = "org.apache.maven.plugins:maven-dependency-plugin:3.8.1" append_dependency_goal = lambda goal: f"{MVN_DEPENDENCY_PLUGIN}:{goal}" @@ -37,7 +42,7 @@ def extract_deps_from_pnpm_lockfile(pnpm_lockfile_yaml): yaml_version = yaml_data.get("lockfileVersion") if yaml_version != "9.0": logging.error("Invalid pnpm lockfile version: %s", yaml_version) - print("The pnpm lockfile version is not supported(yet): ", yaml_version) + logging.error("The pnpm lockfile version is not supported(yet): ", yaml_version) # end the process sys.exit(1) @@ -188,12 +193,6 @@ def extract_deps_from_v1_yarn(yarn_lock_file): return {"resolutions": [], "patches": []} -def extract_deps_from_pnpm_lock_yaml(pnpm_lock_yaml_file): - """ - Extract dependencies from a pnpm-lock.yaml file. - """ - - def get_pnpm_dep_tree(folder_path, version_tag, project_repo_name): """ Get pnpm dependency tree for the given project. @@ -247,7 +246,7 @@ def get_pnpm_dep_tree(folder_path, version_tag, project_repo_name): logging.info("Getting pnpm dependency tree by running %s", PNPM_LIST_COMMAND) command = PNPM_LIST_COMMAND - print("Getting pnpm dependency tree...") + logging.info("Getting pnpm dependency tree...") result = subprocess.run( command, check=True, @@ -273,7 +272,7 @@ def get_pnpm_dep_tree(folder_path, version_tag, project_repo_name): return result.stdout.splitlines(), folder_path except subprocess.CalledProcessError as e: - print(f"An error occurred: {e}") + logging.error(f"An error occurred: {e}") sys.exit(1) finally: @@ -309,8 +308,9 @@ def extract_deps_from_pnpm_mono(folder_path, version_tag, project_repo_name): logging.info("Extracting dependencies from pnpm list output") for line in tree: + # TODO: what's this? if "ledger-live-desktop" in line or "production dependency, optional only, dev only" in line: - print("ledger-live-desktop found") + logging.info("ledger-live-desktop found") continue match = dep_pattern.search(line) @@ -318,7 +318,7 @@ def extract_deps_from_pnpm_mono(folder_path, version_tag, project_repo_name): dep_name = match.group(1).strip() dep_version = match.group(2).strip() dependencies[dep_name].append(dep_version) - print("dependency found", dep_name) + logging.info("dependency found", dep_name) # logging.info(f"Number of dependencies({version_tag}): {len(dependencies)}") @@ -365,6 +365,16 @@ def extract_deps_from_pnpm_mono(folder_path, version_tag, project_repo_name): return deps_list_data +def get_pom_hash(repo_path): + """Generate a hash of the pom.xml file to detect changes""" + pom_path = Path(repo_path) / "pom.xml" + if not pom_path.exists(): + return None + + with open(pom_path, "rb") as f: + return hashlib.sha256(f.read()).hexdigest() + + def extract_deps_from_maven(repo_path): """ Extract dependencies from a Maven package, given the path to its locally cloned repo. 
@@ -376,48 +386,56 @@ def extract_deps_from_maven(repo_path):
         dict: A dictionary containing the extracted dependencies.
     """
-    def parse_mvn_dependency_logs(file_path):
+    def parse_mvn_dependency_logs(log_file):
         """
-        Parse the logs generated by the Maven dependency plugin.
+        Parse Maven dependency resolution logs to extract dependency information.
         Args:
-            file_path (str): The path to the log file.
+            log_file (str): Path to the Maven dependency resolution log file
         Returns:
-            List[Dict]: A list of dictionaries containing the dependencies. Each dictionary
-            contains the groupId, artifactId, and version of a dependency.
+            list: List of dictionaries containing dependency information
         """
+        dependencies = []
+
+        try:
+            with open(log_file, "r") as f:
+                for line in f:
+                    parts = line.strip().split(":")
+                    if len(parts) >= 4:  # groupId:artifactId:type:version, so parts[3] exists; [2] is the type
+                        dep_info = {"groupId": parts[0], "artifactId": parts[1], "version": parts[3].split()[0]}
+                        dependencies.append(dep_info)
+
+        except FileNotFoundError:
+            logging.error("Dependency log file not found: %s", log_file)
+        except Exception as e:
+            logging.error("Error parsing dependency log: %s", str(e))
+
+        return dependencies
+
+    # Generate a cache key based on the repo path and pom.xml hash
+    pom_hash = get_pom_hash(repo_path)
+    if not pom_hash:
+        logging.error("No pom.xml found in %s", repo_path)
+        return {"resolutions": [], "patches": []}
-        result = []
-        with open(file_path, "r") as file:
-            for line in file:
-                line = line.strip()
-                # If the line starts with <groupId>:<artifactId>:<type>:<version>,
-                # we want to keep the first, second, and fourth parts
-                # Otherwise, ignore the line
-                if re.match(r"^\S+:\S+:\S+:\S+", line):
-                    parts = line.split(":")
-                    result.append(
-                        {
-                            "groupId": parts[0],
-                            "artifactId": parts[1],
-                            "version": parts[3].split()[0],
-                        }
-                    )
-        return result
-
-    # First, switch to the repository directory
+    cached_deps = cache_manager.maven_cache.get_dependencies(repo_path, pom_hash)
+    if cached_deps:
+        logging.info(f"Using cached Maven dependencies for {repo_path}")
+        return cached_deps
+
+    # If we reach here, we need to resolve dependencies
     current_dir = os.getcwd()
     os.chdir(repo_path)
     retrieval_commands = {
-        "regular": [  # "Regular" dependencies
+        "regular": [
            "mvn",
            RESOLVE_GOAL,
            "-Dsort=true",
            f"-DoutputFile={RESOLVE_LOG}",
        ],
-        "plugins": [  # Plugin dependencies
+        "plugins": [
            "mvn",
            RESOLVE_PLUGINS_GOAL,
            "-Dsort=true",
@@ -426,31 +444,34 @@ def parse_mvn_dependency_logs(file_path):
     }
     try:
-        # First, running both commands to get the dependencies
+        # Run Maven commands to resolve dependencies
         subprocess.run(retrieval_commands["regular"], check=True)
         subprocess.run(retrieval_commands["plugins"], check=True)
-        # Then, parsing them from the log files
+
+        # Parse the dependency logs
         retrieved_deps = parse_mvn_dependency_logs(RESOLVE_LOG)
         retrieved_plugins = parse_mvn_dependency_logs(RESOLVE_PLUGINS_LOG)
-        # Go back to the original directory
+
+        # Go back to original directory
         os.chdir(current_dir)
+
+        # Format the dependencies
         parsed_deps = [f"{dep['groupId']}:{dep['artifactId']}@{dep['version']}" for dep in retrieved_deps]
         parsed_plugins = [
            f"{plugin['groupId']}:{plugin['artifactId']}@{plugin['version']}" for plugin in retrieved_plugins
        ]
-        # Using a set to avoid duplicates
-        resolutions = set(parsed_deps + parsed_plugins)
-        deps_list_data = {"resolutions": resolutions, "patches": []}
-        # TODO: confirm resolutions?
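+        # Regular and plugin dependencies are merged and de-duplicated below; the result is
+        # cached keyed on (repo_path, pom.xml hash), so an unchanged pom skips the mvn runs above.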
+ # Create the result + deps_list_data = {"resolutions": list(set(parsed_deps + parsed_plugins)), "patches": []} + + # Cache the results + cache_manager.maven_cache.cache_dependencies(repo_path, pom_hash, deps_list_data) + return deps_list_data except subprocess.CalledProcessError as e: - print(f"An error occurred: {e}") - logging.error( - "An error occurred while extracting dependencies from pom.xml file: %s", - str(e), - ) + os.chdir(current_dir) + logging.error("Error resolving Maven dependencies: %s", str(e)) return {"resolutions": [], "patches": []} diff --git a/tool/get_pr_info.py b/tool/get_pr_info.py index a85f66ab..18aeb778 100644 --- a/tool/get_pr_info.py +++ b/tool/get_pr_info.py @@ -5,11 +5,11 @@ import time import copy import logging +from tool_config import get_cache_manager, make_github_request +cache_manager = get_cache_manager() GITHUB_TOKEN = os.getenv("GITHUB_API_TOKEN") -# if not GITHUB_TOKEN: -# raise ValueError("GitHub API token is not set in the environment variables.") headers = { "Authorization": f"Bearer {GITHUB_TOKEN}", @@ -19,17 +19,6 @@ url = "https://api.github.com/graphql" -conn = sqlite3.connect("database/github_pr_data.db") -c = conn.cursor() - -c.execute( - """CREATE TABLE IF NOT EXISTS pr_info_sample - (package TEXT, commit_sha TEXT, commit_node_id TEXT, pr_data TEXT)""" -) - -conn.commit() - - def fetch_pull_requests(commit_node_id): query = """ query Edges($nodeId: ID!, $first: Int) { @@ -103,25 +92,8 @@ def fetch_pull_requests(commit_node_id): "nodeId": f"{commit_node_id}", "first": 5, } - - body = json.dumps({"query": query, "variables": variables}) - - response = requests.post(url, data=body, headers=headers) - - if response.status_code != 200: - # retry 10 sec later and try 5 times - for i in range(5): - print(f"Retrying in 10 seconds...") - time.sleep(10) - response = requests.post(url, data=body, headers=headers) - if response.status_code == 200: - break - else: - raise Exception(response.status_code, response.text) - - pr_info = response.json() - - return pr_info + body = {"query": query, "variables": variables} + return make_github_request(url, method="POST", json_data=body, headers=headers, max_retries=5) def get_pr_info(data): @@ -133,7 +105,7 @@ def get_pr_info(data): for package, info in commits_data.items(): repo_name = info.get("repo_name") - print(f"Checking PR info in {package}'s repository: ", repo_name) + logging.info(f"Checking PR info in {package}'s repository: {repo_name}") authors = info.get("authors", []) for author in authors: @@ -141,23 +113,20 @@ def get_pr_info(data): commit_node_id = author.get("node_id") commit_url = author.get("commit_url") - c.execute( - "SELECT pr_data FROM pr_info_sample WHERE commit_node_id=?", - (commit_node_id,), - ) - result = c.fetchone() - - if result: - pr_info = json.loads(result[0]) - else: + pr_data = cache_manager.github_cache.get_pr_info(commit_node_id) + if not pr_data: if commit_node_id: pr_info = fetch_pull_requests(commit_node_id) - - c.execute( - "INSERT INTO pr_info_sample (package, commit_sha, commit_node_id, pr_data) VALUES (?, ?, ?, ?)", - (package, commit_sha, commit_node_id, json.dumps(pr_info)), + cache_manager.github_cache.cache_pr_info( + { + "package": package, + "commit_sha": commit_sha, + "commit_node_id": commit_node_id, + "pr_info": pr_info, + } ) - conn.commit() + else: + pr_info = pr_data["pr_info"] all_info = { "package": package, diff --git a/tool/get_pr_review.py b/tool/get_pr_review.py index 4f736052..524383a4 100644 --- a/tool/get_pr_review.py +++ 
b/tool/get_pr_review.py @@ -5,11 +5,12 @@ import json import copy import logging +from tool_config import get_cache_manager, make_github_request +cache_manager = get_cache_manager() GITHUB_TOKEN = os.getenv("GITHUB_API_TOKEN") - headers = { "Authorization": f"Bearer {GITHUB_TOKEN}", "Accept": "application/vnd.github.v4+json", @@ -17,22 +18,8 @@ url = "https://api.github.com/graphql" -script_dir = Path(__file__).parent.absolute() -database_file = script_dir / "database" / "github_prr_data_new.db" -# print(database_file) - -conn = sqlite3.connect(database_file) -c = conn.cursor() - -c.execute( - """CREATE TABLE IF NOT EXISTS new_pr_reviewinfo_6 - (package TEXT, repo TEXT, author TEXT, first_prr_data TEXT, search_string TEXT)""" -) - -conn.commit() - -def get_first_pr_info(search_string): +def get_first_pr_info(repo_name, review_author_login): query = """ query($query: String!, $type: SearchType!, $last: Int!) {search(query: $query, type: $type, last: $last) @@ -76,24 +63,14 @@ def get_first_pr_info(search_string): """ + search_string = f"repo:{repo_name} is:pr reviewed-by:{review_author_login} sort:author-date-asc" variables = {"query": f"{search_string}", "last": 1, "type": "ISSUE"} - - body = json.dumps({"query": query, "variables": variables}) - - response = requests.post(url, data=body, headers=headers) - - if response.status_code != 200: - raise Exception(response.status_code, response.text) - - first_prr_info = response.json() - - return first_prr_info + body = {"query": query, "variables": variables} + return make_github_request(url, method="POST", json_data=body, headers=headers) def get_pr_review_info(data): logging.info("Getting PR review info...") - print("Processing PR info...") - pr_data = copy.deepcopy(data) for package, info in pr_data.items(): @@ -114,37 +91,14 @@ def get_pr_review_info(data): if merge_state == "MERGED" and len(reviewer_info) >= 1: for reviewer in reviewer_info: review_author_login = reviewer.get("review_author") - # review_author_type = reviewer.get("review_author_type") review_id = reviewer.get("review_id") - search_string = ( - f"repo:{repo_name} is:pr reviewed-by:{review_author_login} sort:author-date-asc" - ) - - c.execute( - "SELECT first_prr_data FROM new_pr_reviewinfo_6 WHERE author=? AND repo=? 
and search_string=?", - (review_author_login, repo_name, search_string), - ) - result = c.fetchone() - - if result: - first_pr_info = json.loads(result[0]) - print(f"get from db:{review_author_login}") - else: + first_pr_info = cache_manager.github_cache.get_pr_review(repo_name, review_author_login) + if not first_pr_info: if review_author_login: - first_pr_info = get_first_pr_info(search_string) - - c.execute( - "INSERT INTO new_pr_reviewinfo_6 (package, repo, author, first_prr_data, search_string) VALUES (?, ?, ?, ?, ?)", - ( - package, - repo_name, - review_author_login, - json.dumps(first_pr_info), - search_string, - ), + first_pr_info = get_first_pr_info(repo_name, review_author_login) + cache_manager.github_cache.cache_pr_review( + package, repo_name, review_author_login, first_pr_info ) - conn.commit() - useful_info = first_pr_info.get("data", {}).get("search", {}).get("nodes", []) first_review_info = useful_info[0] if useful_info else {} all_useful_first_prr_info = first_review_info.get("reviews", {}).get("edges", []) @@ -186,9 +140,9 @@ def get_pr_review_info(data): reviewer["prr_data"] = useful_pr_info else: - print(f"No authors for package:{package}") + logging.info(f"No authors for package:{package}") info["prr_data"] = None - print("PR review info processed.") + logging.info("PR review info processed.") return pr_data diff --git a/tool/get_user_commit_info.py b/tool/get_user_commit_info.py index 4d447594..9e45de59 100644 --- a/tool/get_user_commit_info.py +++ b/tool/get_user_commit_info.py @@ -4,42 +4,23 @@ import sqlite3 import time from pathlib import Path -from tool_config import setup_cache +from tool_config import get_cache_manager, make_github_request, clone_repo, get_last_page_info +import git +import logging -script_dir = Path(__file__).parent.absolute() -database_file = script_dir / "database" / "github_commit.db" -# print("Database file: ", database_file) +cache_manager = get_cache_manager() -conn = sqlite3.connect(database_file) -c = conn.cursor() +def get_repo_author_commits(api_url): + # Since we can't return the commits in ascending date order, we'll just return the latest commit + # This response also holds the number of pages, so the last page will have the first commit + search_url = f"{api_url}&per_page=1" + last_page = get_last_page_info(search_url, max_retries=2, retry_delay=2, sleep_between_requests=2) + if not last_page: + return None -c.execute( - """CREATE TABLE IF NOT EXISTS commit_data ( - api_url TEXT PRIMARY KEY, - earliest_commit_sha TEXT, - repo_name TEXT, - package TEXT, - author_login TEXT, - author_commit_sha TEXT, - author_login_in_1st_commit TEXT, - author_id_in_1st_commit TEXT)""" -) - -conn.commit() - - -# logging.info("Cache [github_cache_cache] setup complete") - - -github_token = os.getenv("GITHUB_API_TOKEN") -# if not github_token: -# raise ValueError("GitHub API token is not set in the environment variables.") - -headers = { - "Authorization": f"Bearer {github_token}", - "Accept": "application/vnd.github.v3+json", -} + last_page_url = f"{search_url}&page={last_page}" + return make_github_request(last_page_url, max_retries=2, retry_delay=2, sleep_between_requests=2) def get_user_first_commit_info(data): @@ -52,23 +33,15 @@ def get_user_first_commit_info(data): Returns: dict: A dictionary with updated package information including first commit details. 
""" - setup_cache("github_commits_info") - - failed_api_urls = set() - - earliest_commit_sha = None - author_login_in_commit = None - author_id_in_commit = None - first_time_commit = None + cache_manager._setup_requests_cache("get_user_commit_info") + logging.info("Getting user commit information") packages_data = copy.deepcopy(data) - for package, info in packages_data.items(): - print(f"Processing {package}") repo_name = info["repo_name"] - - if info.get("authors"): - for author in info.get("authors"): + authors = info.get("authors") + if authors: + for author in authors: author_login = author.get("login", "No_author_login") commit_sha = author.get("sha", "No_commit_sha") author_type = author.get("a_type", "No_author_type") @@ -88,23 +61,17 @@ def get_user_first_commit_info(data): if "[bot]" in author_login or author_type == "Bot": commit_result["earliest_commit_sha"] = "It might be a bot" commit_result["commit_notice"] = "Bot author detected" - else: - api_url = f"https://api.github.com/search/commits?q=repo:{repo_name}+author:{author_login}+sort:author-date-asc" - - c.execute( - "SELECT earliest_commit_sha, author_login_in_1st_commit, author_id_in_1st_commit FROM commit_data WHERE api_url = ?", - (api_url,), - ) - data = c.fetchone() - + api_url = f"https://api.github.com/repos/{repo_name}/commits?author={author_login}" + data = cache_manager.user_commit_cache.get_user_commit(api_url) if data: + # Retrieved data from cache ( earliest_commit_sha, author_login_in_commit, author_id_in_commit, ) = data - first_time_commit = True if earliest_commit_sha == commit_sha else False + first_time_commit = earliest_commit_sha == commit_sha commit_result.update( { @@ -116,87 +83,39 @@ def get_user_first_commit_info(data): "commit_notice": "Data retrieved from cache", } ) - else: - max_retries = 2 - base_wait_time = 2 - retries = 0 - success = False - - while retries < max_retries and not success and api_url not in failed_api_urls: - response = requests.get(api_url, headers=headers) - time.sleep(2) - - if response.status_code == 200: - success = True - commits_data = response.json() - earliest_commit_sha = ( - commits_data["items"][0]["sha"] if commits_data["items"] else None - ) - - author_login_in_commit = ( - commits_data["items"][0]["author"]["login"] if commits_data["items"] else None - ) - # author_type = commits_data['items'][0]['author']['__typename'] if commits_data['items'] else None - - author_id_in_commit = ( - commits_data["items"][0]["author"]["id"] if commits_data["items"] else None - ) - # api_url_cache[api_url] = earliest_commit_sha - - first_time_commit = True if earliest_commit_sha == commit_sha else False - - c.execute( - "INSERT INTO commit_data (api_url, earliest_commit_sha, repo_name, package, author_login, author_commit_sha, author_login_in_1st_commit, author_id_in_1st_commit) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", - ( - api_url, - earliest_commit_sha, - repo_name, - package, - author_login, - commit_sha, - author_login_in_commit, - author_id_in_commit, - ), - ) - conn.commit() - - commit_result.update( - { - "api_url": api_url, # "https://api.github.com/search/commits?q=repo:{repo_name}+author:{author_login}+sort:author-date-asc - "earliest_commit_sha": earliest_commit_sha, - "author_login_in_1st_commit": author_login_in_commit, - "author_id_in_1st_commit": author_id_in_commit, - "is_first_commit": first_time_commit, - "commit_notice": "Data retrieved from API", - } - ) - - else: - print(f"Error: {response.status_code}") - remaining = response.headers.get("X-RateLimit-Remaining") 
- reset_time = response.headers.get("X-RateLimit-Reset") - wait_time = max(int(reset_time) - int(time.time()), 0) - print(f"Rate limit remaining: {remaining}") - - if remaining == "0": - time.sleep(wait_time) - - else: - time.sleep(base_wait_time) - - retries += 1 - print(f"Retrying...{retries}/{max_retries} for {api_url}") - - if not success: + # Data not found in cache, need to make API request + result = get_repo_author_commits(api_url) + if result: + earliest_commit = result[0] + earliest_commit_sha = earliest_commit["sha"] + author_login_in_commit = earliest_commit["author"]["login"] + author_id_in_commit = earliest_commit["author"]["id"] + first_time_commit = earliest_commit_sha == commit_sha + cache_manager.user_commit_cache.cache_user_commit( + api_url, + earliest_commit_sha, + repo_name, + package, + author_login, + commit_sha, + author_login_in_commit, + author_id_in_commit, + ) + commit_result.update( + { + "api_url": api_url, + "earliest_commit_sha": earliest_commit_sha, + "author_login_in_1st_commit": author_login_in_commit, + "author_id_in_1st_commit": author_id_in_commit, + "is_first_commit": first_time_commit, + "commit_notice": "Data retrieved from API", + } + ) + else: commit_result["commit_notice"] = f"Failed to retrieve data from API({api_url})" - failed_api_urls.add(api_url) - author["commit_result"] = commit_result - else: info["commit_result"] = None - conn.close() - return packages_data diff --git a/tool/github_repo.py b/tool/github_repo.py index 5cb37170..56cdda16 100644 --- a/tool/github_repo.py +++ b/tool/github_repo.py @@ -6,25 +6,13 @@ import logging from pathlib import Path from tqdm import tqdm - -# from datetime import datetime - +from tool_config import get_cache_manager +from typing import List TIMEOUT = 60 -script_dir = Path(__file__).parent.absolute() -database_file = script_dir / "database" / "github_repo_info_all.db" - -conn = sqlite3.connect(database_file) -c = conn.cursor() - -c.execute( - """CREATE TABLE IF NOT EXISTS pkg_github_repo_output ( - package TEXT PRIMARY KEY, - github TEXT)""" -) - -conn.commit() +cache_manager = get_cache_manager() +GITHUB_URL_PATTERN = re.compile(r"(github.*)", re.IGNORECASE) def write_output(folder_path, filename, data): @@ -42,10 +30,55 @@ def write_output(folder_path, filename, data): json.dump(data, f, indent=2) -def extract_repo_url(repo_info): - pattern = r"(github.*)" - match = re.search(pattern, repo_info, re.IGNORECASE) - return match.group(1) if match else "not github" +def extract_repo_url(repo_info: str) -> str: + """Extract GitHub repository URL from repository information.""" + if "https" not in repo_info: + # cases such as git@github:apache/maven-scm, we just remove the : + repo_info = repo_info.replace(":/", "/") + repo_info = repo_info.replace(":", "/") + match = GITHUB_URL_PATTERN.search(repo_info) + if not match: + return "not github" + + # if there is a match, there's still the possibility of the scm url having been + # put in a different form, e.g., + # github.com/apache/maven-scm/tree/maven-scm-2.1.0/maven-scm-providers/maven-scm-providers-standard + # from here, we only want the URL up until the second-most directory after github.com + url = match.group(0) + parts = url.split("/") + joined = "/".join(parts[:3]) if len(parts) > 3 else url + joined = joined if not joined.endswith(".git") else joined[:-4] + return joined + + +def get_scm_commands(pm: str, package: str) -> List[str]: + """Get the appropriate command to find a package's source code locations for the package manager.""" + if pm == 
"yarn-berry" or pm == "yarn-classic": + return [["yarn", "info", package, "repository.url"]] + elif pm == "pnpm": + return [["pnpm", "info", package, "repository.url"]] + elif pm == "npm": + return [["npm", "info", package, "repository.url"]] + elif pm == "maven": + name, version = package.split("@") + group_id, artifact_id = name.split(":") + return [ + [ + "mvn", + "org.apache.maven.plugins:maven-help-plugin:3.5.1:evaluate", + f"-Dexpression={source_code_location}", + f"-Dartifact={group_id}:{artifact_id}:{version}", + "-q", + "-DforceStdout", + ] + for source_code_location in [ + "project.scm.url", + "project.scm.connection", + "project.scm.developerConnection", + "project.url", + ] + ] + raise ValueError(f"Unsupported package manager: {pm}") def process_package( @@ -57,36 +90,27 @@ def process_package( some_errors, repos_output_json, ): - c.execute("SELECT github FROM pkg_github_repo_output WHERE package = ?", (package,)) - db_result = c.fetchone() - - if db_result: - repo_info = db_result[0] + def check_if_valid_repo_info(repo_info): + if repo_info is None or "Undefined" in repo_info or "undefined" in repo_info or "ERR!" in repo_info: + repos_output_json[package] = {"github": "Could not find"} + undefined.append(f"Undefined for {package}, {repo_info}") + return False - else: - try: - if pm == "yarn-berry" or pm == "yarn-classic": - command = ["yarn", "info", package, "repository.url"] - result = subprocess.run( - command, - capture_output=True, - text=True, - check=True, - timeout=TIMEOUT, - ) - - elif pm == "pnpm": - command = ["pnpm", "info", package, "repository.url"] - result = subprocess.run( - command, - capture_output=True, - text=True, - check=True, - timeout=TIMEOUT, - ) + url = extract_repo_url(repo_info) + repos_output_json[package] = {"github": url} + if url: + repos_output.append(url) + same_repos_deps.get("url", []).append(package) + return True + else: + some_errors.append(f"No GitHub URL for {package}\n{repo_info}") + return False - elif pm == "npm": - command = ["npm", "info", package, "repository.url"] + repo_info = cache_manager.github_cache.get_github_url(package) + valid_repo_info = False + if not repo_info: + for command in get_scm_commands(pm, package): + try: result = subprocess.run( command, capture_output=True, @@ -94,71 +118,29 @@ def process_package( check=True, timeout=TIMEOUT, ) - - elif pm == "maven": - # package is in the form of group_id:artifact_id@version -- we need all 3 - name, version = package.split("@") - group_id, artifact_id = name.split(":") - command = [ - "mvn", - "help:evaluate", - "-Dexpression=project.scm.url", - f"-Dartifact={group_id}:{artifact_id}:{version}", - "-q", - "-DforceStdout", - ] - result = subprocess.run( - command, - capture_output=True, - text=True, - check=True, - timeout=TIMEOUT, + if result.stdout: + repo_info = result.stdout + valid_repo_info = check_if_valid_repo_info(repo_info) + if valid_repo_info: + break + repo_info = None + else: + repo_info = result.stderr + except subprocess.TimeoutExpired: + logging.warning( + f"Command {command} timed out after {TIMEOUT} seconds for package {package}", ) - - else: - raise ValueError(f"Unsupported package manager: {pm}") - - repo_info = result.stdout if result.stdout else result.stderr - # print(f"Repo info for {package}: {repo_info}") - c.execute( - "INSERT OR IGNORE INTO pkg_github_repo_output (package, github) VALUES (?,?)", - (package, repo_info), - ) - conn.commit() - - except subprocess.TimeoutExpired: - logging.error( - f"Command {command} timed out after {TIMEOUT} 
seconds for package {package}", - ) - repo_info = None - - except subprocess.CalledProcessError as e: - logging.error(f"Command {command} failed for package {package}: {e}") - repo_info = None - - # TODO: npm? - package = package.replace("@npm:", "@") - - if ( - repo_info is None - or "Undefined" in repo_info - or "undefined" in repo_info - or "ERR!" in repo_info - # or "error" in repo_info - ): - repos_output_json[package] = {"github": "Could not find"} - undefined.append(f"Undefined for {package}, {repo_info}") + repo_info = None + except subprocess.CalledProcessError as e: + logging.warning(f"Command {command} failed for package {package}: {e}") + repo_info = "ERR!" + + if repo_info: + # Must still run the check if all cases were errors + check_if_valid_repo_info(repo_info) + cache_manager.github_cache.cache_github_url(package, repo_info) else: - url = extract_repo_url(repo_info) - # print(f"[INFO] Found GitHub URL for {package}: {url}") - repos_output_json[package] = {"github": url} - if url: - repos_output.append(url) - if url not in same_repos_deps: - same_repos_deps[url] = [] - same_repos_deps[url].append(package) - else: - some_errors.append(f"No GitHub URL for {package}\n{repo_info}") + check_if_valid_repo_info(repo_info) def get_github_repo_url(folder, dep_list, pm): @@ -168,7 +150,7 @@ def get_github_repo_url(folder, dep_list, pm): same_repos_deps = {} # Dict to store packages with same GitHub URL repos_output_json = {} # Dict to store packages with GitHub URL - print("Getting GitHub URLs of packages...") + logging.info("Getting GitHub URLs of packages...") total_packages_to_process = len(dep_list.get("resolutions", [])) # have not process patches with tqdm(total=total_packages_to_process, desc="Getting GitHub URLs") as pbar: diff --git a/tool/main.py b/tool/main.py index 42960971..a85f8afa 100644 --- a/tool/main.py +++ b/tool/main.py @@ -9,8 +9,6 @@ import requests from git import Repo -# from dotenv import load_dotenv - import extract_deps import github_repo @@ -24,7 +22,6 @@ import report_static import report_diff -# load_dotenv() github_token = os.getenv("GITHUB_API_TOKEN") if not github_token: raise ValueError("GitHub API token(GITHUB_API_TOKEN) is not set in the environment variables.") @@ -34,6 +31,8 @@ "Accept": "application/vnd.github.v3+json", } +cache_manager = tool_config.get_cache_manager() + def get_args(): """ @@ -91,6 +90,11 @@ def get_args(): action="store_true", help="Extract dependencies from pnpm with a specific scope using 'pnpm list --filter --depth Infinity' command. Configure the scope in tool_config.py file.", ) + parser.add_argument( + "--debug", + action="store_true", + help="Enable debug mode.", + ) # Add new smell check arguments smell_group = parser.add_argument_group("smell checks") @@ -130,22 +134,6 @@ def get_args(): return arguments -def logging_setup(log_file_path): - """ - Setup logging configuration. - - Args: - log_file_path (str): The path to the log file. - """ - logging.basicConfig( - filename=log_file_path, - level=logging.INFO, - filemode="w", - format="%(asctime)s %(levelname)-8s %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) - - def get_lockfile(project_repo_name, release_version, package_manager): """ Get the lockfile for the given project and release version. 
@@ -169,15 +157,11 @@ def get_lockfile(project_repo_name, release_version, package_manager): "maven": "pom.xml", } - tool_config.setup_cache("demo") - # logging.info("Cache [demo_cache] setup complete") - + cache_manager._setup_requests_cache(cache_name="get_lockfile") try: lockfile_name = LOOKING_FOR[package_manager] - logging.info(f"Getting {lockfile_name} for %s@%s", project_repo_name, release_version) + logging.info(f"Getting {lockfile_name} for {project_repo_name}@{release_version}") logging.info(f"Package manager: {package_manager}") - - print(f"Getting {lockfile_name} for {project_repo_name}@{release_version}") except KeyError: logging.error("Invalid package manager or lack of lockfile: %s", package_manager) raise ValueError("Invalid package manager or lack of lockfile.") @@ -189,7 +173,7 @@ def get_lockfile(project_repo_name, release_version, package_manager): data = response.json() download_url = data.get("download_url") lock_content = requests.get(download_url, timeout=60).text - print(f"Got the {lockfile_name} file from {download_url}.") + logging.info(f"Got the {lockfile_name} file from {download_url}.") else: logging.error(f"Failed to get {lockfile_name}.") raise ValueError(f"Failed to get {lockfile_name}.") @@ -207,37 +191,6 @@ def get_lockfile(project_repo_name, release_version, package_manager): return lock_content, default_branch, project_repo_name -def clone_repo(project_repo_name, release_version): - """ - Clone the repository for the given project and release version. - - Args: - project_repo_name (str): The name of the project repository. - release_version (str): The release version of the project. - - Returns: - str: The path to the cloned repository. - """ - - repo_url = f"https://github.com/{project_repo_name}.git" - - # Clone to /tmp folder; if it is already cloned, an error will be raised - try: - Repo.clone_from(repo_url, f"/tmp/{project_repo_name}") - except Exception as e: - # If the repo is already cloned, just fetch the latest changes - print(f"[INFO] Repo already cloned. Fetching the latest changes...") - repo = Repo(f"/tmp/{project_repo_name}") - - # Fetch the latest changes - repo.remotes.origin.fetch() - # Checkout to the release version - repo = Repo(f"/tmp/{project_repo_name}") - repo.git.checkout(release_version) - - return f"/tmp/{project_repo_name}" - - def get_deps(folder_path, project_repo_name, release_version, package_manager): """ Get the dependencies for the given project and release version. @@ -280,7 +233,7 @@ def get_deps(folder_path, project_repo_name, release_version, package_manager): # Example: parent package A has a child package B; we want to run it on package B, but cloning won't work here (?) 
# And even if it did, we still need to, inside the project, navigate to the child package and run the analysis there # So this is a side case not yet handled - repo_path = clone_repo(project_repo_name, release_version) + repo_path = tool_config.clone_repo(project_repo_name, release_version) deps_list_all = extract_deps.extract_deps_from_maven(repo_path) logging.info("Number of dependencies: %d", len(deps_list_all.get("resolutions", {}))) @@ -294,14 +247,6 @@ def get_deps(folder_path, project_repo_name, release_version, package_manager): len(dep_with_many_versions), ) - rv_name = release_version.replace("/", "_") - - # write_to_file(f"{rv_name}_deps_list_all.json", folder_path, deps_list_all) - # write_to_file( - # f"{rv_name}_dep_with_many_versions.json", folder_path, dep_with_many_versions - # ) - # write_to_file(f"{rv_name}_patches_info.json", folder_path, patches_info) - return deps_list_all, dep_with_many_versions, patches_info @@ -341,6 +286,7 @@ def differential_analysis( patches_new, patches_old, project_repo_name, + package_manager, ): """ Perform differential analysis on the given project and release versions. @@ -354,7 +300,7 @@ def differential_analysis( patches_new (dict): The patches info for the new release version. patches_old (dict): The patches info for the old release version. project_repo_name (str): The name of the project repository. - + package_manager (str): The package manager used in the project. Returns: tuple: A tuple containing the following: - compare_differences (dict): The comparison results for the dependencies. @@ -375,7 +321,10 @@ def differential_analysis( _, ) = compare_packages.differential(old_rv_dep_versions, new_rv_dep_versions, sa_1, sa_2) - changed_patches, _ = compare_packages.changed_patch(patches_old, patches_new) + if package_manager != "maven": + changed_patches, _ = compare_packages.changed_patch(patches_old, patches_new) + else: + changed_patches = {} authors = compare_commits.get_commit_results( headers, @@ -429,7 +378,7 @@ def setup_project_info(args): } -def setup_directories_and_logging(project_info): +def setup_directories_and_logging(project_info, debug): """Set up necessary directories and logging.""" dir_path = tool_config.PathManager() @@ -439,7 +388,7 @@ def setup_directories_and_logging(project_info): project_info["diff_folder"] = diff_folder log_file_path = result_folder_path / "analysis.log" - logging_setup(log_file_path) + tool_config.setup_logger(log_file_path, debug) def perform_static_analysis(project_info, is_old_version): @@ -503,6 +452,7 @@ def perform_differential_analysis(old_results, new_results, project_info): new_results[3], old_results[3], # patches_info project_info["repo_name"], + project_info["package_manager"], ) # Write differential analysis results to files @@ -577,9 +527,9 @@ def main(): } project_info = setup_project_info(dw_args) - setup_directories_and_logging(project_info) + setup_directories_and_logging(project_info, dw_args.debug) - print( + logging.info( f"Software supply chain smells analysis for {project_info['repo_name']} for version {project_info['old_version']}..." 
) @@ -599,4 +549,4 @@ def main(): if __name__ == "__main__": main() - print("Analysis completed.") + logging.info("Analysis completed.") diff --git a/tool/report_diff.py b/tool/report_diff.py index e44d547c..752937e0 100644 --- a/tool/report_diff.py +++ b/tool/report_diff.py @@ -1,5 +1,6 @@ import pandas as pd from datetime import datetime +import logging def process_data(data): @@ -14,7 +15,8 @@ def process_data(data): new_version = info.get("tag2", "") repo_link = info.get("repo_link", "") - commits = info.get("authors", []) + # The 'or' is to handle the case where the information returned is None + commits = info.get("authors", []) or [] if not commits: record.append( @@ -116,7 +118,7 @@ def filter_df(df): def generate_diff_report(data, project_repo_name, release_version_old, release_version_new, output_file): - print(f"Generating differential report for {project_repo_name}") + logging.info(f"Generating differential report for {project_repo_name}") record, record_list, author_list = process_data(data) df_all = create_dataframe(record) @@ -265,4 +267,4 @@ def generate_diff_report(data, project_repo_name, release_version_old, release_v # md_file.write(f"- Tool version: {tool_commit_hash}\n") f.write(f"- project Name: {project_repo_name}\n") f.write(f"- Compared project Versions: {release_version_old} & {release_version_new}\n") - print(f"Report generated at {output_file}") + print(f"Report from differential analysis generated at {output_file}") diff --git a/tool/report_static.py b/tool/report_static.py index 6dfb3376..8570652c 100644 --- a/tool/report_static.py +++ b/tool/report_static.py @@ -159,7 +159,9 @@ def write_summary(df, project_name, release_version, package_manager, filename, warning_counts["forked_package"] = f":cactus: Packages that are forks (⚠️⚠️) {(df['is_fork'] == True).sum()}" if enabled_checks.get("code_signature"): - warning_counts["code_signature"] = f":lock: Packages without code signature (⚠️⚠️) {(code_signature_df.shape[0])}" + warning_counts["code_signature"] = ( + f":lock: Packages without code signature (⚠️⚠️) {(code_signature_df.shape[0])}" + ) if enabled_checks.get("provenance"): warning_counts["provenance"] = ( @@ -238,7 +240,7 @@ def write_summary(df, project_name, release_version, package_manager, filename, combined_repo_problems_df.index = range(1, len(combined_repo_problems_df) + 1) markdown_text = combined_repo_problems_df.reset_index().to_markdown(index=False) md_file.write(markdown_text) - md_file.write("\n") + md_file.write("\n\n") elif package_manager not in SUPPORTED_SMELLS["github_404"]: md_file.write( f"\nThe package manager ({package_manager}) does not support checking for not found source code links.\n" @@ -257,7 +259,7 @@ def write_summary(df, project_name, release_version, package_manager, filename, md_file.write("\n\n\n") markdown_text = release_tag_not_found_df.reset_index().to_markdown(index=False) md_file.write(markdown_text) - md_file.write("\n") + md_file.write("\n\n") elif package_manager not in SUPPORTED_SMELLS["release_tag_not_found"]: md_file.write( f"\nThe package manager ({package_manager}) does not support checking for inaccessible tags.\n" @@ -269,14 +271,14 @@ def write_summary(df, project_name, release_version, package_manager, filename, if not version_deprecated_df.empty: md_file.write( f""" -
-    <summary>List of deprecated packages({(df['deprecated_in_version'] == True).sum()})</summary>
+<details>
+    <summary>List of deprecated packages({(df['deprecated_in_version'] == True).sum()})</summary>
 """
             )
             md_file.write("\n\n\n")
             markdown_text = version_deprecated_df.reset_index().to_markdown(index=False)
             md_file.write(markdown_text)
-            md_file.write("\n</details>")
+            md_file.write("\n</details>\n")
         elif package_manager not in SUPPORTED_SMELLS["deprecated"]:
             md_file.write(
                 f"\nThe package manager ({package_manager}) does not support checking for deprecated packages.\n"
             )
@@ -288,9 +290,8 @@ def write_summary(df, project_name, release_version, package_manager, filename,
         if not forked_package_df.empty:
             md_file.write(
                 f"""
-
-<details>
-    <summary>List of packages from fork({(df["is_fork"] == True).sum()})</summary>
+<details>
+    <summary>List of packages from fork({(df["is_fork"] == True).sum()})</summary>
 """
             )
             md_file.write("\n\n\n")
@@ -308,8 +309,8 @@ def write_summary(df, project_name, release_version, package_manager, filename,
         if not provenance_df.empty:
             md_file.write(
                 f"""
-<details>
-    <summary>List of packages without provenance({(df["provenance_in_version"] == False).sum()})</summary>
+<details>
+ List of packages without provenance({(df["provenance_in_version"] == False).sum()}) """ ) md_file.write("\n\n\n") @@ -400,4 +401,4 @@ def get_s_summary(data, project_name, release_version, package_manager, enabled_ enabled_checks=enabled_checks, mode="w", ) - print(f"Report created at {summary_filename}") + print(f"Report from static analysis created at {summary_filename}") diff --git a/tool/static_analysis.py b/tool/static_analysis.py index 0a6d3474..8c95099d 100644 --- a/tool/static_analysis.py +++ b/tool/static_analysis.py @@ -9,10 +9,10 @@ import subprocess import re -import tool_config +from tool_config import get_cache_manager, make_github_request from compare_commits import tag_format as construct_tag_format import logging - +import xmltodict github_token = os.getenv("GITHUB_API_TOKEN") @@ -21,7 +21,7 @@ "Accept": "application/vnd.github.v3+json", } -# tool_config.setup_cache("static") +cache_manager = get_cache_manager() MAX_WAIT_TIME = 15 * 60 @@ -114,7 +114,6 @@ def check_maven(package, package_version): def check_code_signature(package_name, package_version, pm): - # TODO: caching this somehow would be nice # TODO: find a package where we can check this, because with spoon everything is fine def check_maven_signature(package_name, package_version): # Construct the command @@ -216,22 +215,62 @@ def api_constructor(package_name, repository): return repo_api, simplified_path, package_full_name, name, version, error_message -def make_github_request(url, headers): - """Make a GET request to the GitHub API.""" - - response = requests.get(url, headers=headers) - - if response.status_code == 403 and int(response.headers.get("X-RateLimit-Remaining", 0)) <= 10: - reset_time = int(response.headers.get("X-RateLimit-Reset", 0)) - sleep_time = min(reset_time - int(time.time()), MAX_WAIT_TIME) - print(f"\nRate limit reached. 
Waiting for {sleep_time} seconds.") - time.sleep(sleep_time) - print("\nResuming analysis...") - response = requests.get(url, headers=headers) - return response +def check_parent_scm(package): + name, version = package.split("@") + group_id, artifact_id = name.split(":") + + existing_scm_data, repo_api, simplified_path, package_full_name = None, None, None, None + stopping = False + while not stopping: + # First, getting the parent's pom contents + command = [ + "mvn", + "org.apache.maven.plugins:maven-help-plugin:3.5.1:evaluate", + "-Dexpression=project.parent", + f"-Dartifact={group_id}:{artifact_id}:{version}", + "-q", + "-DforceStdout", + ] + output = subprocess.run(command, capture_output=True, text=True) + parent_pom = output.stdout.strip() + if not parent_pom or "null" in parent_pom: + # If there's no parent, we stop + stopping = True + else: + parents_contents = xmltodict.parse(parent_pom) + parent_group_id, parent_artifact_id = [ + parents_contents.get("project", {}).get("groupId", ""), + parents_contents.get("project", {}).get("artifactId", ""), + ] + if not parent_group_id or not parent_artifact_id or parent_group_id != group_id: + # If the parent is lacking data we stop; + # If the parent doesn't share the same group, we went too far, so we stop too + stopping = True + break + parent_scm_locations = [ + parents_contents.get("project", {}).get("scm", {}).get(location, "") + for location in ["url", "connection", "developerConnection"] + ] + [parents_contents.get("project", {}).get("url", "")] + for location in parent_scm_locations: + if location: + repo_api, simplified_path, package_full_name, _, _, _ = api_constructor(package, location) + data = make_github_request(repo_api, max_retries=2) + if data: + stopping = True + existing_scm_data = data + break + if not stopping: + group_id, artifact_id = parent_group_id, parent_artifact_id + + return { + "data": existing_scm_data, + "repo_api": repo_api, + "simplified_path": simplified_path, + "package_full_name": package_full_name, + } -def check_existence(package_name, repository): +def check_existence(package_name, repository, package_manager): """Check if the package exists in the repository.""" repo_api, simplified_path, package_full_name, _, version, error_message = api_constructor(package_name, repository) @@ -246,18 +285,30 @@ def check_existence(package_name, repository): github_redirected = False now_repo_url = None open_issues_count = None - - response = make_github_request(repo_api, headers=headers) - status_code = response.status_code - data = response.json() - - if status_code != 200: - print(f"[WARNING] No repo found for {package_name} in {repo_link}") + status_code = 404 + + data = make_github_request(repo_api, max_retries=2) + parent_scm_result = {} + if not data: + if package_manager == "maven": + # There's the possibility of, in maven's case, assembly inheritance not having worked well; + # As such, if the package manager is maven, we'll try to "work our way up", and perform the same check in the parent + parent_scm_result = check_parent_scm(package_name) + + if not data and not parent_scm_result["data"]: + # simplified_path = parent_scm_result.get("simplified_path", simplified_path) + # If we went up, and there's no still data, there really isn't a findable repository + logging.warning(f"No repo found for {package_name} in {repo_link}") archived = None is_fork = None repo_link = f"https://github.com/{simplified_path}".lower() + else: + data = data or parent_scm_result["data"] + simplified_path = 
parent_scm_result.get("simplified_path", simplified_path) + repo_api = parent_scm_result.get("repo_api", repo_api) + package_full_name = parent_scm_result.get("package_full_name", package_full_name) - if status_code == 200: + status_code = 200 github_exists = True open_issues_count = data["open_issues"] if data["archived"]: @@ -282,29 +333,25 @@ def check_existence(package_name, repository): have_no_tags_data = have_no_tags_response.json() if len(have_no_tags_data) == 0: - release_tag_exists = False release_tag_url = None tag_related_info = "No tag was found in the repo" status_code_release_tag = have_no_tags_response_status_code - else: - tag_possible_formats = construct_tag_format(version, package_full_name) - + tag_possible_formats = construct_tag_format(version, package_full_name, repo_name=simplified_path) # Making the default case not finding the tag tag_related_info = "The given tag was not found in the repo" - status_code_release_tag = 404 if tag_possible_formats: for tag_format in tag_possible_formats: tag_url = f"{repo_api}/git/ref/tags/{tag_format}" - response = make_github_request(tag_url, headers=headers) - if response.status_code == 200: + response = make_github_request(tag_url, silent=True) + if response: release_tag_exists = True release_tag_url = tag_url tag_related_info = f"Tag {tag_format} is found in the repo" - status_code_release_tag = response.status_code + status_code_release_tag = 200 break - if status_code_release_tag == 404: - print(f"[INFO] No tags found for {package_name} in {repo_api}") + if not release_tag_exists: + logging.info(f"No tags found for {package_name} in {repo_api}") github_info = { "github_api": repo_api, @@ -348,10 +395,10 @@ def get_api_content(api, headers): requests.Timeout, json.JSONDecodeError, ) as e: - print(f"Request error: {str(e)} for URL: {api}") + logging.error(f"Request error: {str(e)} for URL: {api}") return None except Exception as e: - print(f"Unexpected error: {str(e)} for URL: {api}") + logging.error(f"Unexpected error: {str(e)} for URL: {api}") return None @@ -422,8 +469,7 @@ def check_name_match_for_fork(package_name, repository): def check_name_match(package_name, repository): - tool_config.setup_cache("check_name") - # logging.info("Cache [check_name_cache] setup complete") + cache_manager._setup_requests_cache(cache_name="static_analysis") _, repo_name, _, _, _, _ = api_constructor(package_name, repository) original_package_name = package_name.rsplit("@", 1)[0] @@ -438,8 +484,8 @@ def check_name_match(package_name, repository): response = requests.get(url, headers=headers, timeout=20) - if not response.from_cache: - time.sleep(6) + # if not response.from_cache: + # time.sleep(6) status_code = response.status_code @@ -461,9 +507,6 @@ def check_name_match(package_name, repository): package_api_in_packages = item["url"] is_match = True - else: - is_match = False - if not is_match: unmatch_info = { "status_code": status_code, @@ -503,33 +546,66 @@ def analyze_package_data(package, repo_url, pm, check_match=False, enabled_check try: package_name, package_version = package.rsplit("@", 1) - # Only check deprecation and provenance if enabled - if enabled_checks["deprecated"] or enabled_checks["provenance"]: + # Try to get from cache first + cached_analysis = cache_manager.package_cache.get_package_analysis(package_name, package_version, pm) + + # Initialize missing_checks to track what needs to be analyzed + missing_checks = {} + + if cached_analysis: + logging.info(f"Found cached analysis for {package}") + package_info = 
cached_analysis + + # Check which enabled checks are missing from cache + for check, enabled in enabled_checks.items(): + if enabled: + if check == "deprecated" and "deprecated" not in cached_analysis: + missing_checks["deprecated"] = True + elif check == "provenance" and "provenance" not in cached_analysis: + missing_checks["provenance"] = True + elif check == "code_signature" and "code_signature" not in cached_analysis: + missing_checks["code_signature"] = True + elif check == "source_code" and "github_exists" not in cached_analysis: + missing_checks["source_code"] = True + elif check == "forks" and ( + "github_exists" not in cached_analysis + or "is_fork" not in cached_analysis.get("github_exists", {}) + ): + missing_checks["forks"] = True + + if not missing_checks: + logging.info(f"Using complete cached analysis for {package}") + return package_info + logging.info( + f"Found partial cached analysis for {package}, analyzing missing checks: {list(missing_checks.keys())}" + ) + else: + logging.info(f"No cached analysis for {package}, analyzing all enabled checks") + missing_checks = enabled_checks + + if missing_checks.get("deprecated") or missing_checks.get("provenance"): package_infos = check_deprecated_and_provenance(package_name, package_version, pm) - if enabled_checks["deprecated"]: + if missing_checks.get("deprecated"): package_info["deprecated"] = package_infos.get("deprecated_in_version") - if enabled_checks["provenance"]: + if missing_checks.get("provenance"): package_info["provenance"] = package_infos.get("provenance_in_version") package_info["package_info"] = package_infos - # Only check code signature if enabled - if enabled_checks["code_signature"]: + if missing_checks.get("code_signature"): package_info["code_signature"] = check_code_signature(package_name, package_version, pm) - # Only check source code and forks if enabled - if enabled_checks["source_code"] or enabled_checks["forks"]: + if missing_checks.get("source_code") or missing_checks.get("forks"): if "Could not find" in repo_url: package_info["github_exists"] = {"github_url": "No_repo_info_found"} elif "not github" in repo_url: package_info["github_exists"] = {"github_url": "Not_github_repo"} else: - github_info = check_existence(package, repo_url) + github_info = check_existence(package, repo_url, pm) package_info["github_exists"] = github_info - # Only check name matches if enabled and relevant - if check_match and package_info["github_exists"] and package_info["github_exists"].get("github_exists"): + if check_match and package_info.get("github_exists") and package_info["github_exists"].get("github_exists"): repo_url_to_use = github_info.get("redirected_repo") or repo_url - if package_info["provenance"] == False: + if package_info.get("provenance") == False: if ( package_info["github_exists"].get("is_fork") == True or package_info["github_exists"].get("archived") == True @@ -544,22 +620,24 @@ def analyze_package_data(package, repo_url, pm, check_match=False, enabled_check "repo_name": repo_url, } + # Cache the updated analysis + cache_manager.package_cache.cache_package_analysis(package_name, package_version, pm, package_info) + except Exception as e: - logging.error(f"Error analyzing package {package}: {str(e)}") + logging.error(f"Analyzing package {package}: {str(e)}") package_info["error"] = str(e) return package_info def get_static_data(folder, packages_data, pm, check_match=False, enabled_checks=DEFAULT_ENABLED_CHECKS): - print("Analyzing package static data...") + logging.info("Analyzing package static 
data...") package_all = {} errors = {} with tqdm(total=len(packages_data), desc="Analyzing packages") as pbar: for package, repo_urls in packages_data.items(): - # print(f"Analyzing {package}") - tqdm.write(f"[INFO] Currently analyzing {package}") + logging.info(f"Currently analyzing {package}") repo_url = repo_urls.get("github", "") analyzed_data = analyze_package_data( package, repo_url, pm, check_match=check_match, enabled_checks=enabled_checks @@ -572,14 +650,6 @@ def get_static_data(folder, packages_data, pm, check_match=False, enabled_checks else: package_all[package] = analyzed_data - # filepaths - - # file_path = os.path.join(folder, "all_info.json") - # error_path = os.path.join(folder, "errors.json") - - # save_results_to_file(file_path, package_all) - # save_results_to_file(error_path, errors) - return package_all, errors diff --git a/tool/tool_config.py b/tool/tool_config.py index 2c2a761f..59c4d007 100644 --- a/tool/tool_config.py +++ b/tool/tool_config.py @@ -2,11 +2,19 @@ This file contains the configuration for the tool. """ -import datetime import pathlib import logging - +import os import requests_cache +import requests +import sqlite3 +import json +from datetime import datetime, timedelta +from functools import lru_cache +from pathlib import Path +from typing import Dict, Optional +import time +from git import Repo # change this to the install command for your project PNPM_LIST_COMMAND = [ @@ -18,6 +26,13 @@ "Infinity", ] +github_token = os.getenv("GITHUB_API_TOKEN") + +headers = { + "Authorization": f"Bearer {github_token}", + "Accept": "application/vnd.github.v3+json", +} + class PathManager: """ @@ -32,7 +47,7 @@ def create_folders(self, version_tag): Create the folders for the results. """ - current_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") folder_name = f"results_{current_time}" result_folder_path = self.base_dir / folder_name result_folder_path.mkdir(parents=True, exist_ok=True) @@ -45,45 +60,699 @@ def create_folders(self, version_tag): return result_folder_path, json_directory, diff_directory -def setup_cache(cache_name): +class CacheManager: + def __init__(self, cache_dir="cache"): + self.cache_dir = Path(cache_dir) + self.cache_dir.mkdir(parents=True, exist_ok=True) + + # Initialize all cache instances + self.github_cache = GitHubCache(cache_dir) + self.package_cache = PackageAnalysisCache(cache_dir) + self.commit_comparison_cache = CommitComparisonCache(cache_dir) + self.user_commit_cache = UserCommitCache(cache_dir) + self.maven_cache = MavenDependencyCache(cache_dir) + + def _setup_requests_cache(self, cache_name="http_cache"): + requests_cache.install_cache( + cache_name=str(self.cache_dir / f"{cache_name}_cache"), + backend="sqlite", + expire_after=7776000, # 90 days + allowable_codes=(200, 301, 302, 404), + ) + + def clear_all_caches(self, older_than_days=None): + """Clear all caches""" + self.github_cache.clear_cache(older_than_days) + self.package_cache.clear_cache(older_than_days) + self.commit_comparison_cache.clear_cache(older_than_days) + self.user_commit_cache.clear_cache(older_than_days) + self.maven_cache.clear_cache(older_than_days) + + +class Cache: + def __init__(self, cache_dir="cache", db_name="cache.db"): + self.cache_dir = Path(cache_dir) + self.cache_dir.mkdir(parents=True, exist_ok=True) + self.db_path = self.cache_dir / db_name + self.setup_db() + + def setup_db(self): + """Initialize SQLite database - should be implemented by subclasses""" + raise 
NotImplementedError + + def _execute_query(self, query, params=None): + """Execute SQLite query with proper connection handling""" + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + try: + if params: + c.execute(query, params) + else: + c.execute(query) + conn.commit() + return c.fetchall() + finally: + conn.close() + + def clear_cache(self, older_than_days=None): + """Clear cached data older than specified days""" + if older_than_days: + cutoff = (datetime.now() - timedelta(days=older_than_days)).isoformat() + self._execute_query("DELETE FROM cache_entries WHERE cached_at < ?", (cutoff,)) + else: + self._execute_query("DELETE FROM cache_entries") + + +class GitHubCache(Cache): + def __init__(self, cache_dir="cache/github"): + super().__init__(cache_dir, "github_cache.db") + self.repo_cache = {} # In-memory LRU cache + + def setup_db(self): + """Initialize GitHub-specific cache tables""" + queries = [ + """CREATE TABLE IF NOT EXISTS github_urls ( + package TEXT PRIMARY KEY, + repo_url TEXT, + cached_at TIMESTAMP + )""", + """CREATE TABLE IF NOT EXISTS pr_info ( + package TEXT, + commit_sha TEXT, + commit_node_id TEXT PRIMARY KEY, + pr_info TEXT, + cached_at TIMESTAMP + )""", + """CREATE TABLE IF NOT EXISTS pr_reviews ( + package TEXT, + repo_name TEXT, + author TEXT, + first_review_data TEXT, + cached_at TIMESTAMP, + PRIMARY KEY (repo_name, author) + )""", + """CREATE TABLE IF NOT EXISTS tag_to_sha ( + repo_name TEXT, + tag TEXT, + sha TEXT, + cached_at TIMESTAMP, + PRIMARY KEY (repo_name, tag) + )""", + ] + + for query in queries: + self._execute_query(query) + + def cache_pr_review(self, package, repo_name, author, first_review_data): + """Cache PR review information""" + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + + try: + c.execute( + """ + INSERT OR REPLACE INTO pr_reviews + (package, repo_name, author, first_review_data, cached_at) + VALUES (?, ?, ?, ?, ?) + """, + (package, repo_name, author, json.dumps(first_review_data), datetime.now().isoformat()), + ) + conn.commit() + finally: + conn.close() + + def get_pr_review(self, repo_name=None, author=None): + """Get PR review information from cache""" + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + + try: + c.execute( + "SELECT first_review_data, cached_at FROM pr_reviews WHERE repo_name = ? AND author = ?", + (repo_name, author), + ) + result = c.fetchone() + if result: + review_data, cached_at = result + cached_at = datetime.fromisoformat(cached_at) + + # Return cached data if it's less than 7 days old + if datetime.now() - cached_at < timedelta(days=7): + return json.loads(review_data) + return None + finally: + conn.close() + + def cache_github_url(self, package, repo_url): + """Cache GitHub URL for a package""" + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + + try: + c.execute( + """ + INSERT OR REPLACE INTO github_urls + (package, repo_url, cached_at) + VALUES (?, ?, ?) 
+ """, + (package, repo_url, datetime.now().isoformat()), + ) + conn.commit() + finally: + conn.close() + + def get_github_url(self, package): + """Get cached GitHub URL for a package""" + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + + try: + c.execute("SELECT repo_url, cached_at FROM github_urls WHERE package = ?", (package,)) + result = c.fetchone() + + if result: + repo_url, cached_at = result + cached_at = datetime.fromisoformat(cached_at) + + # URLs don't change often, so we can cache them for longer (30 days) + if datetime.now() - cached_at < timedelta(days=30): + return repo_url + + return None + finally: + conn.close() + + def cache_pr_info(self, pr_data: Dict): + """Cache PR info with current timestamp""" + with sqlite3.connect(self.db_path) as conn: + conn.execute( + """ + INSERT OR REPLACE INTO pr_info + (package, commit_sha, commit_node_id, pr_info, cached_at) + VALUES (?, ?, ?, ?, ?) + """, + ( + pr_data["package"], + pr_data["commit_sha"], + pr_data["commit_node_id"], + json.dumps(pr_data["pr_info"]), + datetime.now().isoformat(), + ), + ) + conn.commit() + + def get_pr_info(self, commit_node_id: str) -> Optional[Dict]: + """Get PR info from cache if available and not expired""" + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + with sqlite3.connect(self.db_path) as conn: + c.execute( + "SELECT package, commit_sha, commit_node_id, pr_info, cached_at FROM pr_info WHERE commit_node_id = ?", + (commit_node_id,), + ) + result = c.fetchone() + + if result: + package, commit_sha, commit_node_id, pr_info, cached_at = result + cached_at = datetime.fromisoformat(cached_at) + if datetime.now() - cached_at < timedelta(hours=24): + return { + "package": package, + "commit_sha": commit_sha, + "commit_node_id": commit_node_id, + "pr_info": json.loads(pr_info), + } + return None + + def cache_tag_to_sha(self, repo_name, tag, sha): + """Cache tag to SHA mapping""" + with sqlite3.connect(self.db_path) as conn: + conn.execute( + """ + INSERT OR REPLACE INTO tag_to_sha + (repo_name, tag, sha, cached_at) + VALUES (?, ?, ?, ?) + """, + (repo_name, tag, sha, datetime.now().isoformat()), + ) + conn.commit() + + def get_tag_to_sha(self, repo_name, tag): + """Get SHA for a tag from cache""" + with sqlite3.connect(self.db_path) as conn: + c = conn.cursor() + c.execute("SELECT sha, cached_at FROM tag_to_sha WHERE repo_name = ? 
AND tag = ?", (repo_name, tag)) + result = c.fetchone() + + if result: + sha, cached_at = result + cached_at = datetime.fromisoformat(cached_at) + if datetime.now() - cached_at < timedelta(days=30): + return sha + return None + + def clear_cache(self, older_than_days=None): + """Clear cached data""" + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + + try: + if older_than_days: + cutoff = (datetime.now() - timedelta(days=older_than_days)).isoformat() + c.execute("DELETE FROM pr_reviews WHERE cached_at < ?", (cutoff,)) + c.execute("DELETE FROM repo_info WHERE cached_at < ?", (cutoff,)) + c.execute("DELETE FROM github_urls WHERE cached_at < ?", (cutoff,)) + c.execute("DELETE FROM pr_info WHERE cached_at < ?", (cutoff,)) + else: + c.execute("DELETE FROM pr_reviews") + c.execute("DELETE FROM repo_info") + c.execute("DELETE FROM github_urls") + c.execute("DELETE FROM pr_info") + conn.commit() + + finally: + conn.close() + + +class PackageAnalysisCache(Cache): + def __init__(self, cache_dir="cache/packages"): + super().__init__(cache_dir, "package_analysis.db") + + def setup_db(self): + """Initialize package analysis cache tables""" + self._execute_query( + """ + CREATE TABLE IF NOT EXISTS package_analysis ( + package_name TEXT, + version TEXT, + package_manager TEXT, + analysis_data TEXT, + cached_at TIMESTAMP, + PRIMARY KEY (package_name, version, package_manager) + ) + """ + ) + + def cache_package_analysis(self, package_name, version, package_manager, analysis_data): + """Cache package analysis results""" + self._execute_query( + """ + INSERT OR REPLACE INTO package_analysis + (package_name, version, package_manager, analysis_data, cached_at) + VALUES (?, ?, ?, ?, ?) + """, + (package_name, version, package_manager, json.dumps(analysis_data), datetime.now().isoformat()), + ) + + def get_package_analysis(self, package_name, version, package_manager, max_age_days=30): + """Get cached package analysis results""" + results = self._execute_query( + """SELECT analysis_data, cached_at + FROM package_analysis + WHERE package_name = ? AND version = ? 
AND package_manager = ?""", + (package_name, version, package_manager), + ) + + if results: + analysis_data, cached_at = results[0] + cached_at = datetime.fromisoformat(cached_at) + + if datetime.now() - cached_at < timedelta(days=max_age_days): + return json.loads(analysis_data) + + return None + + def clear_cache(self, older_than_days=None): + """Clear cached data""" + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + + try: + if older_than_days: + cutoff = (datetime.now() - timedelta(days=older_than_days)).isoformat() + c.execute("DELETE FROM package_analysis WHERE cached_at < ?", (cutoff,)) + else: + c.execute("DELETE FROM package_analysis") + + conn.commit() + + finally: + conn.close() + + +class CommitComparisonCache(Cache): + def __init__(self, cache_dir="cache/commits"): + super().__init__(cache_dir, "commit_comparison_cache.db") + + def setup_db(self): + """Initialize commit comparison cache tables""" + queries = [ + """ + CREATE TABLE IF NOT EXISTS commit_authors_from_tags ( + package TEXT, + tag1 TEXT, + tag2 TEXT, + data TEXT, + cached_at TIMESTAMP, + PRIMARY KEY (package, tag1, tag2) + ) + """, + """ + CREATE TABLE IF NOT EXISTS commit_authors_from_url ( + commit_url TEXT PRIMARY KEY, + data TEXT, + cached_at TIMESTAMP + ) + """, + """ + CREATE TABLE IF NOT EXISTS patch_authors_from_sha ( + repo_name TEXT, + patch_path TEXT, + sha TEXT, + data TEXT, + cached_at TIMESTAMP, + PRIMARY KEY (repo_name, patch_path, sha) + ) + """, + ] + + for query in queries: + self._execute_query(query) + + def cache_authors_from_tags(self, package, tag1, tag2, data): + self._execute_query( + """ + INSERT OR REPLACE INTO commit_authors_from_tags + (package, tag1, tag2, data, cached_at) + VALUES (?, ?, ?, ?, ?) + """, + (package, tag1, tag2, json.dumps(data), datetime.now().isoformat()), + ) + + def get_authors_from_tags(self, package, tag1, tag2, max_age_days=30): + results = self._execute_query( + "SELECT data, cached_at FROM commit_authors_from_tags WHERE package = ? AND tag1 = ? AND tag2 = ?", + (package, tag1, tag2), + ) + if results: + data, cached_at = results[0] + cached_at = datetime.fromisoformat(cached_at) + if datetime.now() - cached_at < timedelta(days=max_age_days): + return json.loads(data) + return None + + def cache_authors_from_url(self, commit_url, data): + self._execute_query( + """ + INSERT OR REPLACE INTO commit_authors_from_url + (commit_url, data, cached_at) + VALUES (?, ?, ?) + """, + (commit_url, json.dumps(data), datetime.now().isoformat()), + ) + + def get_authors_from_url(self, commit_url, max_age_days=30): + results = self._execute_query( + "SELECT data, cached_at FROM commit_authors_from_url WHERE commit_url = ?", (commit_url,) + ) + if results: + data, cached_at = results[0] + cached_at = datetime.fromisoformat(cached_at) + if datetime.now() - cached_at < timedelta(days=max_age_days): + return json.loads(data) + return None + + def cache_patch_authors(self, repo_name, patch_path, sha, data): + self._execute_query( + """ + INSERT OR REPLACE INTO patch_authors_from_sha + (repo_name, patch_path, sha, data, cached_at) + VALUES (?, ?, ?, ?, ?) + """, + (repo_name, patch_path, sha, json.dumps(data), datetime.now().isoformat()), + ) + + def get_patch_authors(self, repo_name, patch_path, sha, max_age_days=30): + results = self._execute_query( + "SELECT data, cached_at FROM patch_authors_from_sha WHERE repo_name = ? AND patch_path = ? 
AND sha = ?", + (repo_name, patch_path, sha), + ) + if results: + data, cached_at = results[0] + cached_at = datetime.fromisoformat(cached_at) + if datetime.now() - cached_at < timedelta(days=max_age_days): + return json.loads(data) + return None + + def clear_cache(self, older_than_days=None): + """Clear cached data""" + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + + try: + if older_than_days: + cutoff = (datetime.now() - timedelta(days=older_than_days)).isoformat() + c.execute("DELETE FROM commit_authors_from_tags WHERE cached_at < ?", (cutoff,)) + c.execute("DELETE FROM commit_authors_from_url WHERE cached_at < ?", (cutoff,)) + c.execute("DELETE FROM patch_authors_from_sha WHERE cached_at < ?", (cutoff,)) + else: + c.execute("DELETE FROM commit_authors_from_tags") + c.execute("DELETE FROM commit_authors_from_url") + c.execute("DELETE FROM patch_authors_from_sha") + + conn.commit() + + finally: + conn.close() + + +class UserCommitCache(Cache): + def __init__(self, cache_dir="cache/user_commits"): + super().__init__(cache_dir, "user_commits.db") + + def setup_db(self): + self._execute_query( + """ + CREATE TABLE IF NOT EXISTS user_commit ( + api_url TEXT PRIMARY KEY, + earliest_commit_sha TEXT, + repo_name TEXT, + package TEXT, + author_login TEXT, + author_commit_sha TEXT, + author_login_in_1st_commit TEXT, + author_id_in_1st_commit TEXT, + cached_at TIMESTAMP + ) + """ + ) + + def cache_user_commit( + self, + api_url, + earliest_commit_sha, + repo_name, + package, + author_login, + author_commit_sha, + author_login_in_1st_commit, + author_id_in_1st_commit, + ): + self._execute_query( + """ + INSERT OR REPLACE INTO user_commit + (api_url, earliest_commit_sha, repo_name, package, author_login, author_commit_sha, author_login_in_1st_commit, author_id_in_1st_commit, cached_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """, + ( + api_url, + earliest_commit_sha, + repo_name, + package, + author_login, + author_commit_sha, + author_login_in_1st_commit, + author_id_in_1st_commit, + datetime.now().isoformat(), + ), + ) + + def get_user_commit(self, api_url, max_age_days=30): + results = self._execute_query( + "SELECT earliest_commit_sha, author_login_in_1st_commit, author_id_in_1st_commit, cached_at FROM user_commit WHERE api_url = ?", + (api_url,), + ) + if results: + earliest_commit_sha, author_login_in_1st_commit, author_id_in_1st_commit, cached_at = results[0] + cached_at = datetime.fromisoformat(cached_at) + if datetime.now() - cached_at < timedelta(days=max_age_days): + return earliest_commit_sha, author_login_in_1st_commit, author_id_in_1st_commit + return None + + def clear_cache(self, older_than_days=None): + """Clear cached data""" + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + + try: + if older_than_days: + cutoff = (datetime.now() - timedelta(days=older_than_days)).isoformat() + c.execute("DELETE FROM user_commit WHERE cached_at < ?", (cutoff,)) + else: + c.execute("DELETE FROM user_commit") + + conn.commit() + + finally: + conn.close() + + +class MavenDependencyCache(Cache): + def __init__(self, cache_dir="cache/maven_deps"): + super().__init__(cache_dir, "maven_deps.db") + + def setup_db(self): + self._execute_query( + """ + CREATE TABLE IF NOT EXISTS maven_dependencies ( + repo_path TEXT, + pom_hash TEXT, + dependencies TEXT, + cached_at TIMESTAMP, + PRIMARY KEY (repo_path, pom_hash) + ) + """ + ) + + def cache_dependencies(self, repo_path, pom_hash, dependencies): + self._execute_query( + """ + INSERT OR REPLACE INTO maven_dependencies + (repo_path, pom_hash, dependencies, cached_at) + VALUES (?, ?, ?, ?) + """, + (repo_path, pom_hash, json.dumps(dependencies), datetime.now().isoformat()), + ) + + def get_dependencies(self, repo_path, pom_hash, max_age_days=30): + results = self._execute_query( + "SELECT dependencies, cached_at FROM maven_dependencies WHERE repo_path = ? AND pom_hash = ?", + (repo_path, pom_hash), + ) + if results: + deps_json, cached_at = results[0] + cached_at = datetime.fromisoformat(cached_at) + if datetime.now() - cached_at < timedelta(days=max_age_days): + return json.loads(deps_json) + return None + + def clear_cache(self, older_than_days=None): + """Clear cached data""" + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + + try: + if older_than_days: + cutoff = (datetime.now() - timedelta(days=older_than_days)).isoformat() + c.execute("DELETE FROM maven_dependencies WHERE cached_at < ?", (cutoff,)) + else: + c.execute("DELETE FROM maven_dependencies") + + conn.commit() + + finally: + conn.close() + + +cache_manager = CacheManager() + + +def get_cache_manager(): + return cache_manager + + +CLONE_OPTIONS = { + "blobless": "--filter=blob:none", +} + + +def clone_repo(project_repo_name, release_version=None, blobless=False): """ - Setup the cache for the requests. + Clone the repository for the given project and release version. + + Args: + project_repo_name (str): The name of the project repository. + release_version (str): The release version of the project. + blobless (bool): Whether to clone the repository without blobs. + + Returns: + str: The path to the cloned repository. 
""" - cache_folder = pathlib.Path("cache") - cache_folder.mkdir(parents=True, exist_ok=True) + repo_url = f"https://github.com/{project_repo_name}.git" - cache_file = cache_folder / f"{cache_name}_cache" + # Clone to /tmp folder; if it is already cloned, an error will be raised + try: + options = [CLONE_OPTIONS["blobless"]] if blobless else [] + Repo.clone_from(repo_url, f"/tmp/{project_repo_name}", multi_options=options) + except Exception as e: + # If the repo is already cloned, just fetch the latest changes + logging.info(f"Repo already cloned. Fetching the latest changes...") + repo = Repo(f"/tmp/{project_repo_name}") - requests_cache.install_cache( - cache_name=str(cache_file), - backend="sqlite", - expire_after=7776000, - allowable_codes=(200, 301, 302, 404), - ) # 90 days + # Fetch the latest changes + repo.remotes.origin.fetch() + # Checkout to the release version if provided + if release_version: + repo = Repo(f"/tmp/{project_repo_name}") + repo.git.checkout(release_version) - # logging.info(f"Cache setup complete: {cache_file}") + return f"/tmp/{project_repo_name}" -def setup_logger(log_file_path): +def setup_logger(log_file_path, debug=False): """ Setup the logger for the analysis. """ + class CustomFormatter(logging.Formatter): + """Custom formatter, includes color coding for log levels.""" + + grey = "\x1b[38;20m" + green = "\x1b[38;2;0;200;0m" + yellow = "\x1b[38;2;255;255;0m" + red = "\x1b[38;2;255;0;0m" + bold_red = "\x1b[1;31m" + reset = "\x1b[0m" + fmt = "%(asctime)s:%(name)s:%(levelname)s:%(message)s" + + FORMATS = { + logging.DEBUG: grey + fmt + reset, + logging.INFO: green + fmt + reset, + logging.WARNING: yellow + fmt + reset, + logging.ERROR: red + fmt + reset, + logging.CRITICAL: bold_red + fmt + reset, + } + + def format(self, record): + log_fmt = self.FORMATS.get(record.levelno) + formatter = logging.Formatter(log_fmt) + return formatter.format(record) + # Set up the logger - logger = logging.getLogger("dw_analysis") + logger = logging.getLogger() logger.setLevel(logging.INFO) # Create a console handler console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) + console_handler.setLevel(logging.WARNING if not debug else logging.INFO) # Create a file handler file_handler = logging.FileHandler(log_file_path) file_handler.setLevel(logging.INFO) # Create a formatter and set it for both handlers - formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + formatter = CustomFormatter() console_handler.setFormatter(formatter) file_handler.setFormatter(formatter) @@ -92,3 +761,99 @@ def setup_logger(log_file_path): logger.addHandler(file_handler) return logger + + +def make_github_request( + url: str, + method: str = "GET", + headers: Dict = headers, + json_data: Optional[Dict] = None, + max_retries: int = 1, + retry_delay: int = 2, + timeout: int = 20, + sleep_between_requests: int = 0, + silent: bool = False, +) -> Optional[Dict]: + """ + Make a HTTP request with retry logic and rate limiting handling. 
+ + Args: + url (str): HTTP URL + method (str): HTTP method ("GET" or "POST") + headers (Dict): Request headers + json_data (Optional[Dict]): JSON payload for POST requests + max_retries (int): Maximum number of retry attempts + retry_delay (int): Base time to wait between retries in seconds + timeout (int): Request timeout in seconds + silent (bool): Whether to suppress error logging + + Returns: + Optional[Dict]: JSON response or None if request failed + """ + for attempt in range(max_retries): + try: + response = requests.request(method=method, url=url, headers=headers, json=json_data, timeout=timeout) + response.raise_for_status() + return response.json() + + except requests.exceptions.RequestException as e: + time.sleep(sleep_between_requests) + if isinstance(e, requests.exceptions.HTTPError) and ( + e.response.status_code in [429, 403] or "rate limit" in e.response.text.lower() + ): + if attempt == max_retries - 1: + if not silent: + logging.error(f"Failed after {max_retries} attempts due to rate limiting: {e}") + return None + + # Get rate limit reset time and wait + reset_time = int(e.response.headers.get("X-RateLimit-Reset", 0)) + wait_time = max(reset_time - int(time.time()), 0) + if not silent: + logging.warning(f"Rate limit exceeded. Waiting {wait_time} seconds...") + time.sleep(wait_time) + else: + # Handle other errors + if not silent: + logging.warning(f"Request failed: {e}") + if attempt == max_retries - 1: + return None + time.sleep(retry_delay * (attempt + 1)) + + return None + + +def get_last_page_info( + url: str, max_retries: int = 1, retry_delay: int = 2, sleep_between_requests: int = 0 +) -> Optional[int]: + """ + Get the last page number from the response headers. + + Args: + url (str): URL to get the last page number + max_retries (int): Maximum number + retry_delay (int): Base time to wait between retries in seconds + sleep_between_requests (int): Time to sleep between requests in seconds + + Returns: + Optional[int]: Last page number or None if request failed + """ + + # We can't just use make_github_request here because we need to access the response headers + for attempt in range(max_retries): + try: + response = requests.get(url, headers=headers) + response.raise_for_status() + if "last" in response.links: + last_page = int(response.links["last"]["url"].split("=")[-1]) + else: + # Otherwise, the last page is the first page too + last_page = 1 + return last_page + + except requests.exceptions.RequestException as e: + time.sleep(sleep_between_requests) + if attempt == max_retries - 1: + logging.error(f"Failed after {max_retries} attempts: {e}") + return None + time.sleep(retry_delay * (attempt + 1))
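
As a minimal usage sketch of the `make_github_request` helper added to `tool/tool_config.py`: the repository URL below is hypothetical, and the call only illustrates the retry/rate-limit handling and the `None` return that callers are expected to handle.

```
from tool_config import make_github_request

# Hypothetical repository; any GitHub REST endpoint is handled the same way.
repo_api = "https://api.github.com/repos/octocat/hello-world"

# Retry once on rate limiting or transient errors; a None result means the
# request ultimately failed and the caller must treat the data as missing.
data = make_github_request(repo_api, max_retries=2, silent=True)
if data:
    print(data.get("full_name"), data.get("archived"))
else:
    print("No repository metadata available")
```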