feat: implement cache module for static analysis
randomicecube committed Jan 20, 2025
1 parent 42a6558 commit 5acaad7
Showing 9 changed files with 813 additions and 196 deletions.
6 changes: 3 additions & 3 deletions flake.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion flake.nix
@@ -22,6 +22,6 @@

in
{
-      devShells.x86_64-linux.default = pkgs.mkShell { packages = [ pythonEnv pkgs.maven ]; };
+      devShells.x86_64-linux.default = pkgs.mkShell { packages = [ pythonEnv pkgs.maven pkgs.yarn pkgs.pnpm pkgs.act ]; };
};
}
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -6,7 +6,7 @@ readme = "README.md"
requires-python = ">=3.12"
keywords = [ "software supply chain", "ssc", "dependencies", "npm",]
classifiers = [ "Intended Audience :: Developers", "Topic :: Software Development :: Build Tools", "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent",]
-dependencies = [ "attrs == 24.2.0", "cattrs == 24.1.2", "certifi == 2024.8.30", "charset-normalizer == 3.4.0", "exceptiongroup == 1.2.2", "GitPython == 3.1.43", "idna == 3.10", "numpy == 2.1.2", "pandas == 2.2.3", "platformdirs == 4.3.6", "python-dateutil == 2.9.0.post0", "pytz == 2024.2", "requests == 2.32.3", "requests-cache == 1.2.1", "six == 1.16.0", "tabulate == 0.9.0", "tqdm == 4.66.5", "typing_extensions == 4.12.2", "tzdata == 2024.2", "url-normalize == 1.4.3", "urllib3 == 2.2.3",]
+dependencies = [ "attrs == 24.2.0", "cattrs == 24.1.2", "certifi == 2024.8.30", "charset-normalizer == 3.4.0", "exceptiongroup == 1.2.2", "GitPython == 3.1.43", "idna == 3.10", "numpy == 2.1.2", "pandas == 2.2.3", "platformdirs == 4.3.6", "python-dateutil == 2.9.0.post0", "pytz == 2024.2", "PyYAML == 6.0.2", "requests == 2.32.3", "requests-cache == 1.2.1", "six == 1.16.0", "tabulate == 0.9.0", "tqdm == 4.66.5", "typing_extensions == 4.12.2", "tzdata == 2024.2", "url-normalize == 1.4.3", "urllib3 == 2.2.3",]
[[project.authors]]
name = "Raphina Liu"
email = "[email protected]"
1 change: 1 addition & 0 deletions requirements.txt
@@ -10,6 +10,7 @@ pandas==2.2.3
platformdirs==4.3.6
python-dateutil==2.9.0.post0
pytz==2024.2
+PyYAML==6.0.2
requests==2.32.3
requests-cache==1.2.1
six==1.16.0
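PyYAML enters the pinned dependency set here; together with the `import yaml` added to `tool/extract_deps.py` below, it points at parsing `pnpm-lock.yaml` files directly. A minimal, hypothetical usage sketch follows; the key names come from the pnpm lockfile format generally, not from this commit:

```python
# Hypothetical sketch of what the new PyYAML dependency enables: loading a
# pnpm-lock.yaml file. The key names are assumptions from the pnpm lockfile
# layout, not taken from this commit.
import yaml

with open("pnpm-lock.yaml") as f:
    lockfile = yaml.safe_load(f)

# A pnpm lockfile typically carries top-level keys such as
# "lockfileVersion", "importers", and "packages".
print(lockfile.get("lockfileVersion"))
```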
118 changes: 73 additions & 45 deletions tool/extract_deps.py
@@ -10,10 +10,15 @@
import sys
import shutil
from collections import defaultdict
+import json

Check failure (GitHub Actions / ruff): tool/extract_deps.py:13:8: F811 Redefinition of unused `json` from line 8
+import hashlib
+from pathlib import Path
+import yaml

-from tool_config import PNPM_LIST_COMMAND
+from tool_config import PNPM_LIST_COMMAND, get_cache_manager

logger = logging.getLogger(__name__)

Check failure (GitHub Actions / ruff): tool/extract_deps.py:5:1: I001 Import block is un-sorted or un-formatted
+cache_manager = get_cache_manager()

MVN_DEPENDENCY_PLUGIN = "org.apache.maven.plugins:maven-dependency-plugin:3.8.1"
append_dependency_goal = lambda goal: f"{MVN_DEPENDENCY_PLUGIN}:{goal}"

Check failure (GitHub Actions / ruff): tool/extract_deps.py:24:1: E731 Do not assign a `lambda` expression, use a `def`
@@ -188,11 +193,6 @@ def extract_deps_from_v1_yarn(yarn_lock_file):
return {"resolutions": [], "patches": []}


-def extract_deps_from_pnpm_lock_yaml(pnpm_lock_yaml_file):
-    """
-    Extract dependencies from a pnpm-lock.yaml file.
-    """


def get_pnpm_dep_tree(folder_path, version_tag, project_repo_name):
"""
@@ -365,6 +365,16 @@ def extract_deps_from_pnpm_mono(folder_path, version_tag, project_repo_name):
return deps_list_data


+def get_pom_hash(repo_path):
+    """Generate a hash of the pom.xml file to detect changes"""
+    pom_path = Path(repo_path) / "pom.xml"
+    if not pom_path.exists():
+        return None
+
Check failure (GitHub Actions / ruff): tool/extract_deps.py:373:1: W293 Blank line contains whitespace
+    with open(pom_path, "rb") as f:
+        return hashlib.sha256(f.read()).hexdigest()


def extract_deps_from_maven(repo_path):
"""
Extract dependencies from a Maven package, given the path to its locally cloned repo.
@@ -376,48 +386,60 @@ def extract_deps_from_maven(repo_path):
dict: A dictionary containing the extracted dependencies.
"""

-    def parse_mvn_dependency_logs(file_path):
+    def parse_mvn_dependency_logs(log_file):
        """
-        Parse the logs generated by the Maven dependency plugin.
+        Parse Maven dependency resolution logs to extract dependency information.
        Args:
-            file_path (str): The path to the log file.
+            log_file (str): Path to the Maven dependency resolution log file
        Returns:
-            List[Dict]: A list of dictionaries containing the dependencies. Each dictionary
-            contains the groupId, artifactId, and version of a dependency.
+            list: List of dictionaries containing dependency information
        """

-        result = []
-        with open(file_path, "r") as file:
-            for line in file:
-                line = line.strip()
-                # If the line starts with <something>:<something>:<something>:<something>,
-                # we want to keep the first, second, and fourth parts
-                # Otherwise, ignore the line
-                if re.match(r"^\S+:\S+:\S+:\S+", line):
-                    parts = line.split(":")
-                    result.append(
-                        {
-                            "groupId": parts[0],
-                            "artifactId": parts[1],
-                            "version": parts[3].split()[0],
-                        }
-                    )
-        return result
+        dependencies = []
+
+        try:
+            with open(log_file, 'r') as f:
+                for line in f:
+                    parts = line.strip().split(':')
+                    if len(parts) >= 3:  # Minimum required parts, [2] would be type
+                        dep_info = {
+                            'groupId': parts[0],
+                            'artifactId': parts[1],
+                            'version': parts[3].split()[0]
+                        }
+                        dependencies.append(dep_info)
+
+        except FileNotFoundError:
+            logging.error("Dependency log file not found: %s", log_file)
+        except Exception as e:
+            logging.error("Error parsing dependency log: %s", str(e))
+
+        return dependencies

+    # Generate a cache key based on the repo path and pom.xml hash
+    pom_hash = get_pom_hash(repo_path)
+    if not pom_hash:
+        logging.error("No pom.xml found in %s", repo_path)
+        return {"resolutions": [], "patches": []}
+
-    # First, switch to the repository directory
+    cached_deps = cache_manager.maven_cache.get_dependencies(repo_path, pom_hash)
+    if cached_deps:
+        print(f"[INFO] Using cached Maven dependencies for {repo_path}")
+        return cached_deps
+
+    # If we reach here, we need to resolve dependencies
    current_dir = os.getcwd()
    os.chdir(repo_path)

    retrieval_commands = {
-        "regular": [  # "Regular" dependencies
+        "regular": [
            "mvn",
            RESOLVE_GOAL,
            "-Dsort=true",
            f"-DoutputFile={RESOLVE_LOG}",
        ],
-        "plugins": [  # Plugin dependencies
+        "plugins": [
            "mvn",
            RESOLVE_PLUGINS_GOAL,
            "-Dsort=true",
@@ -426,31 +448,37 @@ def parse_mvn_dependency_logs(file_path):
    }

    try:
-        # First, running both commands to get the dependencies
+        # Run Maven commands to resolve dependencies
        subprocess.run(retrieval_commands["regular"], check=True)
        subprocess.run(retrieval_commands["plugins"], check=True)
-        # Then, parsing them from the log files
+
+        # Parse the dependency logs
        retrieved_deps = parse_mvn_dependency_logs(RESOLVE_LOG)
        retrieved_plugins = parse_mvn_dependency_logs(RESOLVE_PLUGINS_LOG)
-        # Go back to the original directory
+
+        # Go back to original directory
        os.chdir(current_dir)

        # Format the dependencies
        parsed_deps = [f"{dep['groupId']}:{dep['artifactId']}@{dep['version']}" for dep in retrieved_deps]
        parsed_plugins = [
            f"{plugin['groupId']}:{plugin['artifactId']}@{plugin['version']}" for plugin in retrieved_plugins
        ]

-        # Using a set to avoid duplicates
-        resolutions = set(parsed_deps + parsed_plugins)
-        deps_list_data = {"resolutions": resolutions, "patches": []}
-        # TODO: confirm resolutions?
+        # Create the result
+        deps_list_data = {
+            "resolutions": list(set(parsed_deps + parsed_plugins)),
+            "patches": []
+        }
+
+        # Cache the results
+        cache_manager.maven_cache.cache_dependencies(repo_path, pom_hash, deps_list_data)

        return deps_list_data

    except subprocess.CalledProcessError as e:
-        print(f"An error occurred: {e}")
-        logging.error(
-            "An error occurred while extracting dependencies from pom.xml file: %s",
-            str(e),
-        )
+        os.chdir(current_dir)
+        logging.error("Error resolving Maven dependencies: %s", str(e))
        return {"resolutions": [], "patches": []}


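The cache module itself is provided by `tool_config.get_cache_manager()`, which sits in one of the changed files not rendered above. As a minimal sketch of the Maven side of that interface, inferred only from the call sites in this diff (`get_dependencies(repo_path, pom_hash)` and `cache_dependencies(repo_path, pom_hash, deps)`) and not from the actual module:

```python
# Hypothetical sketch of the cache interface this diff programs against.
# Class names, the file-backed storage, and the .cache directory are all
# assumptions; only the method signatures are taken from the call sites.
import json
from pathlib import Path


class MavenCache:
    """File-backed cache keyed by (repo path, pom.xml hash)."""

    def __init__(self, cache_dir: Path):
        self.cache_dir = cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _key_path(self, repo_path: str, pom_hash: str) -> Path:
        # The pom hash makes entries self-invalidating: editing pom.xml
        # changes the hash, so a stale entry is simply never hit again.
        safe_repo = repo_path.strip("/").replace("/", "_")
        return self.cache_dir / f"{safe_repo}-{pom_hash}.json"

    def get_dependencies(self, repo_path: str, pom_hash: str):
        path = self._key_path(repo_path, pom_hash)
        if path.exists():
            return json.loads(path.read_text())
        return None

    def cache_dependencies(self, repo_path: str, pom_hash: str, deps) -> None:
        self._key_path(repo_path, pom_hash).write_text(json.dumps(deps))


class CacheManager:
    def __init__(self, base_dir: Path = Path(".cache")):
        self.maven_cache = MavenCache(base_dir / "maven")


_manager = None


def get_cache_manager() -> CacheManager:
    """Module-level singleton, mirroring how the diff calls get_cache_manager()."""
    global _manager
    if _manager is None:
        _manager = CacheManager()
    return _manager
```

Note that the diff also switches `resolutions` from a set to a list, which keeps the cached value JSON-serializable.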
124 changes: 39 additions & 85 deletions tool/github_repo.py
@@ -6,25 +6,13 @@
import logging
from pathlib import Path
from tqdm import tqdm

-# from datetime import datetime
-
+from tool_config import get_cache_manager
+from typing import List

TIMEOUT = 60

-script_dir = Path(__file__).parent.absolute()
-database_file = script_dir / "database" / "github_repo_info_all.db"
-
-conn = sqlite3.connect(database_file)
-c = conn.cursor()
-
-c.execute(
-    """CREATE TABLE IF NOT EXISTS pkg_github_repo_output (
-                package TEXT PRIMARY KEY,
-                github TEXT)"""
-)
-
-conn.commit()
+cache_manager = get_cache_manager()
+GITHUB_URL_PATTERN = re.compile(r"(github.*)", re.IGNORECASE)


def write_output(folder_path, filename, data):
@@ -42,12 +30,33 @@ def write_output(folder_path, filename, data):
json.dump(data, f, indent=2)


-def extract_repo_url(repo_info):
-    pattern = r"(github.*)"
-    match = re.search(pattern, repo_info, re.IGNORECASE)
+def extract_repo_url(repo_info: str) -> str:
+    """Extract GitHub repository URL from repository information."""
+    match = GITHUB_URL_PATTERN.search(repo_info)
    return match.group(1) if match else "not github"


+def get_package_command(pm: str, package: str) -> List[str]:
+    """Get the appropriate command for the package manager."""
+    if pm == "yarn-berry" or pm == "yarn-classic":
+        return ["yarn", "info", package, "repository.url"]
+    elif pm == "pnpm":
+        return ["pnpm", "info", package, "repository.url"]
+    elif pm == "npm":
+        return ["npm", "info", package, "repository.url"]
+    elif pm == "maven":
+        name, version = package.split("@")
+        group_id, artifact_id = name.split(":")
+        return [
+            "mvn",
+            "help:evaluate",
+            "-Dexpression=project.scm.url",
+            f"-Dartifact={group_id}:{artifact_id}:{version}",
+            "-q",
+            "-DforceStdout",
+        ]
+    raise ValueError(f"Unsupported package manager: {pm}")

def process_package(
package,
pm,
@@ -57,74 +66,20 @@ def process_package(
some_errors,
repos_output_json,
):
-    c.execute("SELECT github FROM pkg_github_repo_output WHERE package = ?", (package,))
-    db_result = c.fetchone()
+    repo_info = cache_manager.repo_cache.get_repo_info(package)

-    if db_result:
-        repo_info = db_result[0]
-
-    else:
+    if not repo_info:
        try:
-            if pm == "yarn-berry" or pm == "yarn-classic":
-                command = ["yarn", "info", package, "repository.url"]
-                result = subprocess.run(
-                    command,
-                    capture_output=True,
-                    text=True,
-                    check=True,
-                    timeout=TIMEOUT,
-                )
-
-            elif pm == "pnpm":
-                command = ["pnpm", "info", package, "repository.url"]
-                result = subprocess.run(
-                    command,
-                    capture_output=True,
-                    text=True,
-                    check=True,
-                    timeout=TIMEOUT,
-                )
-
-            elif pm == "npm":
-                command = ["npm", "info", package, "repository.url"]
-                result = subprocess.run(
-                    command,
-                    capture_output=True,
-                    text=True,
-                    check=True,
-                    timeout=TIMEOUT,
-                )
-
-            elif pm == "maven":
-                # package is in the form of group_id:artifact_id@version -- we need all 3
-                name, version = package.split("@")
-                group_id, artifact_id = name.split(":")
-                command = [
-                    "mvn",
-                    "help:evaluate",
-                    "-Dexpression=project.scm.url",
-                    f"-Dartifact={group_id}:{artifact_id}:{version}",
-                    "-q",
-                    "-DforceStdout",
-                ]
-                result = subprocess.run(
-                    command,
-                    capture_output=True,
-                    text=True,
-                    check=True,
-                    timeout=TIMEOUT,
-                )
-
-            else:
-                raise ValueError(f"Unsupported package manager: {pm}")
-
-            repo_info = result.stdout if result.stdout else result.stderr
-            # print(f"Repo info for {package}: {repo_info}")
-            c.execute(
-                "INSERT OR IGNORE INTO pkg_github_repo_output (package, github) VALUES (?,?)",
-                (package, repo_info),
-            )
-            conn.commit()
+            command = get_package_command(pm, package)
+            result = subprocess.run(
+                command,
+                capture_output=True,
+                text=True,
+                check=True,
+                timeout=TIMEOUT,
+            )
+            repo_info = result.stdout if result.stdout else result.stderr
+            cache_manager.repo_cache.cache_repo_info(package, repo_info)

except subprocess.TimeoutExpired:
logging.error(
@@ -136,7 +91,6 @@ def process_package(
logging.error(f"Command {command} failed for package {package}: {e}")
repo_info = None

-    # TODO: npm?
package = package.replace("@npm:", "@")

if (
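The repository-URL lookup follows the same pattern: the module-level sqlite table deleted from github_repo.py is replaced by `cache_manager.repo_cache`. A sketch of what that object plausibly looks like, reusing the schema from the removed code; the class name and constructor are assumptions, only `get_repo_info`/`cache_repo_info` come from the diff:

```python
# Hypothetical sketch of the repo_cache side of the cache manager, inferred
# from the get_repo_info/cache_repo_info call sites. The sqlite schema is
# taken verbatim from the code this commit removes.
import sqlite3
from pathlib import Path


class RepoCache:
    def __init__(self, db_file: Path):
        db_file.parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(db_file)
        self.conn.execute(
            """CREATE TABLE IF NOT EXISTS pkg_github_repo_output (
                    package TEXT PRIMARY KEY,
                    github TEXT)"""
        )
        self.conn.commit()

    def get_repo_info(self, package: str):
        # Returns None on a miss, which is what process_package checks for.
        row = self.conn.execute(
            "SELECT github FROM pkg_github_repo_output WHERE package = ?",
            (package,),
        ).fetchone()
        return row[0] if row else None

    def cache_repo_info(self, package: str, repo_info: str) -> None:
        self.conn.execute(
            "INSERT OR IGNORE INTO pkg_github_repo_output (package, github) VALUES (?, ?)",
            (package, repo_info),
        )
        self.conn.commit()
```

Centralizing the connection behind the cache manager removes the import-time side effects (opening the database, creating the table) that the old module-level code ran on every import of github_repo.py.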
