From 6749d614c171501a17c87ebd3355835a1d6e458c Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 25 Oct 2024 05:06:42 -0500 Subject: [PATCH] lint (#18) --- README.md | 39 +++------ alembic/env.py | 3 +- alembic/run_migrations.sh | 23 +++-- .../20240918_1200-initial_migration.py | 1 + .../20240923_0821-add_load_history.py | 1 + .../versions/20240925_0808-add_users_urls.py | 3 +- ...2034-link_tables_add_unique_constraints.py | 1 - ...241002_1456-new_data_models_and_indexes.py | 3 +- .../20241003_0040-import_id_for_versions.py | 2 +- .../versions/20241003_1554-redo_data_model.py | 3 +- ...9_0915-modify_users_username_uniqueness.py | 1 - ...1010_1347-add_license_name_and_id_index.py | 1 - core/README.md | 83 +++++++++++++++++++ core/db.py | 1 - core/fetcher.py | 1 + core/logger.py | 2 +- core/models/__init__.py | 5 -- core/scheduler.py | 6 +- docker-compose.yml | 2 +- monitor/main.py | 7 +- package_managers/crates/main.py | 7 +- 21 files changed, 136 insertions(+), 59 deletions(-) create mode 100644 core/README.md diff --git a/README.md b/README.md index 8e489c8..fc2aa82 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,14 @@ Use [Docker](https://docker.com) +> [!NOTE] +> Currently, we support only two package managers: +> +> - crates +> - Homebrew +> +> We are planning on supporting `NPM`, `PyPI`, and `rubygems` + ### Arguments Specify these eg. `docker compose -e FOO=bar up`: @@ -35,7 +43,7 @@ These arguments are all configurable in the `docker-compose.yml` file. 1. `db`: [PostgreSQL] database for the reduced package data 2. `alembic`: handles migrations -3. `pipeline`: fetches and writes data +3. `package_managers`: fetches and writes data for each package manager 4. 
`api`: a simple REST api for reading from the db ### Hard Reset @@ -59,8 +67,10 @@ Our goal is to build a data schema that looks like this: ![db/CHAI_ERD.png](db/CHAI_ERD.png) Our specific application extracts the dependency graph understand what are -critical pieces of the open-source graph. there are many other potential use -cases for this data: +critical pieces of the open-source graph. We also built a simple example that displays +[sbom-metadata](examples/sbom-meta) for your repository. + +There are many other potential use cases for this data: - License compatibility checker - Developer publications @@ -70,29 +80,6 @@ cases for this data: > [!TIP] > Help us add the above to the examples folder. -### Package Popularity - -```sql -SELECT p.name, SUM(v.downloads) as total_downloads -FROM packages p -JOIN versions v ON p.id = v.package_id -GROUP BY p.name -ORDER BY total_downloads DESC -LIMIT 10; -``` - -### Developer Publications - -```sql -SELECT u.username, p.name, COUNT(uv.id) as publications -FROM users u -JOIN user_versions uv ON u.id = uv.user_id -JOIN versions v ON uv.version_id = v.id -JOIN packages p ON v.package_id = p.id -GROUP BY u.username, p.name -ORDER BY p.name; -``` - ## FAQs / Common Issues 1. 
The database url is `postgresql://postgres:s3cr3t@localhost:5435/chai`, and diff --git a/alembic/env.py b/alembic/env.py index 2608372..78a74c6 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -1,8 +1,9 @@ import os from logging.config import fileConfig -from alembic import context from sqlalchemy import engine_from_config, pool + +from alembic import context from core.models import Base # this is the Alembic Config object, which provides diff --git a/alembic/run_migrations.sh b/alembic/run_migrations.sh index 6e6beee..bed32cb 100755 --- a/alembic/run_migrations.sh +++ b/alembic/run_migrations.sh @@ -1,30 +1,37 @@ #!/bin/bash -# wait for db to be ready +set -uo pipefail + +# This script sets up the database, runs migrations, and loads initial values + +# Wait for database to be ready until pg_isready -h db -p 5432 -U postgres; do echo "waiting for database..." sleep 2 done -# create db if needed -# if [ "$( psql -XtAc "SELECT 1 FROM pg_database WHERE datname='chai'" 2&>/dev/null)" = '1' ] +# Check if the 'chai' database exists, create it if it doesn't if [ "$( psql -XtAc "SELECT 1 FROM pg_database WHERE datname='chai'" -h db -U postgres)" = '1' ] then echo "Database 'chai' already exists" else echo "Database 'chai' does not exist, creating..." + # Run the initialization script to create the database psql -U postgres -h db -f init-script.sql -a fi -# migrate -echo "db currently at $(alembic current)" +# Run database migrations +echo "Current database version: $(alembic current)" if alembic upgrade head then - echo "migrations run successfully" + echo "Migrations completed successfully" else - echo "migrations failed" + echo "Migration failed" exit 1 fi -# load values +# Load initial values into the database +echo "Loading initial values into the database..." 
psql -U postgres -h db -d chai -f load-values.sql -a + +echo "Database setup and initialization complete" \ No newline at end of file diff --git a/alembic/versions/20240918_1200-initial_migration.py b/alembic/versions/20240918_1200-initial_migration.py index e82f3fc..6342143 100644 --- a/alembic/versions/20240918_1200-initial_migration.py +++ b/alembic/versions/20240918_1200-initial_migration.py @@ -9,6 +9,7 @@ from typing import Sequence, Union import sqlalchemy as sa + from alembic import op # revision identifiers, used by Alembic. diff --git a/alembic/versions/20240923_0821-add_load_history.py b/alembic/versions/20240923_0821-add_load_history.py index 2c4d97d..c4c192c 100644 --- a/alembic/versions/20240923_0821-add_load_history.py +++ b/alembic/versions/20240923_0821-add_load_history.py @@ -9,6 +9,7 @@ from typing import Sequence, Union import sqlalchemy as sa + from alembic import op # revision identifiers, used by Alembic. diff --git a/alembic/versions/20240925_0808-add_users_urls.py b/alembic/versions/20240925_0808-add_users_urls.py index adb06e2..50949fa 100644 --- a/alembic/versions/20240925_0808-add_users_urls.py +++ b/alembic/versions/20240925_0808-add_users_urls.py @@ -13,9 +13,10 @@ from typing import Sequence, Union import sqlalchemy as sa -from alembic import op from sqlalchemy.dialects import postgresql +from alembic import op + # revision identifiers, used by Alembic. revision: str = "a97f16d0656a" down_revision: Union[str, None] = "0db06140525f" diff --git a/alembic/versions/20240930_2034-link_tables_add_unique_constraints.py b/alembic/versions/20240930_2034-link_tables_add_unique_constraints.py index eda078b..c92f7d5 100644 --- a/alembic/versions/20240930_2034-link_tables_add_unique_constraints.py +++ b/alembic/versions/20240930_2034-link_tables_add_unique_constraints.py @@ -10,7 +10,6 @@ from alembic import op - # revision identifiers, used by Alembic. 
revision: str = "905522c68f8a" down_revision: Union[str, None] = "a97f16d0656a" diff --git a/alembic/versions/20241002_1456-new_data_models_and_indexes.py b/alembic/versions/20241002_1456-new_data_models_and_indexes.py index a7e859f..c356eb2 100644 --- a/alembic/versions/20241002_1456-new_data_models_and_indexes.py +++ b/alembic/versions/20241002_1456-new_data_models_and_indexes.py @@ -8,10 +8,11 @@ from typing import Sequence, Union -from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql +from alembic import op + # revision identifiers, used by Alembic. revision: str = "d1fca65a53c0" down_revision: Union[str, None] = "905522c68f8a" diff --git a/alembic/versions/20241003_0040-import_id_for_versions.py b/alembic/versions/20241003_0040-import_id_for_versions.py index 1887afc..13c4b43 100644 --- a/alembic/versions/20241003_0040-import_id_for_versions.py +++ b/alembic/versions/20241003_0040-import_id_for_versions.py @@ -8,9 +8,9 @@ from typing import Sequence, Union -from alembic import op import sqlalchemy as sa +from alembic import op # revision identifiers, used by Alembic. revision: str = "b806c732ebff" diff --git a/alembic/versions/20241003_1554-redo_data_model.py b/alembic/versions/20241003_1554-redo_data_model.py index c78f24b..a377002 100644 --- a/alembic/versions/20241003_1554-redo_data_model.py +++ b/alembic/versions/20241003_1554-redo_data_model.py @@ -8,10 +8,11 @@ from typing import Sequence, Union -from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql +from alembic import op + # revision identifiers, used by Alembic. 
revision: str = "8423f70b5354" down_revision: Union[str, None] = "b806c732ebff" diff --git a/alembic/versions/20241009_0915-modify_users_username_uniqueness.py b/alembic/versions/20241009_0915-modify_users_username_uniqueness.py index bb2e9a1..6db11cc 100644 --- a/alembic/versions/20241009_0915-modify_users_username_uniqueness.py +++ b/alembic/versions/20241009_0915-modify_users_username_uniqueness.py @@ -10,7 +10,6 @@ from alembic import op - # revision identifiers, used by Alembic. revision: str = "3a8f2c4f018d" down_revision: Union[str, None] = "8423f70b5354" diff --git a/alembic/versions/20241010_1347-add_license_name_and_id_index.py b/alembic/versions/20241010_1347-add_license_name_and_id_index.py index 2783be9..79d0e9b 100644 --- a/alembic/versions/20241010_1347-add_license_name_and_id_index.py +++ b/alembic/versions/20241010_1347-add_license_name_and_id_index.py @@ -10,7 +10,6 @@ from alembic import op - # revision identifiers, used by Alembic. revision: str = "d183dcc4bdc8" down_revision: Union[str, None] = "c719192063b5" diff --git a/core/README.md b/core/README.md new file mode 100644 index 0000000..d77cffd --- /dev/null +++ b/core/README.md @@ -0,0 +1,83 @@ +# Core Tools for CHAI Python Loaders + +This directory contains a set of core tools and utilities to facilitate loading the CHAI +database with package manager data, using python helpers. These tools provide a common +foundation for fetching, transforming, and loading data from various package managers +into the database. + +## Key Components + +### 1. [Config](config.py) + +The Config module provides configuration management for loaders. It includes: + +- `PackageManager` enum for supported package managers +- `Config` class for storing loader-specific configurations +- Functions for initializing configurations and loading various types (URL types, + user types, package manager IDs, dependency types) + +### 2. 
[Database](db.py) + +The DB class offers a set of methods for interacting with the database, including: + +- Inserting and selecting data for packages, versions, users, dependencies, and more +- Caching mechanisms to improve performance +- Batch processing capabilities for efficient data insertion + +### 3. [Fetcher](fetcher.py) + +The Fetcher class provides functionality for downloading and extracting data from +package manager sources. It supports: + +- Downloading tarball files +- Extracting contents to a specified directory + +### 4. [Logger](logger.py) + +A custom logging utility that provides consistent logging across all loaders. + +### 5. [Models](models/__init__.py) + +SQLAlchemy models representing the database schema, including: + +- Package, Version, User, License, DependsOn, and other relevant tables + +> [!NOTE] +> +> This is currently used to actually generate the migrations as well + +### 6. [Scheduler](scheduler.py) + +A scheduling utility that allows loaders to run at specified intervals. + +### 7. [Transformer](transformer.py) + +The Transformer class provides a base for creating package manager-specific transformers. +It includes: + +- Methods for locating and reading input files +- Placeholder methods for transforming data into the required format + +## Usage + +To create a new loader for a package manager: + +1. Create a new directory under `package_managers/` for your package manager. +1. Implement a fetcher that inherits from the base Fetcher, that is able to fetch + the raw data from the package manager's source. +1. Implement a custom Transformer class that inherits from the base Transformer, that + figures out how to map the raw data provided by the package managers into the data + model described in the [models](models/__init__.py) module. +1. Create a main script that utilizes the core components (Config, DB, Fetcher, + Transformer, Scheduler) to fetch, transform, and load data. 
+ +Example usage can be found in the [crates](../package_managers/crates) loader. + +## Contributing + +When adding new functionality or modifying existing core components, please ensure that +changes are compatible with all existing loaders and follow the established patterns +and conventions. + +For more detailed information on each component, refer to the individual files and their +docstrings. diff --git a/core/db.py b/core/db.py index 9efe13d..33b649f 100644 --- a/core/db.py +++ b/core/db.py @@ -320,7 +320,6 @@ def process_url(item: Dict[str, str]): def insert_package_urls(self, package_url_generator: Iterable[dict[str, str]]): url_cache: Dict[tuple[str, str], UUID] = {} - # package_cache: Dict[str, UUID] = {} def fetch_packages_and_urls(items: List[Dict[str, str]]): package_ids = build_query_params(items, self.package_cache, "import_id") diff --git a/core/fetcher.py b/core/fetcher.py index edfd741..8ad975c 100644 --- a/core/fetcher.py +++ b/core/fetcher.py @@ -6,6 +6,7 @@ from typing import Any from requests import get + from core.logger import Logger diff --git a/core/logger.py b/core/logger.py index bef9e82..4f36114 100644 --- a/core/logger.py +++ b/core/logger.py @@ -22,7 +22,7 @@ def __init__(self, name: str, mode=NORMAL, start=time.time()) -> None: self.mode = Logger.VERBOSE if DEBUG else mode def print(self, msg: str): - print(f"{self.time_diff():.2f}: [{self.name}]: {msg}") + print(f"{self.time_diff():.2f}: [{self.name}]: {msg}", flush=True) def error(self, message): self.print(f"[ERROR]: {message}") diff --git a/core/models/__init__.py b/core/models/__init__.py index b7a11b3..bc43885 100644 --- a/core/models/__init__.py +++ b/core/models/__init__.py @@ -11,10 +11,6 @@ UniqueConstraint, func, ) - -# from sqlalchemy.orm import Mapped -# from sqlalchemy.orm import mapped_column -# from sqlalchemy.orm import relationship from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.ext.declarative import declarative_base @@ -294,7 +290,6 @@ class 
URLType(Base): ) -# usernames can come from different sources, but within a source, they are probably unique class User(Base): __tablename__ = "users" __table_args__ = ( diff --git a/core/scheduler.py b/core/scheduler.py index c144a53..32a33c4 100644 --- a/core/scheduler.py +++ b/core/scheduler.py @@ -1,8 +1,10 @@ -from os import getenv -import schedule import time +from os import getenv from threading import Thread from typing import Callable + +import schedule + from core.logger import Logger FREQUENCY = int(getenv("FREQUENCY", 24)) diff --git a/docker-compose.yml b/docker-compose.yml index b68bfbb..45ec804 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -34,7 +34,7 @@ services: environment: - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - PYTHONPATH=/ - - DEBUG=${DEBUG:-true} + - DEBUG=${DEBUG:-false} - TEST=${TEST:-false} - FETCH=${FETCH:-true} - FREQUENCY=${FREQUENCY:-24} diff --git a/monitor/main.py b/monitor/main.py index 97cf5d2..6686991 100644 --- a/monitor/main.py +++ b/monitor/main.py @@ -1,8 +1,9 @@ -import time -from typing import Tuple, Optional, Dict -import docker import json +import time from collections import defaultdict +from typing import Dict, Optional, Tuple + +import docker PIPELINE_CONTAINER = "chai-oss-pipeline-1" DATABASE_CONTAINER = "chai-oss-db-1" diff --git a/package_managers/crates/main.py b/package_managers/crates/main.py index e8a030c..38f98b6 100644 --- a/package_managers/crates/main.py +++ b/package_managers/crates/main.py @@ -19,15 +19,14 @@ def fetch(config: Config) -> None: def load(db: DB, transformer: CratesTransformer, config: Config) -> None: db.insert_packages(transformer.packages(), config.package_manager_id, "crates") - db.insert_versions(transformer.versions()) db.insert_users(transformer.users(), config.user_types.crates) db.insert_user_packages(transformer.user_packages()) - db.insert_urls(transformer.urls()) if not config.test: - # these are bigger files, so we skip them in tests + 
db.insert_urls(transformer.urls()) + db.insert_package_urls(transformer.package_urls()) + db.insert_versions(transformer.versions()) db.insert_user_versions(transformer.user_versions(), config.user_types.github) - # db.insert_package_urls(transformer.package_urls()) FIXME db.insert_dependencies(transformer.dependencies()) db.insert_load_history(config.package_manager_id)