From cffd6fbb3aa2a1d3a57c41e0078684aa71e08090 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Mon, 21 Oct 2024 13:21:55 -0500 Subject: [PATCH] homebrew (#8) * big reorg * docker builds db and alembic * alembic and db docker actually builds now also, migrations run * reloaded the crates stuff, and fixed imports * crates image is building and running * crates works * wait on homebrew * monitor service is building * env vars for crates * monitor service is running * delete some old stuff * the scheduler class * remove old scheduler * crates working with docker compose * jq formulae for homebrew * fix packages script * start of orchestrator for homebrew...that works! * jq transforms * volume mapping, docker compose * fix crates import * lint on db.py * config for crates * Dockerfile? * docker compose yml todo * we've got csvs! * everything but load * cleanup * inserting packages works! * all homebrew except dependencies * correctly create homebrew package manager row * yep, much faster * crates fix to create source and package manager * remove sed * config changes to load dependency types * create the data types upfront * get homebrew env vars * remove python * replace sed with jq * jq corrections, pipeline.sh fixes, dockerfile * pipeline.sh improvements * some cleanups --------- Co-authored-by: Jacob Heider --- .dockerignore | 6 +- alembic/Dockerfile | 4 +- alembic/env.py | 12 +- alembic/load-values.sql | 26 ++++ alembic/run_migrations.sh | 3 + ..._0803-package_managers_should_be_unique.py | 29 ++++ core/config.py | 127 +++++++++++++++++ src/pipeline/utils/pg.py => core/db.py | 38 +++++- {src/pipeline/utils => core}/fetcher.py | 2 +- {src/pipeline/utils => core}/logger.py | 13 +- {src/pipeline => core}/models/__init__.py | 4 +- core/scheduler.py | 39 ++++++ core/structs.py | 38 ++++++ {src/pipeline/utils => core}/transformer.py | 2 +- {src/pipeline/utils => core}/utils.py | 7 +- docker-compose.yml | 55 ++++---- package_managers/crates/Dockerfile | 5 + 
package_managers/crates/main.py | 72 ++++++++++ .../crates}/requirements.txt | 2 +- .../crates/structs.py | 15 -- .../crates/transformer.py | 7 +- package_managers/homebrew/Dockerfile | 9 ++ package_managers/homebrew/jq/dependencies.jq | 30 ++++ package_managers/homebrew/jq/package_url.jq | 19 +++ package_managers/homebrew/jq/packages.jq | 13 ++ package_managers/homebrew/jq/urls.jq | 19 +++ package_managers/homebrew/jq/versions.jq | 23 ++++ package_managers/homebrew/pipeline.sh | 129 ++++++++++++++++++ .../homebrew/sql/homebrew_vars.sql | 28 ++++ pyproject.toml | 30 ---- src/.pkgx.yaml | 6 - src/Dockerfile | 5 - src/pipeline/crates.py | 122 ----------------- src/pipeline/main.py | 39 ------ src/pipeline/pkgx.py | 58 -------- src/run_pipeline.sh | 7 - src/run_scheduler.py | 52 ------- 37 files changed, 692 insertions(+), 403 deletions(-) create mode 100644 alembic/load-values.sql create mode 100644 alembic/versions/20241021_0803-package_managers_should_be_unique.py create mode 100644 core/config.py rename src/pipeline/utils/pg.py => core/db.py (94%) rename {src/pipeline/utils => core}/fetcher.py (98%) rename {src/pipeline/utils => core}/logger.py (80%) rename {src/pipeline => core}/models/__init__.py (99%) create mode 100644 core/scheduler.py create mode 100644 core/structs.py rename {src/pipeline/utils => core}/transformer.py (96%) rename {src/pipeline/utils => core}/utils.py (58%) create mode 100644 package_managers/crates/Dockerfile create mode 100644 package_managers/crates/main.py rename {src => package_managers/crates}/requirements.txt (97%) rename src/pipeline/utils/crates/structures.py => package_managers/crates/structs.py (51%) rename {src/pipeline/utils => package_managers}/crates/transformer.py (97%) create mode 100644 package_managers/homebrew/Dockerfile create mode 100644 package_managers/homebrew/jq/dependencies.jq create mode 100644 package_managers/homebrew/jq/package_url.jq create mode 100644 package_managers/homebrew/jq/packages.jq create mode 
100644 package_managers/homebrew/jq/urls.jq create mode 100644 package_managers/homebrew/jq/versions.jq create mode 100755 package_managers/homebrew/pipeline.sh create mode 100644 package_managers/homebrew/sql/homebrew_vars.sql delete mode 100644 pyproject.toml delete mode 100644 src/.pkgx.yaml delete mode 100644 src/Dockerfile delete mode 100644 src/pipeline/crates.py delete mode 100644 src/pipeline/main.py delete mode 100644 src/pipeline/pkgx.py delete mode 100755 src/run_pipeline.sh delete mode 100644 src/run_scheduler.py diff --git a/.dockerignore b/.dockerignore index 0384b02..a00944d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,8 +1,10 @@ # directories data/ -db/ .venv/ # other files .gitignore -docker-compose.yml \ No newline at end of file +docker-compose.yml +.DS_Store +.git +README.md \ No newline at end of file diff --git a/alembic/Dockerfile b/alembic/Dockerfile index 24b6261..8749c8b 100644 --- a/alembic/Dockerfile +++ b/alembic/Dockerfile @@ -7,5 +7,5 @@ RUN apt -y install python3-psycopg2 RUN apt -y install python3-sqlalchemy python3-sqlalchemy-ext COPY . . WORKDIR /alembic -RUN chmod +x run_migrations.sh -ENTRYPOINT ["/alembic/run_migrations.sh"] +RUN chmod +x /alembic/run_migrations.sh +ENTRYPOINT ["/alembic/run_migrations.sh"] \ No newline at end of file diff --git a/alembic/env.py b/alembic/env.py index ca8b713..2608372 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -1,10 +1,9 @@ import os from logging.config import fileConfig -from sqlalchemy import engine_from_config, pool - from alembic import context -from src.pipeline.models import Base +from sqlalchemy import engine_from_config, pool +from core.models import Base # this is the Alembic Config object, which provides # access to the values within the .ini file in use. 
@@ -40,7 +39,6 @@ def run_migrations_offline() -> None: target_metadata=target_metadata, literal_binds=True, dialect_opts={"paramstyle": "named"}, - compare_server_default=True, ) with context.begin_transaction(): @@ -60,11 +58,7 @@ def run_migrations_online() -> None: ) with connectable.connect() as connection: - context.configure( - connection=connection, - target_metadata=target_metadata, - compare_server_default=True, - ) + context.configure(connection=connection, target_metadata=target_metadata) with context.begin_transaction(): context.run_migrations() diff --git a/alembic/load-values.sql b/alembic/load-values.sql new file mode 100644 index 0000000..1bdde5d --- /dev/null +++ b/alembic/load-values.sql @@ -0,0 +1,26 @@ +-- url types +INSERT INTO "url_types" ("name") +VALUES ('source'), ('homepage'), ('documentation'), ('repository') +ON CONFLICT (name) DO NOTHING; + +-- dependency types +INSERT INTO "depends_on_types" ("name") +VALUES +('build'), +('development'), +('runtime'), +('test'), +('optional'), +('recommended'), +('uses_from_macos') +ON CONFLICT (name) DO NOTHING; + +-- sources +INSERT INTO "sources" ("type") +VALUES ('crates'), ('npm'), ('pypi'), ('rubygems'), ('github'), ('homebrew') +ON CONFLICT (type) DO NOTHING; + +INSERT INTO "package_managers" ("source_id") +SELECT id +FROM "sources" +WHERE "type" IN ('crates', 'npm', 'pypi', 'rubygems', 'github', 'homebrew'); diff --git a/alembic/run_migrations.sh b/alembic/run_migrations.sh index 88e081a..6e6beee 100755 --- a/alembic/run_migrations.sh +++ b/alembic/run_migrations.sh @@ -25,3 +25,6 @@ else echo "migrations failed" exit 1 fi + +# load values +psql -U postgres -h db -d chai -f load-values.sql -a diff --git a/alembic/versions/20241021_0803-package_managers_should_be_unique.py b/alembic/versions/20241021_0803-package_managers_should_be_unique.py new file mode 100644 index 0000000..01bf5df --- /dev/null +++ b/alembic/versions/20241021_0803-package_managers_should_be_unique.py @@ -0,0 +1,29 @@ 
+"""package managers should be unique + +Revision ID: 38cc41599874 +Revises: 2481138a729a +Create Date: 2024-10-21 08:03:43.647535 + +""" + +from typing import Sequence, Union + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "38cc41599874" +down_revision: Union[str, None] = "2481138a729a" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_unique_constraint( + op.f("uq_package_managers_source_id"), "package_managers", ["source_id"] + ) + + +def downgrade() -> None: + op.drop_constraint( + op.f("uq_package_managers_source_id"), "package_managers", type_="unique" + ) diff --git a/core/config.py b/core/config.py new file mode 100644 index 0000000..82eb0f4 --- /dev/null +++ b/core/config.py @@ -0,0 +1,127 @@ +from dataclasses import dataclass +from os import getenv + +from core.db import DB +from core.logger import Logger +from core.structs import ( + DependencyTypes, + PackageManager, + PackageManagerIDs, + Sources, + URLTypes, + UserTypes, +) + +logger = Logger("config") + +TEST = getenv("TEST", "false").lower() == "true" +FETCH = getenv("FETCH", "true").lower() == "true" + + +@dataclass +class Config: + file_location: str + test: bool + fetch: bool + package_manager_id: str + url_types: URLTypes + user_types: UserTypes + dependency_types: DependencyTypes + + def __str__(self): + return f"Config(file_location={self.file_location}, test={self.test}, \ + fetch={self.fetch}, package_manager_id={self.package_manager_id}, \ + url_types={self.url_types}, user_types={self.user_types}, \ + dependency_types={self.dependency_types})" + + +def load_url_types(db: DB) -> URLTypes: + logger.debug("loading url types, and creating if not exists") + homepage_url = db.select_url_types_homepage(create=True) + repository_url = db.select_url_types_repository(create=True) + documentation_url = db.select_url_types_documentation(create=True) + source_url = 
db.select_url_types_source(create=True) + return URLTypes( + homepage=homepage_url.id, + repository=repository_url.id, + documentation=documentation_url.id, + source=source_url.id, + ) + + +def load_user_types(db: DB) -> UserTypes: + logger.debug("loading user types, and creating if not exists") + crates_source = db.select_source_by_name("crates", create=True) + github_source = db.select_source_by_name("github", create=True) + return UserTypes( + crates=crates_source.id, + github=github_source.id, + ) + + +def load_package_manager_ids(db: DB) -> PackageManagerIDs: + logger.debug("loading package manager ids, and creating if not exists") + crates_package_manager = db.select_package_manager_by_name("crates", create=True) + homebrew_package_manager = db.select_package_manager_by_name( + "homebrew", create=True + ) + return { + PackageManager.CRATES: crates_package_manager.id, + PackageManager.HOMEBREW: homebrew_package_manager.id, + } + + +def load_dependency_types(db: DB) -> DependencyTypes: + logger.debug("loading dependency types, and creating if not exists") + build_dep_type = db.select_dependency_type_by_name("build", create=True) + dev_dep_type = db.select_dependency_type_by_name("development", create=True) + runtime_dep_type = db.select_dependency_type_by_name("runtime", create=True) + test_dep_type = db.select_dependency_type_by_name("test", create=True) + optional_dep_type = db.select_dependency_type_by_name("optional", create=True) + recommended_dep_type = db.select_dependency_type_by_name("recommended", create=True) + return DependencyTypes( + build=build_dep_type.id, + development=dev_dep_type.id, + runtime=runtime_dep_type.id, + test=test_dep_type.id, + optional=optional_dep_type.id, + recommended=recommended_dep_type.id, + ) + + +def load_sources() -> Sources: + return { + PackageManager.CRATES: "https://static.crates.io/db-dump.tar.gz", + PackageManager.HOMEBREW: ( + "https://github.com/Homebrew/homebrew-core/tree/master/Formula" + ), + } + + +def 
initialize(package_manager: PackageManager, db: DB) -> Config: + url_types = load_url_types(db) + user_types = load_user_types(db) + package_manager_ids = load_package_manager_ids(db) + dependency_types = load_dependency_types(db) + sources = load_sources() + + if package_manager == PackageManager.CRATES: + return Config( + file_location=sources[PackageManager.CRATES], + test=False, + fetch=True, + package_manager_id=package_manager_ids[PackageManager.CRATES], + url_types=url_types, + user_types=user_types, + dependency_types=dependency_types, + ) + elif package_manager == PackageManager.HOMEBREW: + return Config( + file_location=sources[PackageManager.HOMEBREW], + test=False, + fetch=True, + package_manager_id=package_manager_ids[PackageManager.HOMEBREW], + url_types=url_types, + user_types=user_types, + dependency_types=dependency_types, + ) diff --git a/src/pipeline/utils/pg.py b/core/db.py similarity index 94% rename from src/pipeline/utils/pg.py rename to core/db.py index 5867d97..9efe13d 100644 --- a/src/pipeline/utils/pg.py +++ b/core/db.py @@ -1,13 +1,17 @@ import os from typing import Any, Dict, Iterable, List, Type -from src.pipeline.utils.utils import build_query_params + from sqlalchemy import UUID, create_engine from sqlalchemy.dialects import postgresql from sqlalchemy.dialects.postgresql import insert from sqlalchemy.orm import sessionmaker from sqlalchemy.orm.decl_api import DeclarativeMeta -from src.pipeline.models import ( + +from core.logger import Logger +from core.models import ( + URL, DependsOn, + DependsOnType, License, LoadHistory, Package, @@ -16,12 +20,11 @@ Source, URLType, User, - URL, UserPackage, UserVersion, Version, ) -from src.pipeline.utils.logger import Logger +from core.utils import build_query_params CHAI_DATABASE_URL = os.getenv("CHAI_DATABASE_URL") DEFAULT_BATCH_SIZE = 10000 @@ -371,8 +374,13 @@ def process_package_url(item: Dict[str, str]): PackageURL, self._process_batch(batch, process_package_url) ) - def 
insert_source(self, name: str) -> UUID: + def insert_source(self, name: str) -> Source: with self.session() as session: + existing_source = session.query(Source).filter_by(type=name).first() + if existing_source: + self.logger.warn(f"Source '{name}' already exists") + return existing_source + session.add(Source(type=name)) session.commit() return session.query(Source).filter_by(type=name).first() @@ -419,6 +427,9 @@ def select_url_types_repository(self, create: bool = False) -> URLType | None: def select_url_types_documentation(self, create: bool = False) -> URLType | None: return self.select_url_type("documentation", create) + def select_url_types_source(self, create: bool = False) -> URLType | None: + return self.select_url_type("source", create) + def select_package_manager_by_name( self, package_manager: str, create: bool = False ) -> PackageManager | None: @@ -514,3 +525,20 @@ def select_packages_by_import_ids(self, iids: Iterable[str]) -> List[Package]: def select_licenses_by_name(self, names: Iterable[str]) -> List[License]: with self.session() as session: return session.query(License).filter(License.name.in_(names)).all() + + def select_dependency_type_by_name( + self, name: str, create: bool = False + ) -> DependsOnType: + with self.session() as session: + result = session.query(DependsOnType).filter_by(name=name).first() + if result: + return result + if create: + return self.insert_dependency_type(name) + return None + + def insert_dependency_type(self, name: str) -> DependsOnType: + with self.session() as session: + session.add(DependsOnType(name=name)) + session.commit() + return session.query(DependsOnType).filter_by(name=name).first() diff --git a/src/pipeline/utils/fetcher.py b/core/fetcher.py similarity index 98% rename from src/pipeline/utils/fetcher.py rename to core/fetcher.py index da25064..edfd741 100644 --- a/src/pipeline/utils/fetcher.py +++ b/core/fetcher.py @@ -6,7 +6,7 @@ from typing import Any from requests import get -from 
src.pipeline.utils.logger import Logger +from core.logger import Logger @dataclass diff --git a/src/pipeline/utils/logger.py b/core/logger.py similarity index 80% rename from src/pipeline/utils/logger.py rename to core/logger.py index 0c2cc88..bef9e82 100644 --- a/src/pipeline/utils/logger.py +++ b/core/logger.py @@ -1,13 +1,10 @@ -from os import getenv -import time import sys +import time import traceback +from os import getenv -DEBUG = getenv("DEBUG", "false").lower() == "true" - -# use inspect to print the line of code as well? -# caller = inspect.currentframe().f_back -# filename = caller.f_code.co_filename, lineno = caller.f_lineno +debug = getenv("DEBUG", "false").lower() +DEBUG = debug == "true" or debug == "1" def as_minutes(seconds: float) -> float: @@ -25,7 +22,7 @@ def __init__(self, name: str, mode=NORMAL, start=time.time()) -> None: self.mode = Logger.VERBOSE if DEBUG else mode def print(self, msg: str): - print(f"{self.time_diff():.2f}: [{self.name}]: {msg}", flush=True) + print(f"{self.time_diff():.2f}: [{self.name}]: {msg}") def error(self, message): self.print(f"[ERROR]: {message}") diff --git a/src/pipeline/models/__init__.py b/core/models/__init__.py similarity index 99% rename from src/pipeline/models/__init__.py rename to core/models/__init__.py index 1dcfbcc..b7a11b3 100644 --- a/src/pipeline/models/__init__.py +++ b/core/models/__init__.py @@ -78,7 +78,9 @@ class PackageManager(Base): default=func.uuid_generate_v4(), server_default=func.uuid_generate_v4(), ) - source_id = Column(UUID(as_uuid=True), ForeignKey("sources.id"), nullable=False) + source_id = Column( + UUID(as_uuid=True), ForeignKey("sources.id"), nullable=False, unique=True + ) created_at = Column( DateTime, nullable=False, default=func.now(), server_default=func.now() ) diff --git a/core/scheduler.py b/core/scheduler.py new file mode 100644 index 0000000..c144a53 --- /dev/null +++ b/core/scheduler.py @@ -0,0 +1,39 @@ +from os import getenv +import schedule +import time +from 
threading import Thread +from typing import Callable +from core.logger import Logger + +FREQUENCY = int(getenv("FREQUENCY", 24)) + + +class Scheduler: + def __init__(self, name: str, frequency: int = FREQUENCY): + self.name = name + self.frequency = frequency + self.logger = Logger(f"{name}_scheduler") + self.job = None + self.is_running = False + + def start(self, task: Callable, *args): + self.job = schedule.every(self.frequency).hours.do(task, *args) + self.is_running = True + self.logger.log(f"scheduled {self.name} to run every {self.frequency} hours") + + def run_schedule(): + while self.is_running: + schedule.run_pending() + time.sleep(1) + + Thread(target=run_schedule, daemon=True).start() + + def stop(self): + if self.job: + schedule.cancel_job(self.job) + self.is_running = False + self.logger.log(f"stopped {self.name} scheduler") + + def run_now(self, task: Callable, *args): + self.logger.log(f"running {self.name} now") + task(*args) diff --git a/core/structs.py b/core/structs.py new file mode 100644 index 0000000..d926438 --- /dev/null +++ b/core/structs.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Dict + +from sqlalchemy import UUID + + +class PackageManager(Enum): + CRATES = "crates" + HOMEBREW = "homebrew" + + +PackageManagerIDs = Dict[PackageManager, UUID] +Sources = Dict[PackageManager, str] + + +@dataclass +class URLTypes: + homepage: UUID + repository: UUID + documentation: UUID + source: UUID + + +@dataclass +class UserTypes: + crates: UUID + github: UUID + + +@dataclass +class DependencyTypes: + build: UUID + development: UUID + runtime: UUID + test: UUID + optional: UUID + recommended: UUID diff --git a/src/pipeline/utils/transformer.py b/core/transformer.py similarity index 96% rename from src/pipeline/utils/transformer.py rename to core/transformer.py index 47f0cf8..0de6147 100644 --- a/src/pipeline/utils/transformer.py +++ b/core/transformer.py @@ -4,7 +4,7 @@ from sqlalchemy import UUID 
-from src.pipeline.utils.logger import Logger +from core.logger import Logger # this is a temporary fix, but sometimes the raw files have weird characters # and lots of data within certain fields diff --git a/src/pipeline/utils/utils.py b/core/utils.py similarity index 58% rename from src/pipeline/utils/utils.py rename to core/utils.py index 3de9bee..f622cf9 100644 --- a/src/pipeline/utils/utils.py +++ b/core/utils.py @@ -1,4 +1,4 @@ -from typing import List, Dict +from typing import Dict, List def safe_int(val: str) -> int | None: @@ -7,10 +7,7 @@ def safe_int(val: str) -> int | None: return int(val) -# given some items and a cache, this returns a list of attributes that are not in the -# cache so that we can use them in a query -# attr has to be an attribute in the item -# item[attr] is a key in the cache +# TODO: needs explanation or simplification def build_query_params( items: List[Dict[str, str]], cache: dict, attr: str ) -> List[str]: diff --git a/docker-compose.yml b/docker-compose.yml index 8e152f6..99cad6b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,7 +16,7 @@ services: alembic: build: - context: ./ + context: . dockerfile: ./alembic/Dockerfile environment: - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai @@ -27,60 +27,53 @@ services: working_dir: /alembic entrypoint: ["./run_migrations.sh"] - pipeline: + crates: build: - context: ./ - dockerfile: ./src/Dockerfile - args: - BUILDKIT_PROGRESS: plain + context: . 
+ dockerfile: ./package_managers/crates/Dockerfile environment: - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - - FREQUENCY=${FREQUENCY:-24} - PYTHONPATH=/ - - PKG_MANAGER=${PKG_MANAGER:-crates} - - TEST=${TEST:-false} - DEBUG=${DEBUG:-true} + - TEST=${TEST:-false} - FETCH=${FETCH:-true} + - FREQUENCY=${FREQUENCY:-24} depends_on: db: condition: service_healthy alembic: condition: service_completed_successfully - working_dir: /src - entrypoint: ["./run_pipeline.sh"] - api: + homebrew: build: - context: ./api - dockerfile: Dockerfile + context: . + dockerfile: ./package_managers/homebrew/Dockerfile environment: - - DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - - HOST=0.0.0.0 - - PORT=8080 - ports: - - "8080:8080" + - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai + - TEST=${TEST:-false} + - FETCH=${FETCH:-true} + - FREQUENCY=${FREQUENCY:-24} + - SOURCE=https://formulae.brew.sh/api/formula.json + - CODE_DIR=/package_managers/homebrew + - DATA_DIR=/data/homebrew + volumes: + - ./data/homebrew:/data/homebrew depends_on: db: condition: service_healthy alembic: condition: service_completed_successfully - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8080/heartbeat"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 5s monitor: - build: monitor + build: + context: ./monitor + dockerfile: Dockerfile environment: - DOCKER_HOST=${DOCKER_HOST:-unix:///var/run/docker.sock} volumes: - - ./monitor:/app - /var/run/docker.sock:/var/run/docker.sock depends_on: - pipeline: + crates: + condition: service_started + homebrew: condition: service_started - working_dir: /usr/src/monitor - entrypoint: ["./run_monitor.sh"] diff --git a/package_managers/crates/Dockerfile b/package_managers/crates/Dockerfile new file mode 100644 index 0000000..f599716 --- /dev/null +++ b/package_managers/crates/Dockerfile @@ -0,0 +1,5 @@ +FROM python:3.11 +COPY . . 
+WORKDIR /package_managers/crates +RUN pip install --no-cache-dir -r requirements.txt +CMD ["python", "/package_managers/crates/main.py"] \ No newline at end of file diff --git a/package_managers/crates/main.py b/package_managers/crates/main.py new file mode 100644 index 0000000..e8a030c --- /dev/null +++ b/package_managers/crates/main.py @@ -0,0 +1,72 @@ +import time + +from core.config import Config, PackageManager, initialize +from core.db import DB +from core.fetcher import TarballFetcher +from core.logger import Logger +from core.scheduler import Scheduler +from package_managers.crates.transformer import CratesTransformer + +logger = Logger("crates_orchestrator") +crates = PackageManager.CRATES + + +def fetch(config: Config) -> None: + fetcher = TarballFetcher("crates", config.file_location) + files = fetcher.fetch() + fetcher.write(files) + + +def load(db: DB, transformer: CratesTransformer, config: Config) -> None: + db.insert_packages(transformer.packages(), config.package_manager_id, "crates") + db.insert_versions(transformer.versions()) + db.insert_users(transformer.users(), config.user_types.crates) + db.insert_user_packages(transformer.user_packages()) + db.insert_urls(transformer.urls()) + + if not config.test: + # these are bigger files, so we skip them in tests + db.insert_user_versions(transformer.user_versions(), config.user_types.github) + # db.insert_package_urls(transformer.package_urls()) FIXME + db.insert_dependencies(transformer.dependencies()) + + db.insert_load_history(config.package_manager_id) + logger.log("✅ crates") + + +def run_pipeline(db: DB, config: Config) -> None: + if config.fetch: + fetch(config) + + transformer = CratesTransformer(config.url_types, config.user_types) + load(db, transformer, config) + + coda = ( + "validate by running " + + '`psql "postgresql://postgres:s3cr3t@localhost:5435/chai" ' + + '-c "SELECT * FROM load_history;"`' + ) + logger.log(coda) + + +def main(): + db = DB() + config = initialize(crates, db) + 
logger.debug(config) + + scheduler = Scheduler("crates") + scheduler.start(run_pipeline, db, config) + + # run immediately + scheduler.run_now(run_pipeline, db, config) + + # keep the main thread alive so we can terminate the program with Ctrl+C + try: + while True: + time.sleep(1) + except KeyboardInterrupt: + scheduler.stop() + + +if __name__ == "__main__": + main() diff --git a/src/requirements.txt b/package_managers/crates/requirements.txt similarity index 97% rename from src/requirements.txt rename to package_managers/crates/requirements.txt index 63fff01..7357d56 100644 --- a/src/requirements.txt +++ b/package_managers/crates/requirements.txt @@ -31,4 +31,4 @@ typing-extensions==4.12.2 # alembic # sqlalchemy urllib3==2.2.2 - # via requests + # via requests \ No newline at end of file diff --git a/src/pipeline/utils/crates/structures.py b/package_managers/crates/structs.py similarity index 51% rename from src/pipeline/utils/crates/structures.py rename to package_managers/crates/structs.py index 06b2e6c..60f557e 100644 --- a/src/pipeline/utils/crates/structures.py +++ b/package_managers/crates/structs.py @@ -1,6 +1,4 @@ from enum import IntEnum -from dataclasses import dataclass -from sqlalchemy import UUID class DependencyType(IntEnum): @@ -11,16 +9,3 @@ class DependencyType(IntEnum): def __str__(self): return self.name.lower() - - -@dataclass -class URLTypes: - homepage: UUID - repository: UUID - documentation: UUID - - -@dataclass -class UserTypes: - crates: UUID - github: UUID diff --git a/src/pipeline/utils/crates/transformer.py b/package_managers/crates/transformer.py similarity index 97% rename from src/pipeline/utils/crates/transformer.py rename to package_managers/crates/transformer.py index ded4ff5..33097fb 100644 --- a/src/pipeline/utils/crates/transformer.py +++ b/package_managers/crates/transformer.py @@ -1,9 +1,10 @@ import csv from typing import Dict, Generator -from src.pipeline.utils.utils import safe_int -from 
src.pipeline.utils.crates.structures import DependencyType, URLTypes, UserTypes -from src.pipeline.utils.transformer import Transformer +from core.structs import URLTypes, UserTypes +from core.transformer import Transformer +from core.utils import safe_int +from package_managers.crates.structs import DependencyType # crates provides homepage and repository urls, so we'll initialize this transformer diff --git a/package_managers/homebrew/Dockerfile b/package_managers/homebrew/Dockerfile new file mode 100644 index 0000000..895fccc --- /dev/null +++ b/package_managers/homebrew/Dockerfile @@ -0,0 +1,9 @@ +FROM python:3.11 +RUN apt-get update && \ + apt-get install -y jq curl postgresql-client && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* +COPY . . +WORKDIR /package_managers/homebrew +RUN chmod +x /package_managers/homebrew/pipeline.sh +CMD ["/package_managers/homebrew/pipeline.sh"] diff --git a/package_managers/homebrew/jq/dependencies.jq b/package_managers/homebrew/jq/dependencies.jq new file mode 100644 index 0000000..811c0b4 --- /dev/null +++ b/package_managers/homebrew/jq/dependencies.jq @@ -0,0 +1,30 @@ +# TODO: variations (linux only, by architecture) + +[.[] | { + package_name: .name, + build_deps: .build_dependencies, + runtime_deps: .dependencies, + recommended_deps: .recommended_dependencies, + test_deps: .test_dependencies, + optional_deps: .optional_dependencies, + uses_from_macos: .uses_from_macos +} | + # here's where we'd substitute the depends_on_type ids, for each depends_on type ids + # the `[]` at the end is to ensure that we're exploding the arrays, so each dependency gets its own row! 
+ {package_name: .package_name, depends_on_type: $build_deps_type_id, depends_on: .build_deps[]}, + {package_name: .package_name, depends_on_type: $runtime_deps_type_id, depends_on: .runtime_deps[]}, + {package_name: .package_name, depends_on_type: $recommended_deps_type_id, depends_on: .recommended_deps[]}, + {package_name: .package_name, depends_on_type: $test_deps_type_id, depends_on: .test_deps[]}, + {package_name: .package_name, depends_on_type: $optional_deps_type_id, depends_on: .optional_deps[]}, + {package_name: .package_name, depends_on_type: $uses_from_macos_type_id, depends_on: .uses_from_macos[]} + | + # now, filter out the null dependencies + select(.depends_on != null) | + # and only look at the ones that are strings TODO: some are JSONs? + select(.depends_on | type == "string") | + # generate the sql statements! + "INSERT INTO dependencies (version_id, dependency_id, dependency_type_id) VALUES ( + (SELECT id FROM versions WHERE import_id = '" + .package_name + "'), + (SELECT id FROM packages WHERE import_id = '" + .depends_on + "'), + '" + .depends_on_type + "') ON CONFLICT DO NOTHING;" +] | join("\n") \ No newline at end of file diff --git a/package_managers/homebrew/jq/package_url.jq b/package_managers/homebrew/jq/package_url.jq new file mode 100644 index 0000000..9e63306 --- /dev/null +++ b/package_managers/homebrew/jq/package_url.jq @@ -0,0 +1,19 @@ +# mapping package to urls is straightforward +# but, in the first normal form we've gotta do the mapping ourselves +# luckily, homebrew is small enough that we can push some of that work to the db + +[.[] | { + package_name: .name, + homepage_url: .homepage, + source_url: .urls.stable.url +} | + # here's where we substitute the url type ids, for each url type + {package_name: .package_name, type: $homepage_url_type_id, url: .homepage_url}, + {package_name: .package_name, type: $source_url_type_id, url: .source_url} + | + # and here we say "for each url, generate an insert statement" + "INSERT INTO 
package_urls (package_id, url_id) VALUES ( + (SELECT id FROM packages WHERE name = '" + .package_name + "'), + (SELECT id FROM urls WHERE url = '" + .url + "' AND url_type_id = '" + .type + "')) + ON CONFLICT DO NOTHING;" +] | join("\n") diff --git a/package_managers/homebrew/jq/packages.jq b/package_managers/homebrew/jq/packages.jq new file mode 100644 index 0000000..db44a61 --- /dev/null +++ b/package_managers/homebrew/jq/packages.jq @@ -0,0 +1,13 @@ + +[.[] | + "INSERT INTO packages (name, derived_id, import_id, package_manager_id) VALUES ('" + + # for every single row, extract the name => it's the only key we need from Homebrew + (.name) + "', '" + + # the derived_id is the package manager name + "/" + the package name, which enforces + # uniqueness on the packages table + ("homebrew/" + .name) + "', '" + + # the import_id is the same as the package name (used for joins) + .name + "', '" + + # the package manager ID is passed in as a variable + $package_manager_id + "') ON CONFLICT DO NOTHING;" +] | join("\n") diff --git a/package_managers/homebrew/jq/urls.jq b/package_managers/homebrew/jq/urls.jq new file mode 100644 index 0000000..8f74533 --- /dev/null +++ b/package_managers/homebrew/jq/urls.jq @@ -0,0 +1,19 @@ +# from our sources.json, we're extracting homepage and source: + # homepage is at the main key + # source is inside stable, and it's the tarball + +# for every single row, extract the homepage and source: +[.[] | { + homepage: .homepage, + source: .urls.stable.url +} | to_entries | map({ +# `map` basically explodes the json, creating two rows for each JSON object + name: .key, + url: .value +}) | .[] | +# and here, we can generate our SQL statement! 
+ "INSERT INTO urls (url, url_type_id) VALUES ('" + + .url + "', '" + + if .name == "source" then $source_url_type_id else $homepage_url_type_id end + "') + ON CONFLICT DO NOTHING;" +] | join("\n") diff --git a/package_managers/homebrew/jq/versions.jq b/package_managers/homebrew/jq/versions.jq new file mode 100644 index 0000000..4338034 --- /dev/null +++ b/package_managers/homebrew/jq/versions.jq @@ -0,0 +1,23 @@ +# homebrew has the problem where there are no versions +# we're gonna assume the version available is the latest + +# TODO: `downloads: .analytics.install_on_request."365d".[$name]` +# above gives us the downloads for the last 365 days +# not available in the full JSON API + +# TODO: there are also a problem of versioned formulae + +# TODO: licenses is in source.json, but we need a long-term mapping solution + +[.[] | +.name as $name | +{ + version: .versions.stable, + import_id: .name +} | +"INSERT INTO versions (version, import_id, package_id) VALUES ( + '" + .version + "', + '" + .import_id + "', + (SELECT id FROM packages WHERE import_id = '" + .import_id + "') + ) ON CONFLICT DO NOTHING;" +] | join("\n") diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh new file mode 100755 index 0000000..bb33efb --- /dev/null +++ b/package_managers/homebrew/pipeline.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# Homebrew Pipeline Script +# This script fetches, transforms, and loads Homebrew package data into a PostgreSQL database. + +# Set bash options: +# -e: Exit immediately if a command exits with a non-zero status. +# -x: Print commands and their arguments as they are executed. +# -u: Treat unset variables as an error when substituting. +# -o pipefail: Return value of a pipeline is the status of the last command to exit with a non-zero status. 
+set -exuo pipefail + +# Function to log messages with timestamps +log() { + echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1" +} + +log "Starting Homebrew pipeline script" + +# Fetch required IDs and URLs from the database +log "Fetching required IDs and URLs from the database" +IDS=$(psql "$CHAI_DATABASE_URL" -f sql/homebrew_vars.sql -t -A -F'|') + +# Parse the results +IFS='|' read -r \ + PACKAGE_MANAGER_ID \ + HOMEPAGE_URL_TYPE_ID \ + SOURCE_URL_TYPE_ID \ + BUILD_DEPENDS_ON_TYPE_ID \ + RUNTIME_DEPENDS_ON_TYPE_ID \ + RECOMMENDED_DEPENDS_ON_TYPE_ID \ + OPTIONAL_DEPENDS_ON_TYPE_ID \ + TEST_DEPENDS_ON_TYPE_ID \ + USES_FROM_MACOS_DEPENDS_ON_TYPE_ID <<< "$IDS" + +# Validate that all required IDs are present and export them +required_vars=( + PACKAGE_MANAGER_ID + HOMEPAGE_URL_TYPE_ID + SOURCE_URL_TYPE_ID + BUILD_DEPENDS_ON_TYPE_ID + RUNTIME_DEPENDS_ON_TYPE_ID + RECOMMENDED_DEPENDS_ON_TYPE_ID + OPTIONAL_DEPENDS_ON_TYPE_ID + TEST_DEPENDS_ON_TYPE_ID + USES_FROM_MACOS_DEPENDS_ON_TYPE_ID +) + +for var in "${required_vars[@]}"; do + if [ -z "${!var}" ]; then + log "ERROR: Required variable $var is empty or unset. Exiting." 
+ exit 1 + fi + # shellcheck disable=SC2163 + export "$var" +done + +# Data fetching and processing +if [ "$FETCH" = true ]; then + log "Fetching new data from Homebrew" + + # Create timestamped directory for this run + NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + mkdir -p "$DATA_DIR"/"$NOW" + + # Download source data + log "Downloading source data" + curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json + + # Update 'latest' symlink + ln -sfn "$NOW" "$DATA_DIR"/latest + + # Transform data using jq scripts + log "Transforming data" + for x in "$CODE_DIR"/jq/*.jq; do + filename=$(basename "$x" .jq) + log "Processing $filename" + case "$filename" in + packages) + jq -f "$x" -r \ + --arg package_manager_id "$PACKAGE_MANAGER_ID" \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"$filename".sql + ;; + urls) + jq -f "$x" -r \ + --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \ + --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"$filename".sql + ;; + versions) + jq -f "$x" -r \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"$filename".sql + ;; + package_url) + jq -f "$x" -r \ + --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \ + --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"$filename".sql + ;; + dependencies) + jq -f "$x" -r \ + --arg build_deps_type_id "$BUILD_DEPENDS_ON_TYPE_ID" \ + --arg runtime_deps_type_id "$RUNTIME_DEPENDS_ON_TYPE_ID" \ + --arg recommended_deps_type_id "$RECOMMENDED_DEPENDS_ON_TYPE_ID" \ + --arg optional_deps_type_id "$OPTIONAL_DEPENDS_ON_TYPE_ID" \ + --arg test_deps_type_id "$TEST_DEPENDS_ON_TYPE_ID" \ + --arg uses_from_macos_type_id "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"$filename".sql + ;; + *) + log "Skipping unknown file: $filename" + ;; + esac + done +else + log "Skipping data fetch (FETCH=false)" +fi + +# Load data into database +log "Loading data into 
database" +psql -q "$CHAI_DATABASE_URL" < Config: - file_location = "https://static.crates.io/db-dump.tar.gz" - test = getenv("TEST", "false").lower() == "true" - fetch = getenv("FETCH", "true").lower() == "true" - package_manager = db.select_package_manager_by_name("crates", create=True) - homepage_url = db.select_url_types_homepage(create=True) - repository_url = db.select_url_types_repository(create=True) - documentation_url = db.select_url_types_documentation(create=True) - crates_source = db.select_source_by_name("crates", create=True) - github_source = db.select_source_by_name("github", create=True) - url_types = URLTypes( - homepage=homepage_url.id, - repository=repository_url.id, - documentation=documentation_url.id, - ) - user_types = UserTypes(crates=crates_source.id, github=github_source.id) - - logger.debug("initialized config") - - return Config( - file_location=file_location, - test=test, - fetch=fetch, - package_manager_id=package_manager.id, - url_types=url_types, - user_types=user_types, - ) - - -def fetch(config: Config) -> None: - fetcher = TarballFetcher("crates", config.file_location) - files = fetcher.fetch() - fetcher.write(files) - - -def load(db: DB, transformer: CratesTransformer, config: Config) -> None: - logger.log("loading crates packages...this should take a minute") - db.insert_packages(transformer.packages(), config.package_manager_id, "crates") - logger.log("✅ inserted packages") - - logger.log("loading crates urls...this should take a minute") - db.insert_urls(transformer.urls()) - logger.log("✅ inserted urls") - - logger.log("loading crates package urls...this should take ~3 minutes") - db.insert_package_urls(transformer.package_urls()) - logger.log("✅ inserted package urls") - - logger.log("loading crates versions...this should take ~5 minutes") - db.insert_versions(transformer.versions()) - logger.log("✅ inserted versions") - - logger.log("loading crates users...this should take a minute") - db.insert_users(transformer.users(), 
config.user_types.crates) - logger.log("✅ inserted users") - - logger.log("loading crates user packages...this should take a few seconds") - db.insert_user_packages(transformer.user_packages()) - logger.log("✅ inserted user packages") - - if not config.test: - # these are bigger files, so we skip them in tests - logger.log("loading crates user versions...this should take ~5 minutes") - db.insert_user_versions(transformer.user_versions(), config.user_types.github) - logger.log("✅ inserted user versions") - - logger.log("loading crates dependencies...this should take ~1 hour") - db.insert_dependencies(transformer.dependencies()) - logger.log("✅ inserted dependencies") - - db.insert_load_history(config.package_manager_id) - logger.log("✅ crates") - - -def main(db: DB) -> None: - config = initialize(db) - logger.debug(config) - if config.fetch: - fetch(config) - - transformer = CratesTransformer(config.url_types, config.user_types) - load(db, transformer, config) - - coda = """ - validate by running - `psql "postgresql://postgres:s3cr3t@localhost:5435/chai" \ - -c "SELECT * FROM load_history;"` - """ - logger.log(coda) - - -if __name__ == "__main__": - main() diff --git a/src/pipeline/main.py b/src/pipeline/main.py deleted file mode 100644 index 211b1d9..0000000 --- a/src/pipeline/main.py +++ /dev/null @@ -1,39 +0,0 @@ -import sys -from os import getenv - -from src.pipeline.crates import main as crates_main -from src.pipeline.utils.logger import Logger -from src.pipeline.utils.pg import DB - -logger = Logger("main_pipeline") - - -def main(): - try: - if len(sys.argv) != 2: - raise ValueError("usage: python main.py ") - - if getenv("CHAI_DATABASE_URL") is None: - raise ValueError("CHAI_DATABASE_URL is not set") - - # initialize the db and handoff everywhere - db = DB() - package_manager = sys.argv[1] - - print(f"[main] Running pipeline for {package_manager}...") - - # run the pipeline for the specified package manager - if package_manager == "crates": - print("[main] 
Running crates pipeline...") - crates_main(db) - else: - raise ValueError("invalid package manager") - except Exception: - logger.exception() - sys.exit(1) - - sys.exit(0) - - -if __name__ == "__main__": - main() diff --git a/src/pipeline/pkgx.py b/src/pipeline/pkgx.py deleted file mode 100644 index 1775560..0000000 --- a/src/pipeline/pkgx.py +++ /dev/null @@ -1,58 +0,0 @@ -import json -import os - -from requests import RequestException, get -from requests.exceptions import HTTPError -from src.pipeline.utils.logger import Logger - -# env vars -PANTRY_URL = "https://api.github.com/repos/pkgxdev/pantry" -OUTPUT_DIR = "data/pkgx" - -# setup -headers = { - "Authorization": f"Bearer {os.getenv('GITHUB_TOKEN')}", -} - - -def get_contents(path: str) -> list[dict]: - url = f"{PANTRY_URL}/contents/{path}" - response = get(url, headers=headers) - response.raise_for_status() - return response.json() - - -def get_pkgx_packages(logger: Logger) -> None: - try: - packages = {} - projects = get_contents("projects") - - for project in projects: - logger.debug(f"project: {project}") - if project["type"] == "dir": - project_contents = get_contents(project["path"]) - for item in project_contents: - if item["name"] == "package.yml": - response = get(item["download_url"], headers=headers) - packages[project["name"]] = response.text - - # Ensure output directory exists - os.makedirs(OUTPUT_DIR, exist_ok=True) - - # Write packages to JSON file - output_file = os.path.join(OUTPUT_DIR, "pkgx_packages.json") - with open(output_file, "w") as f: - json.dump(packages, f, indent=2) - - except HTTPError as e: - if e.response.status_code == 404: - logger.error("404, probs bad url") - elif e.response.status_code == 401: - logger.error("401, probs bad token") - raise e - except RequestException as e: - logger.error(f"RequestException: {e}") - raise e - except Exception as e: - logger.error(f"Exception: {e}") - raise e diff --git a/src/run_pipeline.sh b/src/run_pipeline.sh deleted file mode 100755 
index 0b3818e..0000000 --- a/src/run_pipeline.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -# make the data directory -mkdir -p data/{crates,pkgx,homebrew,npm,pypi,rubys} - -# run the pipeline -python -u /src/run_scheduler.py diff --git a/src/run_scheduler.py b/src/run_scheduler.py deleted file mode 100644 index 61f207d..0000000 --- a/src/run_scheduler.py +++ /dev/null @@ -1,52 +0,0 @@ -import schedule -import time -import subprocess -import sys -import os - -PKG_MANAGER = os.getenv("PKG_MANAGER", "crates") -FREQUENCY = int(os.getenv("FREQUENCY", 24)) - - -def run_pipeline(): - # using Popen so we can continuously capture output - process = subprocess.Popen( - [sys.executable, "/src/pipeline/main.py", PKG_MANAGER], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - bufsize=1, - universal_newlines=True, - text=True, - ) - # this is hacky, but ensures we capture all output - while True: - output = process.stdout.readline() - if output == "" and process.poll() is not None: - break - if output: - print(output.strip()) - rc = process.poll() - if rc != 0: - print(process.stderr.read(), file=sys.stderr) - raise Exception(f"Pipeline failed with return code {rc}") - - -def main(): - # make sure we're in the correct directory - os.chdir("/src") - - # schedule - print(f"scheduling pipeline to run every {FREQUENCY} hours...") - schedule.every(FREQUENCY).hours.do(run_pipeline) - - # run now - run_pipeline() - - # keep running - while True: - schedule.run_pending() - time.sleep(1) - - -if __name__ == "__main__": - main()