From f05402502bdc2637deb47e4d9052bfafc7d888ea Mon Sep 17 00:00:00 2001
From: Sanchit Ram Arvind
Date: Fri, 25 Oct 2024 17:58:18 -0500
Subject: [PATCH] no cache (#19)

---
 README.md                              |  26 +--
 alembic/README.md                      |  56 +++++++
 core/README.md                         |  26 ++-
 core/config.py                         | 223 ++++++++++++-------------
 core/fetcher.py                        |  32 ++--
 core/logger.py                         |   6 +-
 core/structs.py                        |  38 -----
 core/utils.py                          |   8 +
 db/README.md                           | 168 +++++++++++++++++++
 docker-compose.yml                     |   4 +
 package_managers/crates/README.md      |  89 ++++++++++
 package_managers/crates/main.py        |  27 +--
 package_managers/crates/transformer.py |   2 +-
 package_managers/homebrew/README.md    |  62 +++++++
 package_managers/homebrew/pipeline.sh  |  14 +-
 package_managers/homebrew/schedule.sh  |  27 ++-
 16 files changed, 597 insertions(+), 211 deletions(-)
 create mode 100644 alembic/README.md
 delete mode 100644 core/structs.py
 create mode 100644 db/README.md
 create mode 100644 package_managers/crates/README.md

diff --git a/README.md b/README.md
index fc2aa82..d1bc12e 100644
--- a/README.md
+++ b/README.md
@@ -12,30 +12,32 @@ Use [Docker](https://docker.com)
 2. Then, run `docker compose up` to launch.

 > [!NOTE]
+>
 > This will run CHAI for all package managers. As an example, crates by
 > itself will take over an hour and consume >5GB of storage.
 >
-> To run only a specific backend, comment out the others in `docker-compose.yml`.
-
-
-
-> [!NOTE]
 > Currently, we support only two package managers:
 >
 > - crates
 > - Homebrew
 >
-> We are planning on supporting `NPM`, `PyPI`, and `rubygems`
+> You can run a single package manager with
+> `docker compose run --rm -e ... <package_manager>`
+>
+> We are planning on supporting `NPM`, `PyPI`, and `rubygems` next.

 ### Arguments

 Specify these, e.g. `docker compose -e FOO=bar up`:

-- `FREQUENCY`: how frequently **(in hours)** the pipeline will run
-  (defaults to `24`)
-- `FETCH`: whether the pipeline will fetch the data. Defaults to `true`
-- `DEBUG`: whether the pipeline will run in debug mode. Defaults to `true`
+- `FREQUENCY`: Sets how often (in hours) the pipeline should run.
+- `TEST`: Runs the loader in test mode when set to true, skipping certain data
+  insertions.
+- `FETCH`: Determines whether to fetch new data from the source when set to true.
+- `NO_CACHE`: When set to true, deletes temporary files after processing.
+
+> [!NOTE]
+> The `NO_CACHE` flag does not mean that files never get downloaded to your local
+> storage, just that we delete them once we're done with them.

 These arguments are all configurable in the `docker-compose.yml` file.

@@ -66,6 +68,8 @@ Our goal is to build a data schema that looks like this:

 ![db/CHAI_ERD.png](db/CHAI_ERD.png)

+You can read more about specific data models in the db [readme](db/README.md).
+
 Our specific application extracts the dependency graph to understand which pieces of
 the open-source graph are critical. We also built a simple example that displays
 [sbom-metadata](examples/sbom-meta) for your repository.
diff --git a/alembic/README.md b/alembic/README.md
new file mode 100644
index 0000000..efb97ef
--- /dev/null
+++ b/alembic/README.md
@@ -0,0 +1,56 @@
+# CHAI Data Migrations
+
+This directory contains the Alembic configuration and migration scripts for managing
+the database schema of the CHAI project. Alembic is used to handle database
+migrations, allowing for version control of our database schema.
+
+## About Alembic
+
+Alembic is a database migration tool for SQLAlchemy. It allows us to:
+
+- Track changes to our database schema over time
+- Apply and revert these changes in a controlled manner
+- Generate migration scripts automatically based on model changes
+
+> [!NOTE]
+> It's important to note that while `alembic` serves our current needs, it may not be
+> our long-term solution. As the CHAI project evolves, we might explore other database
+> migration tools or strategies that better fit our growing requirements. We're open to
+> reassessing our approach to schema management as needed.
+
+## Entrypoint
+
+The main entrypoint for running migrations is the
+[run migrations script](run_migrations.sh). This script orchestrates the initialization
+and migration process.
+
+## Steps
+
+1. [Initialize](init-script.sql)
+
+The initialization script creates the `chai` database and loads it up with any
+extensions we'd need, so we've got a clean slate for our db structures.
+
+2. [Load](load-values.sql)
+
+The load script pre-populates some of the tables with `enum`-like values, specifically
+for:
+
+- `url_types`: defines different types of URLs (e.g., source, homepage, documentation)
+- `depends_on_types`: defines different types of dependencies (e.g., runtime,
+  development)
+- `sources` and `package_managers`: defines different package managers (e.g., npm, pypi)
+
+3. Run Alembic Migrations
+
+After initialization and loading initial data, the script runs Alembic migrations to
+apply any pending database schema changes.
+
+## Contributing
+
+To contribute to the database schema:
+
+1. Make a change in the [models](../core/models/__init__.py) file
+2. Generate a new migration script: `alembic revision --autogenerate -m "Description"`
+3. Review the generated migration script in the [versions](versions/) directory. The
+   auto-generation is powerful but not perfect, so please review the script carefully.
+4. Test the migration by running `alembic upgrade head`.
diff --git a/core/README.md b/core/README.md
index d77cffd..f27522f 100644
--- a/core/README.md
+++ b/core/README.md
@@ -9,12 +9,18 @@ into the database.

 ### 1. [Config](config.py)

-The Config module provides configuration management for loaders. It includes:
+Config always runs first, and is the entrypoint for all loaders. It includes:

-- `PackageManager` enum for supported package managers
-- `Config` class for storing loader-specific configurations
-- Functions for initializing configurations and loading various types (URL types,
-  user types, package manager IDs, dependency types)
+- Execution flags:
+  - `FETCH` determines whether we request the data from the source
+  - `TEST` enables a test mode, to test specific portions of the pipeline
+  - `NO_CACHE`, when true, deletes the intermediate pipeline files once we're done
+- Package manager flags:
+  - `pm_id` is the database ID of the package manager we're running the pipeline for
+  - `source` is the data source for that package manager. `SOURCES` defines the map.
+
+The next three configuration classes retrieve the IDs for URL types (homepage,
+documentation, etc.), dependency types (build, runtime, etc.), and user types (crates
+user, GitHub user).

 ### 2. [Database](db.py)

@@ -31,6 +37,7 @@ package manager sources. It supports:

 - Downloading tarball files
 - Extracting contents to a specified directory
+- Maintaining a "latest" symlink so we always know where to look

 ### 4. [Logger](logger.py)

@@ -72,12 +79,3 @@ To create a new loader for a package manager:
 Transformer, Scheduler) to fetch, transform, and load data.
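+
+A minimal, hypothetical orchestrator sketch is below. The name `example` and the
+elided transform step are placeholders; the calls mirror the crates loader:
+
+```python
+from core.config import Config
+from core.db import DB
+from core.fetcher import TarballFetcher
+
+
+def run_pipeline(db: DB, config: Config) -> None:
+    fetcher = TarballFetcher("example", config)
+    files = fetcher.fetch()  # honors config.exec_config.fetch
+    fetcher.write(files)
+    # ...transform the raw files and call the db.insert_* methods here...
+    db.insert_load_history(config.pm_config.pm_id)
+    fetcher.cleanup()  # removes data/example when NO_CACHE is true
+```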
 Example usage can be found in the [crates](../package_managers/crates) loader.
-
-## Contributing
-
-When adding new functionality or modifying existing core components, please ensure that
-changes are compatible with all existing loaders and follow the established patterns
-and conventions.
-
-For more detailed information on each component, refer to the individual files and
-their docstrings.
diff --git a/core/config.py b/core/config.py
index 82eb0f4..8a940e7 100644
--- a/core/config.py
+++ b/core/config.py
@@ -1,127 +1,126 @@
-from dataclasses import dataclass
-from os import getenv
+from enum import Enum
+
+from sqlalchemy import UUID

 from core.db import DB
 from core.logger import Logger
-from core.structs import (
-    DependencyTypes,
-    PackageManager,
-    PackageManagerIDs,
-    Sources,
-    URLTypes,
-    UserTypes,
-)
+from core.utils import env_vars

 logger = Logger("config")

-TEST = getenv("TEST", "false").lower() == "true"
-FETCH = getenv("FETCH", "true").lower() == "true"

+class PackageManager(Enum):
+    CRATES = "crates"
+    HOMEBREW = "homebrew"

-@dataclass
-class Config:
-    file_location: str
+
+TEST = env_vars("TEST", "false")
+FETCH = env_vars("FETCH", "true")
+NO_CACHE = env_vars("NO_CACHE", "true")
+SOURCES = {
+    PackageManager.CRATES: "https://static.crates.io/db-dump.tar.gz",
+    PackageManager.HOMEBREW: "https://github.com/Homebrew/homebrew-core/tree/master/Formula",  # noqa
+}
+
+# The three configuration classes URLTypes, DependencyTypes, and UserTypes will query
+# the DB to get the respective IDs. If the values don't exist in the database, they
+# will raise an AttributeError (None has no attribute id) at startup
+
+
+class ExecConf:
     test: bool
     fetch: bool
-    package_manager_id: str
+    no_cache: bool
+
+    def __init__(self) -> None:
+        self.test = TEST
+        self.fetch = FETCH
+        self.no_cache = NO_CACHE
+
+    def __str__(self):
+        return f"ExecConf(test={self.test},fetch={self.fetch},no_cache={self.no_cache})"
+
+
+class PMConf:
+    pm_id: str
+    source: str
+
+    def __init__(self, pm: PackageManager, db: DB):
+        self.pm_id = db.select_package_manager_by_name(pm.value).id
+        self.source = SOURCES[pm]
+
+    def __str__(self):
+        return f"PMConf(pm_id={self.pm_id},source={self.source})"
+
+
+class URLTypes:
+    homepage: UUID
+    repository: UUID
+    documentation: UUID
+    source: UUID
+
+    def __init__(self, db: DB):
+        self.load_url_types(db)
+
+    def load_url_types(self, db: DB) -> None:
+        self.homepage = db.select_url_types_homepage().id
+        self.repository = db.select_url_types_repository().id
+        self.documentation = db.select_url_types_documentation().id
+        self.source = db.select_url_types_source().id
+
+    def __str__(self) -> str:
+        return f"URLs(homepage={self.homepage},repo={self.repository},docs={self.documentation},src={self.source})"  # noqa
+
+
+class UserTypes:
+    crates: UUID
+    github: UUID
+
+    def __init__(self, db: DB):
+        self.crates = db.select_source_by_name("crates").id
+        self.github = db.select_source_by_name("github").id
+
+    def __str__(self) -> str:
+        return f"UserTypes(crates={self.crates},github={self.github})"
+
+
+class DependencyTypes:
+    build: UUID
+    development: UUID
+    runtime: UUID
+    test: UUID
+    optional: UUID
+    recommended: UUID
+
+    def __init__(self, db: DB):
+        self.build = db.select_dependency_type_by_name("build").id
+        self.development = db.select_dependency_type_by_name("development").id
+        self.runtime = db.select_dependency_type_by_name("runtime").id
+        self.test = db.select_dependency_type_by_name("test").id
+        self.optional = db.select_dependency_type_by_name("optional").id
+        self.recommended = db.select_dependency_type_by_name("recommended").id
+
+    def __str__(self) -> str:
+        return f"DependencyTypes(build={self.build},development={self.development},runtime={self.runtime},test={self.test},optional={self.optional},recommended={self.recommended})"  # noqa
+
+
+class Config:
+    exec_config: ExecConf
+    pm_config: PMConf
     url_types: URLTypes
     user_types: UserTypes
     dependency_types: DependencyTypes

+    def __init__(self, pm: PackageManager, db: DB) -> None:
+        self.exec_config = ExecConf()
+        self.pm_config = PMConf(pm, db)
+        self.url_types = URLTypes(db)
+        self.user_types = UserTypes(db)
+        self.dependency_types = DependencyTypes(db)
+
     def __str__(self):
-        return f"Config(file_location={self.file_location}, test={self.test}, \
-            fetch={self.fetch}, package_manager_id={self.package_manager_id}, \
-            url_types={self.url_types}, user_types={self.user_types}, \
-            dependency_types={self.dependency_types})"
-
-
-def load_url_types(db: DB) -> URLTypes:
-    logger.debug("loading url types, and creating if not exists")
-    homepage_url = db.select_url_types_homepage(create=True)
-    repository_url = db.select_url_types_repository(create=True)
-    documentation_url = db.select_url_types_documentation(create=True)
-    source_url = db.select_url_types_source(create=True)
-    return URLTypes(
-        homepage=homepage_url.id,
-        repository=repository_url.id,
-        documentation=documentation_url.id,
-        source=source_url.id,
-    )
-
-
-def load_user_types(db: DB) -> UserTypes:
-    logger.debug("loading user types, and creating if not exists")
-    crates_source = db.select_source_by_name("crates", create=True)
-    github_source = db.select_source_by_name("github", create=True)
-    return UserTypes(
-        crates=crates_source.id,
-        github=github_source.id,
-    )
-
-
-def load_package_manager_ids(db: DB) -> PackageManagerIDs:
-    logger.debug("loading package manager ids, and creating if not exists")
-    crates_package_manager = db.select_package_manager_by_name("crates", create=True)
-    homebrew_package_manager = db.select_package_manager_by_name(
-        "homebrew", create=True
-    )
-    return {
-        PackageManager.CRATES: crates_package_manager.id,
-        PackageManager.HOMEBREW: homebrew_package_manager.id,
-    }
-
-
-def load_dependency_types(db: DB) -> DependencyTypes:
-    logger.debug("loading dependency types, and creating if not exists")
-    build_dep_type = db.select_dependency_type_by_name("build", create=True)
-    dev_dep_type = db.select_dependency_type_by_name("development", create=True)
-    runtime_dep_type = db.select_dependency_type_by_name("runtime", create=True)
-    test_dep_type = db.select_dependency_type_by_name("test", create=True)
-    optional_dep_type = db.select_dependency_type_by_name("optional", create=True)
-    recommended_dep_type = db.select_dependency_type_by_name("recommended", create=True)
-    return DependencyTypes(
-        build=build_dep_type.id,
-        development=dev_dep_type.id,
-        runtime=runtime_dep_type.id,
-        test=test_dep_type.id,
-        optional=optional_dep_type.id,
-        recommended=recommended_dep_type.id,
-    )
-
-
-def load_sources() -> Sources:
-    return {
-        PackageManager.CRATES: "https://static.crates.io/db-dump.tar.gz",
-        PackageManager.HOMEBREW: (
-            "https://github.com/Homebrew/homebrew-core/tree/master/Formula"
-        ),
-    }
-
-
-def initialize(package_manager: PackageManager, db: DB) -> Config:
-    url_types = load_url_types(db)
-    user_types = load_user_types(db)
-    package_manager_ids = load_package_manager_ids(db)
-    dependency_types = load_dependency_types(db)
-    sources = load_sources()
-
-    if package_manager == PackageManager.CRATES:
-        return Config(
-            file_location=sources[PackageManager.CRATES],
-            test=False,
-            fetch=True,
-            package_manager_id=package_manager_ids[PackageManager.CRATES],
-            url_types=url_types,
-            user_types=user_types,
-            dependency_types=dependency_types,
-        )
-    elif package_manager == PackageManager.HOMEBREW:
-        return Config(
-            file_location=sources[PackageManager.HOMEBREW],
-            test=False,
-            fetch=True,
-            package_manager_id=package_manager_ids[PackageManager.HOMEBREW],
-            url_types=url_types,
-            user_types=user_types,
-            dependency_types=dependency_types,
-        )
+        return f"Config(exec_config={self.exec_config}, pm_config={self.pm_config}, url_types={self.url_types}, user_types={self.user_types}, dependency_types={self.dependency_types})"  # noqa
+
+
+if __name__ == "__main__":
+    print(PackageManager.CRATES.value)
diff --git a/core/fetcher.py b/core/fetcher.py
index 8ad975c..8b4afd4 100644
--- a/core/fetcher.py
+++ b/core/fetcher.py
@@ -3,10 +3,12 @@
 from dataclasses import dataclass
 from datetime import datetime
 from io import BytesIO
+from shutil import rmtree
 from typing import Any

 from requests import get

+from core.config import Config
 from core.logger import Logger

@@ -18,11 +20,13 @@ class Data:


 class Fetcher:
-    def __init__(self, name: str, source: str):
+    def __init__(self, name: str, config: Config):
         self.name = name
-        self.source = source
+        self.source = config.pm_config.source
         self.output = f"data/{name}"
         self.logger = Logger(f"{name}_fetcher")
+        self.no_cache = config.exec_config.no_cache
+        self.test = config.exec_config.test
+        self.do_fetch = config.exec_config.fetch

     def write(self, files: list[Data]):
         """generic write function for some collection of files"""

@@ -59,19 +63,25 @@ def update_symlink(self, latest_path: str):
         os.symlink(latest_path, latest_symlink)

     def fetch(self):
+        # guard on the stored FETCH flag; checking `self.fetch` here would test the
+        # bound method itself, which is always truthy
+        if not self.do_fetch:
+            return None
+
         response = get(self.source)
         try:
             response.raise_for_status()
         except Exception as e:
             self.logger.error(f"error fetching {self.source}: {e}")
             raise e

         return response.content

+    def cleanup(self):
+        if self.no_cache:
+            rmtree(self.output, ignore_errors=True)
+            os.makedirs(self.output, exist_ok=True)
+

 class TarballFetcher(Fetcher):
-    def __init__(self, name: str, source: str):
-        super().__init__(name, source)
+    def __init__(self, name: str, config: Config):
+        super().__init__(name, config)

     def fetch(self) -> list[Data]:
         content = super().fetch()
diff --git a/core/logger.py b/core/logger.py
index 4f36114..dd7bf8a 100644
--- a/core/logger.py
+++ b/core/logger.py
@@ -1,10 +1,10 @@
 import sys
 import time
 import traceback
-from os import getenv

-debug = getenv("DEBUG", "false").lower()
-DEBUG = debug == "true" or debug == "1"
+from core.utils import env_vars
+
+DEBUG = env_vars("DEBUG", "false")


 def as_minutes(seconds: float) -> float:
diff --git a/core/structs.py b/core/structs.py
deleted file mode 100644
index d926438..0000000
--- a/core/structs.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from dataclasses import dataclass
-from enum import Enum
-from typing import Dict
-
-from sqlalchemy import UUID
-
-
-class PackageManager(Enum):
-    CRATES = "crates"
-    HOMEBREW = "homebrew"
-
-
-PackageManagerIDs = Dict[PackageManager, UUID]
-Sources = Dict[PackageManager, str]
-
-
-@dataclass
-class URLTypes:
-    homepage: UUID
-    repository: UUID
-    documentation: UUID
-    source: UUID
-
-
-@dataclass
-class UserTypes:
-    crates: UUID
-    github: UUID
-
-
-@dataclass
-class DependencyTypes:
-    build: UUID
-    development: UUID
-    runtime: UUID
-    test: UUID
-    optional: UUID
-    recommended: UUID
diff --git a/core/utils.py b/core/utils.py
index f622cf9..6185e4c 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -1,3 +1,4 @@
+from os import getenv
 from typing import Dict, List

@@ -16,3 +17,10 @@ def build_query_params(
         if item[attr] not in cache:
             params.add(item[attr])
     return list(params)
+
+
+# env vars could be "true" or "1", or anything else -- here's a centralized location
+# to handle that
+def env_vars(env_var: str, default: str) -> bool:
+    var = getenv(env_var, default).lower()
+    return var == "true" or var == "1"
diff --git a/db/README.md b/db/README.md
new file mode 100644
index 0000000..2c68480
--- /dev/null
+++ b/db/README.md
@@ -0,0 +1,168 @@
+# CHAI Data Model
+
+The CHAI data model is designed to represent package manager data in a unified and
+consistent form. The model's goal is _standardization_ of the various complexities and
+idiosyncrasies of each individual package manager. We want to provide a standard basis
+for analysis, querying, and whatever your use case might be.
+
+## Definitions
+
+We use certain nomenclature throughout the codebase:
+
+- `derived_id`: A unique identifier combining the package manager and package name,
+  like `crates/serde`, `homebrew/a2ps`, or `npm/lodash`.
+- `import_id`: The original identifier from the source system, like the `crate_id`
+  integers provided by crates, or the package name provided by Homebrew.
+
+## Core Entities
+
+### Packages
+
+The Package model is a fundamental unit in our system. Each package is uniquely
+identified and associated with a specific package manager.
+
+Key fields:
+
+- `derived_id`
+- `name`
+- `package_manager_id`: Reference to the associated package manager.
+- `import_id`: The original identifier from the source system.
+- `readme`: Optional field for package documentation.
+
+### Versions
+
+Each version is a different release of a package, and **must** be associated with a
+package.
+
+Key fields:
+
+- `package_id`: Reference to the associated package.
+- `version`: The version string.
+- `import_id`: The original identifier from the source system.
+- `size`, `published_at`, `license_id`, `downloads`, `checksum`: Optional metadata
+  fields.
+
+### Users
+
+The User model represents individuals or entities associated with packages. This data
+is not always available, but when it is, it's interesting.
+
+Key fields:
+
+- `username`: The user's name or identifier.
+- `source_id`: Reference to the data source (e.g., GitHub, npm user, crates user, etc.).
+- `import_id`: The original identifier from the source system.
+
+### URLs
+
+The URL model is populated with all the URLs provided by the package manager source
+data - this includes documentation, repository, source, issues, and other URL types as
+well. Each URL is associated with a URL type. The relationships between a URL and a
+Package are captured in the PackageURL model.
+
+Key fields:
+
+- `url`: The URL.
+- `url_type_id`: Reference to the type of URL (e.g., homepage, repository, etc.).
+
+## Type Models
+
+These models define categorizations and types used across the system. All these values
+are loaded by the alembic service, specifically in the
+[load-values.sql](../alembic/versions/load-values.sql) script.
+
+### URLType
+
+Represents different types of URLs associated with packages.
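+
+As a quick sketch, resolving one of these type IDs at load time amounts to a lookup
+like the following (the `url_types` table is populated by load-values.sql; the `name`
+column is an assumption for illustration):
+
+```sql
+SELECT id FROM url_types WHERE name = 'homepage';
+```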
+
+Predefined types (from load-values.sql):
+
+- `source`
+- `homepage`
+- `documentation`
+- `repository`
+
+### DependsOnType
+
+Categorizes different types of dependencies between packages.
+
+Predefined types (from load-values.sql):
+
+- `build`
+- `development`
+- `runtime`
+- `test`
+- `optional`
+- `recommended`
+- `uses_from_macos` (Homebrew only)
+
+### Source
+
+Represents the authoritative sources of package data.
+
+- `crates`
+- `homebrew`
+
+The below are not yet supported:
+
+- `npm`
+- `pypi`
+- `rubygems`
+- `github`
+
+## Relationship Models
+
+These models establish connections between core entities.
+
+### DependsOn
+
+In our data model, a specific release depends on a specific package. We include a field
+`semver_range`, which represents the range of dependency releases compatible with that
+specific release.
+
+> [!NOTE]
+> Not all package managers provide semantic versions. Homebrew does not, for example.
+> This is why `semver_range` is optional.
+>
+> On the other hand, the dependency type is non-optional, and the combination of
+> `version_id`, `dependency_id`, and `dependency_type_id` must be unique.
+
+Key fields:
+
+- `version_id`: The version that has the dependency.
+- `dependency_id`: The package that is depended upon.
+- `dependency_type_id`: The type of dependency.
+- `semver_range`: The version range for the dependency (optional).
+
+### UserVersion and UserPackage
+
+These models associate users with specific versions and packages, respectively.
+
+### PackageURL
+
+Associates packages with their various URLs.
+
+## Caveats
+
+### `Source` and `PackageManager` Relationship
+
+We've chosen to separate `Source` and `PackageManager` into distinct entities:
+
+- `Source`: Represents data sources that can provide information about packages, users,
+  or both.
+- `PackageManager`: Specifically represents sources that are package managers.
+
+For example, 'crates' functions both as a package manager and as a source of user data.
+By keeping these concepts separate, we can accurately represent such systems, and have
+one point where we can modify any information about 'crates'.
+
+## Additional Models
+
+### License
+
+Represents software licenses associated with package versions. A great place to start
+contributing!
+
+### LoadHistory
+
+Tracks the history of data loads for each package manager, useful for auditing and
+incremental updates.
diff --git a/docker-compose.yml b/docker-compose.yml
index 45ec804..05e3deb 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -33,11 +33,14 @@ services:
       dockerfile: ./package_managers/crates/Dockerfile
     environment:
       - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai
+      - NO_CACHE=${NO_CACHE:-false}
       - PYTHONPATH=/
       - DEBUG=${DEBUG:-false}
       - TEST=${TEST:-false}
       - FETCH=${FETCH:-true}
       - FREQUENCY=${FREQUENCY:-24}
+    volumes:
+      - ./data/crates:/data/crates
     depends_on:
       db:
         condition: service_healthy
@@ -50,6 +53,7 @@
       dockerfile: ./package_managers/homebrew/Dockerfile
     environment:
       - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai
+      - NO_CACHE=${NO_CACHE:-false}
       - TEST=${TEST:-false}
       - FETCH=${FETCH:-true}
       - FREQUENCY=${FREQUENCY:-24}
diff --git a/package_managers/crates/README.md b/package_managers/crates/README.md
new file mode 100644
index 0000000..6a81576
--- /dev/null
+++ b/package_managers/crates/README.md
@@ -0,0 +1,89 @@
+# crates
+
+The crates service uses the database dump provided by crates.io and coerces their data
+model into CHAI's. It's containerized using Docker for easy deployment and consistency.
+It's also written in `python` as a first draft, and uses a lot of the
+[core tools](../../core/).
+
+## Getting Started
+
+To just run the crates service, you can use the following commands:
+
+```bash
+docker compose build crates
+docker compose run crates
+```
+
+## Execution Steps
+
+The crates loader goes through the following steps when executed:
+
+1. Initialization: The loader starts by initializing the configuration and database
+   connection.
+2. Fetching: If the `FETCH` flag is set to true, the loader downloads the latest crates
+   data from the configured source.
+3. Transformation: The downloaded data is transformed into a format compatible with the
+   CHAI database schema.
+4. Loading: The transformed data is loaded into the database. This includes:
+   - Packages
+   - Users
+   - User Packages
+   - URLs
+   - Package URLs
+   - Versions
+   - Dependencies
+5. Cleanup: After successful loading, temporary files are cleaned up if the `NO_CACHE`
+   flag is set.
+
+The main execution logic is in the `run_pipeline` function in [main.py](main.py).
+
+```python
+def run_pipeline(db: DB, config: Config) -> None:
+    fetcher = fetch(config)
+    transformer = CratesTransformer(config.url_types, config.user_types)
+    load(db, transformer, config)
+    fetcher.cleanup()
+
+    coda = (
+        "validate by running "
+        + '`psql "postgresql://postgres:s3cr3t@localhost:5435/chai" '
+        + '-c "SELECT * FROM load_history;"`'
+    )
+    logger.log(coda)
+```
+
+### Configuration Flags
+
+The crates loader supports several configuration flags:
+
+- `DEBUG`: Enables debug logging when set to true.
+- `TEST`: Runs the loader in test mode when set to true, skipping certain data
+  insertions.
+- `FETCH`: Determines whether to fetch new data from the source when set to true.
+- `FREQUENCY`: Sets how often (in hours) the pipeline should run.
+- `NO_CACHE`: When set to true, deletes temporary files after processing.
+
+These flags can be set in the `docker-compose.yml` file:
+
+```yaml
+crates:
+  build:
+    context: .
+    dockerfile: ./package_managers/crates/Dockerfile
+  environment:
+    - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai
+    - PYTHONPATH=/
+    - DEBUG=${DEBUG:-false}
+    - TEST=${TEST:-false}
+    - FETCH=${FETCH:-true}
+    - FREQUENCY=${FREQUENCY:-24}
+    - NO_CACHE=${NO_CACHE:-false}
+```
+
+## Notes
+
+- We're reopening the same files multiple times, which is not efficient.
+  - `versions.csv` contains all the `published_by` ids
+  - `crates.csv` contains all the `urls`
+- The cache logic in the database client is super complicated, and needs some better
+  explanation...it does work though.
+- Licenses are non-standardized.
+- Warnings on missing users are because `gh_login` in the source data is non-unique.
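+
+As a usage sketch, a one-off run that fetches fresh data and deletes the intermediate
+files afterwards might look like this (compose reads these variables via the mappings
+shown above):
+
+```bash
+FETCH=true NO_CACHE=true docker compose run --rm crates
+```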
diff --git a/package_managers/crates/main.py b/package_managers/crates/main.py
index 38f98b6..91ada8e 100644
--- a/package_managers/crates/main.py
+++ b/package_managers/crates/main.py
@@ -1,6 +1,6 @@
 import time

-from core.config import Config, PackageManager, initialize
+from core.config import Config, PackageManager
 from core.db import DB
 from core.fetcher import TarballFetcher
 from core.logger import Logger
@@ -8,37 +8,40 @@
 from package_managers.crates.transformer import CratesTransformer

 logger = Logger("crates_orchestrator")
-crates = PackageManager.CRATES


-def fetch(config: Config) -> None:
-    fetcher = TarballFetcher("crates", config.file_location)
+def fetch(config: Config) -> TarballFetcher:
+    fetcher = TarballFetcher("crates", config)
     files = fetcher.fetch()
     fetcher.write(files)
+    return fetcher


 def load(db: DB, transformer: CratesTransformer, config: Config) -> None:
-    db.insert_packages(transformer.packages(), config.package_manager_id, "crates")
-    db.insert_users(transformer.users(), config.user_types.crates)
+    db.insert_packages(
+        transformer.packages(),
+        config.pm_config.pm_id,
+        PackageManager.CRATES.value,
+    )
+    db.insert_users(transformer.users(), config.user_types.github)
     db.insert_user_packages(transformer.user_packages())

-    if not config.test:
+    if not config.exec_config.test:
         db.insert_urls(transformer.urls())
         db.insert_package_urls(transformer.package_urls())
         db.insert_versions(transformer.versions())
         db.insert_user_versions(transformer.user_versions(), config.user_types.github)
         db.insert_dependencies(transformer.dependencies())

-    db.insert_load_history(config.package_manager_id)
+    db.insert_load_history(config.pm_config.pm_id)
     logger.log("✅ crates")


 def run_pipeline(db: DB, config: Config) -> None:
-    if config.fetch:
-        fetch(config)
-
+    fetcher = fetch(config)
     transformer = CratesTransformer(config.url_types, config.user_types)
     load(db, transformer, config)
+    fetcher.cleanup()

     coda = (
         "validate by running "
@@ -50,7 +53,7 @@ def run_pipeline(db: DB, config: Config) -> None:

 def main():
     db = DB()
-    config = initialize(crates, db)
+    config = Config(PackageManager.CRATES, db)
     logger.debug(config)

     scheduler = Scheduler("crates")
diff --git a/package_managers/crates/transformer.py b/package_managers/crates/transformer.py
index 33097fb..f8d0bb3 100644
--- a/package_managers/crates/transformer.py
+++ b/package_managers/crates/transformer.py
@@ -1,7 +1,7 @@
 import csv
 from typing import Dict, Generator

-from core.structs import URLTypes, UserTypes
+from core.config import URLTypes, UserTypes
 from core.transformer import Transformer
 from core.utils import safe_int
 from package_managers.crates.structs import DependencyType
diff --git a/package_managers/homebrew/README.md b/package_managers/homebrew/README.md
index f24c9ea..7b354d3 100644
--- a/package_managers/homebrew/README.md
+++ b/package_managers/homebrew/README.md
@@ -1,5 +1,67 @@
 # Homebrew

+The Homebrew service uses Homebrew's JSON API documentation to build the Homebrew data
+model. It's lightweight -- written in shell scripts, `jq`, and `psql` -- and
+containerized using Docker.
+
+## Getting Started
+
+To just run the Homebrew service, you can use the following commands:
+
+```bash
+docker compose build homebrew
+docker compose run homebrew
+```
+
+## Pipeline Overview
+
+The Homebrew pipeline consists of two main scripts:
+
+- `pipeline.sh`: Responsible for fetching, transforming, and loading Homebrew package
+  data. A sketch of its load pattern follows this list.
+- `schedule.sh`: Handles the scheduling and execution of the pipeline script.
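+
+As a sketch, the sub-select pattern described in the note below looks roughly like
+this (table and column names follow the CHAI data model; the literal values are
+hypothetical, and the real statements are generated from Homebrew's JSON):
+
+```sql
+INSERT INTO user_packages (user_id, package_id)
+SELECT u.id, p.id
+FROM users u, packages p
+WHERE u.username = 'someuser'
+  AND p.derived_id = 'homebrew/a2ps';
+```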
+
+> [!NOTE]
+> The key aspect of `pipeline.sh` to note is how it prepares the sql statements - since
+> our data model is completely normalized, we need to retrieve the IDs for each data
+> model when loading our "edge" data.
+>
+> For example, in the `user_packages` table, we need to know the `user_id` and
+> `package_id` for each record, which happens via a sub-select on each row. It sounds
+> awful, but Homebrew's data is pretty small, so we're not asking the database to do
+> much.
+
+### [`schedule.sh`](schedule.sh)
+
+The `schedule.sh` script sets up and manages the cron job for running the pipeline:
+
+- Creates a cron job based on the `FREQUENCY` environment variable. Defaults to 24 hrs.
+- Runs the pipeline immediately upon startup.
+- Starts the cron daemon and tails the log file.
+
+### [`jq` files](jq/)
+
+The jq files in the [`jq/`](jq/) directory are responsible for transforming the raw
+Homebrew JSON data into SQL statements for insertion into the database. Each file
+corresponds to a specific table or relationship in the database.
+
+To edit the jq files:
+
+- Navigate to the [`jq/`](jq/) directory.
+- Open the desired jq file in a text editor.
+- Modify the jq queries as needed.
+
+> [!NOTE]
+> You can comment using `#` in the jq files!
+
+Key jq files and their purposes:
+
+- [`packages.jq`](jq/packages.jq): Transforms package data.
+- [`urls.jq`](jq/urls.jq): Extracts and formats URL information.
+- [`versions.jq`](jq/versions.jq): Handles version data (currently assumes the latest
+  version).
+- [`package_url.jq`](jq/package_url.jq): Maps packages to their URLs.
+- [`dependencies.jq`](jq/dependencies.jq): Processes dependency information.
+
 ## Notes

 - Homebrew's dependencies are not just restricted to the `{build,test,...}_dependencies`
diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh
index 47ec704..99c9876 100755
--- a/package_managers/homebrew/pipeline.sh
+++ b/package_managers/homebrew/pipeline.sh
@@ -1,14 +1,15 @@
 #!/bin/bash

 # Homebrew Pipeline Script
-# This script fetches, transforms, and loads Homebrew package data into a PostgreSQL database.
+# This script fetches, transforms, and loads Homebrew package data into a
+# PostgreSQL database.

 # Set bash options:
 # -e: Exit immediately if a command exits with a non-zero status.
-# -x: Print commands and their arguments as they are executed.
 # -u: Treat unset variables as an error when substituting.
-# -o pipefail: Return value of a pipeline is the status of the last command to exit with a non-zero status.
-set -uo pipefail
+# -o pipefail: Return value of a pipeline is the status of the last command to exit
+# with a non-zero status.
+set -euo pipefail

 # Function to log messages with timestamps
 log() {
@@ -124,3 +125,8 @@ psql -q "$CHAI_DATABASE_URL" <<EOF
 ...
 EOF
diff --git a/package_managers/homebrew/schedule.sh b/package_managers/homebrew/schedule.sh
--- a/package_managers/homebrew/schedule.sh
+++ b/package_managers/homebrew/schedule.sh
@@ ... @@
-  echo "*/2 * * * * /usr/bin/env CHAI_DATABASE_URL=$CHAI_DATABASE_URL SOURCE=$SOURCE CODE_DIR=$CODE_DIR DATA_DIR=$DATA_DIR FETCH=$FETCH /package_managers/homebrew/pipeline.sh >> /var/log/cron.log 2>&1" > /etc/cron.d/homebrew-cron
+  # In test mode, set the schedule for every two minutes so we can test the scheduling
+  echo "*/2 * * * * /usr/bin/env CHAI_DATABASE_URL=$CHAI_DATABASE_URL SOURCE=$SOURCE CODE_DIR=$CODE_DIR DATA_DIR=$DATA_DIR FETCH=$FETCH NO_CACHE=$NO_CACHE /package_managers/homebrew/pipeline.sh >> /var/log/cron.log 2>&1" > /etc/cron.d/homebrew-cron
 else
-  echo "0 */$FREQUENCY * * * /usr/bin/env CHAI_DATABASE_URL=$CHAI_DATABASE_URL SOURCE=$SOURCE CODE_DIR=$CODE_DIR DATA_DIR=$DATA_DIR FETCH=$FETCH /package_managers/homebrew/pipeline.sh >> /var/log/cron.log 2>&1" > /etc/cron.d/homebrew-cron
+  echo "0 */$FREQUENCY * * * /usr/bin/env CHAI_DATABASE_URL=$CHAI_DATABASE_URL SOURCE=$SOURCE CODE_DIR=$CODE_DIR DATA_DIR=$DATA_DIR FETCH=$FETCH NO_CACHE=$NO_CACHE /package_managers/homebrew/pipeline.sh >> /var/log/cron.log 2>&1" > /etc/cron.d/homebrew-cron
 fi

 # Give execution rights on the cron job
@@ -22,7 +38,8 @@ crontab /etc/cron.d/homebrew-cron
 /package_managers/homebrew/pipeline.sh

 # Start cron
+log "Starting cron"
 cron

 # Tail the log file to keep the container running and show logs
-tail -f /var/log/cron.log
\ No newline at end of file
+tail -f /var/log/cron.log