
Commit

remove start script, add blob backend
fpgmaas committed Jun 21, 2024
1 parent d83bb6a commit d77543b
Showing 16 changed files with 174 additions and 144 deletions.
Binary file added .DS_Store
Binary file not shown.
6 changes: 1 addition & 5 deletions Dockerfile
@@ -22,14 +22,10 @@ RUN poetry install --no-interaction --no-ansi --no-root --no-dev && \
# Copy Python code to the Docker image
COPY pypi_scout /code/pypi_scout/

# Copy the start script and make executable
COPY start.sh /start.sh
RUN chmod +x /start.sh

# Make empty data directory
RUN mkdir -p /code/data

ENV PYTHONPATH=/code

# Use the script as the entrypoint
ENTRYPOINT ["/start.sh"]
CMD ["uvicorn", "pypi_scout.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
6 changes: 1 addition & 5 deletions DockerfileCPU
@@ -23,14 +23,10 @@ RUN pip install --no-cache-dir -r requirements-cpu.txt
# Copy the rest of the application code
COPY pypi_scout /code/pypi_scout/

# Copy the start script and make it executable
COPY start.sh /start.sh
RUN chmod +x /start.sh

# Make empty data directory
RUN mkdir -p /code/data

ENV PYTHONPATH=/code

# Use the script as the entrypoint
ENTRYPOINT ["/start.sh"]
CMD ["uvicorn", "pypi_scout.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
5 changes: 2 additions & 3 deletions docker-compose.yml
@@ -5,11 +5,10 @@ services:
    build:
      context: .
      dockerfile: Dockerfile
    command: uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000
    ports:
      - "8000:8000"
    volumes:
      - ./data:/data
      - ./data:/code/data
    env_file:
      - .env

@@ -18,7 +17,7 @@ services:
      context: ./frontend
      dockerfile: Dockerfile
      args:
        NEXT_PUBLIC_API_URL: http://localhost:8000
        NEXT_PUBLIC_API_URL: http://localhost:8000/api
    ports:
      - "3000:3000"
    depends_on:
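
The frontend's build argument now points at http://localhost:8000/api instead of the bare host, which suggests the backend endpoints are addressed under an /api path. Whether that prefix lives in the FastAPI app itself or in a reverse proxy is not visible in the loaded diff; the snippet below is only a hedged sketch of the FastAPI-side variant.

# Hedged sketch, not part of this commit: one way to serve routes under /api,
# matching the new NEXT_PUBLIC_API_URL. The real route wiring is not shown here.
from fastapi import APIRouter, FastAPI

app = FastAPI()
api_router = APIRouter(prefix="/api")

@api_router.get("/health")
def health() -> dict:
    # Illustrative endpoint only; the actual pypi_scout routes are defined elsewhere.
    return {"status": "ok"}

app.include_router(api_router)
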
2 changes: 1 addition & 1 deletion pypi_scout/api/main.py
@@ -28,7 +28,7 @@
allow_headers=["*"],
)

df = load_dataset(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
df = load_dataset(config)

model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)

33 changes: 29 additions & 4 deletions pypi_scout/api/utils.py
@@ -1,12 +1,37 @@
import logging
from pathlib import Path
import sys

import polars as pl

from pypi_scout.config import Config, StorageBackend
from pypi_scout.utils.blob_io import BlobIO


def load_dataset(config: Config) -> pl.DataFrame:
    dataset_path = config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME

    if dataset_path.exists():
        logging.info(f"Found local dataset. Reading dataset from `{dataset_path}`...")
        df = pl.read_csv(dataset_path)

    elif config.STORAGE_BACKEND == StorageBackend.BLOB:
        logging.info(
            f"Downloading `{config.PROCESSED_DATASET_CSV_NAME}` from container `{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}`..."
        )
        blob_io = BlobIO(
            config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
            config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
            config.STORAGE_BACKEND_BLOB_KEY,
        )
        df = blob_io.download_csv(config.PROCESSED_DATASET_CSV_NAME)
        logging.info("Finished downloading.")

    else:
        logging.error(
            f"Dataset {dataset_path} not found, and config.STORAGE_BACKEND is not `BLOB`, so the dataset cannot be downloaded from Azure. Terminating."
        )
        sys.exit(1)

def load_dataset(path_to_dataset: Path):
    logging.info("Loading the processed dataset...")
    df = pl.read_csv(path_to_dataset)
    logging.info(f"Finished loading the processed dataset. Number of rows: {len(df):,}")
    logging.info(f"The highest weekly downloads in the dataset: {df['weekly_downloads'].max():,}")
    logging.info(f"The lowest weekly downloads in the dataset: {df['weekly_downloads'].min():,}")
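
Since pypi_scout/api/main.py now calls load_dataset(config) (see the change above), here is a minimal usage sketch, assuming only the objects that appear in this diff:

# Hedged usage sketch, not part of this commit.
from dotenv import load_dotenv

from pypi_scout.api.utils import load_dataset
from pypi_scout.config import Config

load_dotenv()              # picks up STORAGE_BACKEND_BLOB_* variables if a .env file is present
config = Config()          # defaults to StorageBackend.LOCAL
df = load_dataset(config)  # local CSV if it exists, otherwise downloaded from Blob storage
print(len(df))
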
10 changes: 8 additions & 2 deletions pypi_scout/config.py
@@ -27,6 +27,9 @@ class Config:
# Dimension of the vector embeddings produced by the model. Should match the output of the model above.
EMBEDDINGS_DIMENSION = 768

# Boolean to overwrite existing files. e.g. re-download the raw dataset, upload processed dataset to blob, etc.
OVERWRITE: bool = True

# Directory where dataset files are stored.
DATA_DIR: Path = Path("data")

@@ -53,7 +56,10 @@ class Config:
WEIGHT_SIMILARITY = 0.8
WEIGHT_WEEKLY_DOWNLOADS = 0.2

# Storage backend
# Storage backend configuration. Can be either StorageBackend.LOCAL or StorageBackend.BLOB.
# If StorageBackend.BLOB, the processed dataset will be uploaded to Blob, and the backend API
# will read the data from there, rather than from a local data directory. In order to use StorageBackend.BLOB,
# the other `STORAGE_BACKEND_BLOB_` variables need to be set as environment variables.
STORAGE_BACKEND: StorageBackend = StorageBackend.LOCAL
STORAGE_BACKEND_BLOB_ACCOUNT_NAME: str | None = None
STORAGE_BACKEND_BLOB_CONTAINER_NAME: str | None = None
@@ -76,4 +82,4 @@ def __post_init__(self) -> None:
self.STORAGE_BACKEND_BLOB_KEY,
]
):
raise OSError("One or more BLOB storage environment variables are missing!")
raise OSError("One or more BLOB storage environment variables are missing!") # noqa: TRY003
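
For reference, a hedged sketch of switching the backend to Blob storage. The diff does not show how Config maps environment variables to these fields, so the variable names below (matching the field names) and the STORAGE_BACKEND value are assumptions.

# Hedged sketch, not part of this commit: the env-var-to-field mapping is assumed.
import os

os.environ["STORAGE_BACKEND"] = "BLOB"                                # assumed name/value
os.environ["STORAGE_BACKEND_BLOB_ACCOUNT_NAME"] = "mystorageaccount"  # placeholder
os.environ["STORAGE_BACKEND_BLOB_CONTAINER_NAME"] = "pypi-scout"      # placeholder
os.environ["STORAGE_BACKEND_BLOB_KEY"] = "<storage-account-key>"      # placeholder

from pypi_scout.config import Config

config = Config()  # __post_init__ raises OSError if any STORAGE_BACKEND_BLOB_ variable is missing
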
pypi_scout/data/reader.py → pypi_scout/data/raw_data_reader.py
@@ -5,7 +5,7 @@


@dataclass
class DataReader:
class RawDataReader:
"""
A class for reading and processing data from a raw PyPI dataset.
"""
61 changes: 0 additions & 61 deletions pypi_scout/scripts/download_dataset.py

This file was deleted.

35 changes: 35 additions & 0 deletions pypi_scout/scripts/download_raw_dataset.py
@@ -0,0 +1,35 @@
import logging

import gdown
from dotenv import load_dotenv

from pypi_scout.config import Config
from pypi_scout.utils.logging import setup_logging


def download_raw_dataset():
    """
    Downloads the dataset from a Google Drive link using the gdown library.
    """
    load_dotenv()
    config = Config()

    target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME
    if target_path.exists():
        if not config.OVERWRITE:
            logging.info(f"🔹 Raw dataset {target_path} from Google Drive already exists! Skipping download.")
            return
        else:
            logging.info(
                f"⤵️ Raw dataset {target_path} from Google Drive exists, but config.OVERWRITE is `true`. Overwriting..."
            )

    logging.info(f"⬇️ Downloading raw dataset from Google Drive to {target_path}...")
    url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}"
    gdown.download(url, str(target_path), quiet=False)
    logging.info("✅ Done!")


if __name__ == "__main__":
    setup_logging()
    download_raw_dataset()
pypi_scout/scripts/process_dataset.py → pypi_scout/scripts/process_raw_dataset.py
@@ -1,19 +1,17 @@
import logging
from pathlib import Path

import polars as pl
from dotenv import load_dotenv

from pypi_scout.config import Config, StorageBackend
from pypi_scout.config import Config
from pypi_scout.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
from pypi_scout.data.reader import DataReader
from pypi_scout.utils.blob_io import BlobIO
from pypi_scout.data.raw_data_reader import RawDataReader
from pypi_scout.utils.logging import setup_logging


def read_raw_dataset(path_to_raw_dataset):
logging.info("📂 Reading the raw dataset...")
df = DataReader(path_to_raw_dataset).read()
df = RawDataReader(path_to_raw_dataset).read()
logging.info("📊 Number of rows in the raw dataset: %s", len(df))
logging.info(f"The highest weekly downloads in the raw dataset: {df['weekly_downloads'].max():,}")
logging.info(f"The lowest weekly downloads in the raw dataset: {df['weekly_downloads'].min():,}")
@@ -44,61 +42,22 @@ def clean_descriptions(df):
return df


def store_processed_dataset_local(df: pl.DataFrame, processed_dataset_path: Path):
def store_processed_dataset(df, processed_dataset_path):
logging.info("Storing the processed dataset...")
df.write_csv(processed_dataset_path)
logging.info("✅ Done!")


def store_processed_dataset_blob(df: pl.DataFrame, blob_io: BlobIO, blob_name: str):
logging.info(f"Storing the processed dataset as {blob_name} in container '{blob_io.container_name}'...")
blob_io.upload_csv(df, blob_name)
logging.info("✅ Done!")


def handle_for_local_backend(config: Config):
if (config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME).exists():
logging.info(f"✔️ Processed dataset {config.PROCESSED_DATASET_CSV_NAME} already exists! Skipping.")
return

df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
if config.FRAC_DATA_TO_INCLUDE < 1.0:
df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
df = clean_descriptions(df)

store_processed_dataset_local(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)


def handle_for_blob_backend(config: Config):
blob_io = BlobIO(
config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
config.STORAGE_BACKEND_BLOB_KEY,
)

if blob_io.exists(config.PROCESSED_DATASET_CSV_NAME):
logging.info(
f"✔️ Raw dataset {config.PROCESSED_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping download."
)
return

def process_raw_dataset():
load_dotenv()
config = Config()
df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
if config.FRAC_DATA_TO_INCLUDE < 1.0:
df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
df = clean_descriptions(df)

store_processed_dataset_blob(df, blob_io, config.PROCESSED_DATASET_CSV_NAME)


def process_dataset():
load_dotenv()
config = Config()
if config.STORAGE_BACKEND == StorageBackend.LOCAL:
handle_for_local_backend(config)
else:
handle_for_blob_backend(config)
store_processed_dataset(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)


if __name__ == "__main__":
setup_logging()
process_dataset()
process_raw_dataset()
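
The blob-handling branch removed above is replaced by a separate upload step, upload_processed_dataset, which setup.py imports below but whose file is not part of the loaded diff. A hedged sketch of what it plausibly does, reusing only the BlobIO calls that appear elsewhere in this commit:

# Hedged sketch, not part of this commit: the real upload_processed_dataset
# module is not shown in the loaded diff; this mirrors the removed
# store_processed_dataset_blob helper.
import logging

import polars as pl
from dotenv import load_dotenv

from pypi_scout.config import Config, StorageBackend
from pypi_scout.utils.blob_io import BlobIO


def upload_processed_dataset():
    load_dotenv()
    config = Config()
    if config.STORAGE_BACKEND != StorageBackend.BLOB:
        logging.info("Storage backend is not BLOB; nothing to upload.")
        return

    blob_io = BlobIO(
        config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
        config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
        config.STORAGE_BACKEND_BLOB_KEY,
    )
    df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
    blob_io.upload_csv(df, config.PROCESSED_DATASET_CSV_NAME)
    logging.info("✅ Done!")
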
20 changes: 16 additions & 4 deletions pypi_scout/scripts/setup.py
@@ -1,18 +1,30 @@
import argparse
import logging

from pypi_scout.scripts.download_dataset import download_dataset
from pypi_scout.scripts.process_dataset import process_dataset
from pypi_scout.scripts.download_raw_dataset import download_raw_dataset
from pypi_scout.scripts.process_raw_dataset import process_raw_dataset
from pypi_scout.scripts.setup_pinecone import setup_pinecone
from pypi_scout.scripts.upload_processed_dataset import upload_processed_dataset
from pypi_scout.scripts.upsert_data import upsert_data
from pypi_scout.utils.logging import setup_logging


def main(no_upsert):
setup_logging()

logging.info("\n\nSETTING UP PINECONE -------------\n")
setup_pinecone()
download_dataset()
process_dataset()

logging.info("\n\nDOWNLOADING RAW DATASET -------------\n")
download_raw_dataset()

logging.info("\n\nPROCESSING RAW DATASET -------------\n")
process_raw_dataset()

logging.info("\n\nUPLOADING PROCESSED DATASET -------------\n")
upload_processed_dataset()
if not no_upsert:
logging.info("\n\nUPSERTING DATA TO PINECONE -------------\n")
upsert_data()


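
main() now takes a no_upsert flag and setup.py imports argparse at the top, but the argument parsing itself falls outside the rendered hunk. A hedged sketch of the likely wiring (the --no-upsert flag name is an assumption):

# Hedged sketch, not part of this commit: the real CLI wiring sits below the shown hunk.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Set up the PyPI Scout dataset and Pinecone index.")
    parser.add_argument("--no-upsert", action="store_true", help="Skip upserting embeddings to Pinecone.")
    args = parser.parse_args()
    main(no_upsert=args.no_upsert)
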
2 changes: 1 addition & 1 deletion pypi_scout/scripts/setup_pinecone.py
@@ -33,7 +33,7 @@ def setup_pinecone():
logging.info("✅ Pinecone index created successfully.")
except PineconeApiException as e:
if e.status == 409:
logging.warning(f"✔️ Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.")
logging.warning(f"🔹 Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.")
else:
logging.exception("❌ An error occurred while creating the Pinecone index.")

