diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..9679ffb
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,41 @@
+__pycache__/
+*.py[cod]
+*$py.class
+
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+data/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+.python-version
+
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+.pre-commit-config.yaml
+.pre-commit-hooks.yaml
diff --git a/.github/workflows/dev-star-tracker.yml b/.github/workflows/dev-star-tracker.yml
index 3d8483f..ef4d195 100644
--- a/.github/workflows/dev-star-tracker.yml
+++ b/.github/workflows/dev-star-tracker.yml
@@ -4,6 +4,9 @@ on:
   push:
     branches:
       - master
+  pull_request:
+    branches:
+      - '**'
   workflow_dispatch:

 jobs:
@@ -19,4 +22,8 @@ jobs:
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       with:
-        organizations: 'roboflow, autodistill'
+        organizations: 'roboflow, autodistill, voxel51'
+
+    - name: 📊 Show data.csv
+      run: |
+        cat data/data.csv
diff --git a/.github/workflows/star-tracker.yml b/.github/workflows/star-tracker.yml
index ea04c00..ed1fdd4 100644
--- a/.github/workflows/star-tracker.yml
+++ b/.github/workflows/star-tracker.yml
@@ -8,7 +8,7 @@ on:

 permissions:
   contents: write  # Grants push access
-
+
 jobs:
   track-stars:
     runs-on: ubuntu-latest
@@ -22,10 +22,10 @@ jobs:
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       with:
-        organizations: 'roboflow, autodistill'
+        organizations: 'roboflow, autodistill, huggingface, voxel51, ultralytics, Lightning-AI'

     - name: Commit Data
       uses: stefanzweifel/git-auto-commit-action@v4
       with:
         commit_message: Update star data
-        file_pattern: data/data.csv
\ No newline at end of file
+        file_pattern: data/data.csv
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..8c6f8b7
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,33 @@
+
+ci:
+  autofix_prs: true
+  autoupdate_schedule: weekly
+  autofix_commit_msg: "fix(pre_commit): 🎨 auto format pre-commit hooks"
+  autoupdate_commit_msg: "chore(pre_commit): ⬆ pre_commit autoupdate"
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: trailing-whitespace
+        exclude: test/.*\.py
+      - id: check-yaml
+        exclude: mkdocs.yml
+      - id: check-executables-have-shebangs
+      - id: check-toml
+      - id: check-case-conflict
+      - id: check-added-large-files
+      - id: detect-private-key
+      - id: pretty-format-json
+        exclude: demo.ipynb
+        args: ['--autofix', '--no-sort-keys', '--indent=4']
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.7
+    hooks:
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
+      - id: ruff-format
+        types_or: [ python, pyi, jupyter ]
diff --git a/Dockerfile b/Dockerfile
index 81d332d..9048e07 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.9-slim
+FROM python:3.12-slim

 COPY startrack/ /startrack/
 COPY requirements.txt .
diff --git a/README.md b/README.md
index 69fb099..84da4af 100644
--- a/README.md
+++ b/README.md
@@ -2,28 +2,28 @@

 ## 👋 hello

-Star-Track is a user-friendly utility for tracking GitHub repository statistics.
+Star-Track is a user-friendly utility for tracking GitHub repository statistics.

 ## 💻 install

 - clone repository

-    ```bash
-    git clone https://github.com/roboflow/star-track.git
-    ```
-
-- setup python environment and activate it [optional]
+  ```bash
+  git clone https://github.com/roboflow/star-track.git
+  ```

-    ```bash
-    python3 -m venv venv
-    source venv/bin/activate
-    ```
+- setup python environment and activate it \[optional\]
+
+  ```bash
+  python3 -m venv venv
+  source venv/bin/activate
+  ```

 - install required dependencies

-    ```bash
-    pip install -r requirements.txt
-    ```
+  ```bash
+  pip install -r requirements.txt
+  ```

 ## ⚙️ execute

@@ -37,22 +37,22 @@ To test the Docker solution locally, follow these steps:

 1. **Build the Docker Image**

-    ```bash
-    docker build -t startrack:latest .
-    ```
+   ```bash
+   docker build -t startrack:latest .
+   ```

 2. **Run the Docker Container**

-    ```bash
-    docker run --rm \
-    -e GITHUB_TOKEN=your_github_token \
-    -e INPUT_ORGANIZATIONS=org1,org2 \
-    -e INPUT_REPOSITORIES=user1/repo1,user2/repo2 \
-    -v $(pwd)/data:/app/data \
-    startrack:latest
-    ```
+   ```bash
+   docker run --rm \
+   -e GITHUB_TOKEN=your_github_token \
+   -e INPUT_ORGANIZATIONS=org1,org2 \
+   -e INPUT_REPOSITORIES=user1/repo1,user2/repo2 \
+   -v $(pwd)/data:/app/data:z \
+   startrack:latest
+   ```

-### Explanation:
+### Explanation

 - `--rm`: Automatically remove the container when it exits.
 - `-e GITHUB_TOKEN=your_github_token`: Set the `GITHUB_TOKEN` environment variable.
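+
+To run Star-Track on a schedule as a GitHub Action instead of locally, a workflow step along these lines should work (a minimal sketch; `roboflow/star-track@master` is an illustrative ref, so point `uses:` at the tag or commit you actually consume). By default the results are appended to `data/data.csv`.
+
+```yaml
+jobs:
+  track-stars:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Track Repository Stars
+        uses: roboflow/star-track@master  # illustrative ref
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          organizations: 'roboflow, autodistill'
+          repositories: 'user1/repo1,user2/repo2'
+```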
diff --git a/action.yml b/action.yml
index dd66e2a..e7f92b9 100644
--- a/action.yml
+++ b/action.yml
@@ -11,5 +11,5 @@ runs:
   using: 'docker'
   image: 'Dockerfile'
   args:
-    - ${{ inputs.organizations }}
-    - ${{ inputs.repositories }}
\ No newline at end of file
+    - ${{ inputs.organizations }}
+    - ${{ inputs.repositories }}
diff --git a/config.json b/config.json
index 22ff20f..19955d6 100644
--- a/config.json
+++ b/config.json
@@ -2,4 +2,4 @@
   "organizations": [
     "roboflow"
   ]
-}
\ No newline at end of file
+}
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..a37423b
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,121 @@
+[tool.ruff]
+target-version = "py312"
+
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "venv",
+    "yarn-error.log",
+    "yarn.lock",
+    "docs",
+]
+
+line-length = 89
+indent-width = 4
+
+[tool.ruff.lint]
+# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
+select = ["E", "F", "I", "A", "Q", "W", "RUF", "UP", "YTT", "NPY", "ANN", "T", "S", "N", "G", "C", "B"]
+ignore = []
+# Allow autofix for all enabled rules (when `--fix` is provided).
+fixable = [
+    "A",
+    "B",
+    "C",
+    "D",
+    "E",
+    "F",
+    "G",
+    "I",
+    "N",
+    "Q",
+    "S",
+    "T",
+    "W",
+    "ANN",
+    "ARG",
+    "BLE",
+    "COM",
+    "DJ",
+    "DTZ",
+    "EM",
+    "ERA",
+    "EXE",
+    "FBT",
+    "ICN",
+    "INP",
+    "ISC",
+    "NPY",
+    "PD",
+    "PGH",
+    "PIE",
+    "PL",
+    "PT",
+    "PTH",
+    "PYI",
+    "RET",
+    "RSE",
+    "RUF",
+    "SIM",
+    "SLF",
+    "TCH",
+    "TID",
+    "TRY",
+    "UP",
+    "YTT",
+]
+unfixable = []
+# Allow unused variables when underscore-prefixed.
+dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+pylint.max-args = 20
+
+[tool.ruff.lint.flake8-quotes]
+inline-quotes = "double"
+multiline-quotes = "double"
+docstring-quotes = "double"
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
+
+[tool.ruff.lint.mccabe]
+# Flag errors (`C901`) whenever the complexity level exceeds 20.
+max-complexity = 20
+
+[tool.ruff.lint.isort]
+order-by-type = true
+no-sections = false
+
+[tool.ruff.format]
+# Like Black, use double quotes for strings.
+quote-style = "double"
+docstring-code-format = true
+
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
diff --git a/requirements.txt b/requirements.txt
index 5b2f108..69de461 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 pandas
-requests
\ No newline at end of file
+requests
diff --git a/startrack/app.py b/startrack/app.py
index 9a198e9..5f2155a 100644
--- a/startrack/app.py
+++ b/startrack/app.py
@@ -1,27 +1,37 @@
+import concurrent.futures
 import os
 from datetime import datetime
-from typing import List
+from pathlib import Path

 import pandas as pd

-from startrack.config import GITHUB_TOKEN_ENV, INPUT_ORGANIZATIONS_ENV, \
-    INPUT_REPOSITORIES_ENV
+from startrack.config import (
+    GITHUB_TOKEN_ENV,
+    INPUT_ORGANIZATIONS_ENV,
+    INPUT_OUTPUT_FILENAME_ENV,
+    INPUT_OUTPUT_PATH_ENV,
+    INPUT_REPOSITORIES_ENV,
+)
 from startrack.core import (
+    RepositoryData,
     RepositoryType,
-    RepositoryData, fetch_all_organization_repositories,
-    fetch_repository_data_by_full_name, convert_repositories_to_dataframe
+    convert_repositories_to_dataframe,
+    fetch_all_organization_repositories,
+    fetch_repository_data_by_full_name,
 )

 GITHUB_TOKEN = os.environ.get(GITHUB_TOKEN_ENV)
-ORGANIZATIONS = os.environ.get(INPUT_ORGANIZATIONS_ENV, '')
-REPOSITORIES = os.environ.get(INPUT_REPOSITORIES_ENV, '')
-ORGANIZATION_NAMES = [org.strip() for org in ORGANIZATIONS.split(',') if org.strip()]
-REPOSITORY_NAMES = [repo.strip() for repo in REPOSITORIES.split(',') if repo.strip()]
+ORGANIZATIONS = os.environ.get(INPUT_ORGANIZATIONS_ENV, "")
+REPOSITORIES = os.environ.get(INPUT_REPOSITORIES_ENV, "")
+OUTPUT_PATH = os.environ.get(INPUT_OUTPUT_PATH_ENV, "data")
+OUTPUT_FILENAME = os.environ.get(INPUT_OUTPUT_FILENAME_ENV, "data.csv")
+
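+# The action inputs arrive as comma-separated strings (e.g. "roboflow, autodistill"),
+# so split them into lists, trimming whitespace and dropping empty entries.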
+ORGANIZATION_NAMES = [org.strip() for org in ORGANIZATIONS.split(",") if org.strip()]
+REPOSITORY_NAMES = [repo.strip() for repo in REPOSITORIES.split(",") if repo.strip()]


 def save_to_csv(df: pd.DataFrame, directory: str, filename: str) -> None:
-    """
-    Save a DataFrame to a CSV file in the specified directory.
+    """Save a DataFrame to a CSV file in the specified directory.

     Args:
         df (pd.DataFrame): The DataFrame to save.
@@ -35,70 +45,98 @@ def save_to_csv(df: pd.DataFrame, directory: str, filename: str) -> None:
     df.to_csv(file_path)


-def get_all_repositories() -> List[RepositoryData]:
-    """
-    Fetch all repositories from specified organizations and individual repositories.
+def fetch_organization_repositories(organization_name: str) -> list[RepositoryData]:
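+    """Fetch one organization's public repositories as RepositoryData objects."""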
+    repos = fetch_all_organization_repositories(
+        github_token=GITHUB_TOKEN,
+        organization_name=organization_name,
+        repository_type=RepositoryType.PUBLIC,
+    )
+    return [RepositoryData.from_json(repo) for repo in repos]
+
+
+def fetch_individual_repository(repo_full_name: str) -> RepositoryData | None:
+    repo_data = fetch_repository_data_by_full_name(
+        github_token=GITHUB_TOKEN,
+        repository_full_name=repo_full_name,
+    )
+    if repo_data:
+        return RepositoryData.from_json(repo_data)
+    return None
+
+
+def get_all_repositories() -> list[RepositoryData]:
+    """Fetch all repositories from specified organizations and individual repositories.

     Returns:
         List[RepositoryData]: A list of repository data objects.
     """
     all_repositories = []

-    # Fetch repositories from specified organizations
-    for organization_name in ORGANIZATION_NAMES:
-        repos = fetch_all_organization_repositories(
-            github_token=GITHUB_TOKEN,
-            organization_name=organization_name,
-            repository_type=RepositoryType.PUBLIC
-        )
-        all_repositories.extend([RepositoryData.from_json(repo) for repo in repos])
-
-    # Fetch specified repositories
-    for repo_full_name in REPOSITORY_NAMES:
-        repo_data = fetch_repository_data_by_full_name(
-            github_token=GITHUB_TOKEN,
-            repository_full_name=repo_full_name
-        )
-        if repo_data:
-            all_repositories.append(RepositoryData.from_json(repo_data))
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # Fetch repositories from specified organizations in parallel
+        organization_futures = [
+            executor.submit(fetch_organization_repositories, org_name)
+            for org_name in ORGANIZATION_NAMES
+        ]
+
+        # Fetch specified repositories in parallel
+        repository_futures = [
+            executor.submit(fetch_individual_repository, repo_name)
+            for repo_name in REPOSITORY_NAMES
+        ]
+
+        # Collect results from organization futures
+        for future in concurrent.futures.as_completed(organization_futures):
+            all_repositories.extend(future.result())
+
+        # Collect results from repository futures
+        for future in concurrent.futures.as_completed(repository_futures):
+            repo_data = future.result()
+            if repo_data:
+                all_repositories.append(repo_data)

     return all_repositories


 def main() -> None:
-    """
-    Main function to fetch repository data, update the DataFrame, and save it to a CSV
-    file.
-    """
+    """Fetch repository data, update the DataFrame, and save it to a CSV file."""
+
     if not GITHUB_TOKEN:
-        raise ValueError(
+        msg = (
             "`GITHUB_TOKEN` is not set. Please set the `GITHUB_TOKEN` environment "
             "variable."
         )
-    if not ORGANIZATION_NAMES and not REPOSITORY_NAMES:
         raise ValueError(
+            msg,
+        )
+    if not ORGANIZATION_NAMES and not REPOSITORY_NAMES:
+        msg = (
             "Either `ORGANIZATION_NAMES` or `REPOSITORY_NAMES` must be set. Please "
             "provide at least one organization name or repository name."
         )
+        raise ValueError(
+            msg,
+        )

     repositories = get_all_repositories()
-
     df = convert_repositories_to_dataframe(repositories)
-    df = df.set_index('full_name').T
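+    # Transpose so repository full names become the columns; the single
+    # remaining row is then indexed with today's date, one snapshot per run.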
+    df = df.set_index("full_name").T

     current_date = datetime.now().strftime("%Y-%m-%d")
     df.index = [current_date]

     # Load existing data if the file exists
-    file_path = os.path.join('data', 'data.csv')
-    if os.path.exists(file_path):
+    file_path = Path(OUTPUT_PATH) / OUTPUT_FILENAME
+    if file_path.exists():
         existing_df = pd.read_csv(file_path, index_col=0)
         df = pd.concat([existing_df, df])

     save_to_csv(
         df=df,
-        directory='data',
-        filename='data.csv'
+        directory=OUTPUT_PATH,
+        filename=OUTPUT_FILENAME,
     )
diff --git a/startrack/config.py b/startrack/config.py
index 4d6ce65..03ab5d2 100644
--- a/startrack/config.py
+++ b/startrack/config.py
@@ -1,3 +1,7 @@
-GITHUB_TOKEN_ENV = "GITHUB_TOKEN"
+GITHUB_TOKEN_ENV = "GITHUB_TOKEN"  # noqa: S105
 INPUT_ORGANIZATIONS_ENV = "INPUT_ORGANIZATIONS"
 INPUT_REPOSITORIES_ENV = "INPUT_REPOSITORIES"
+
+INPUT_OUTPUT_PATH_ENV = "INPUT_OUTPUT_PATH"
+INPUT_OUTPUT_FILENAME_ENV = "INPUT_OUTPUT_FILENAME"
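+# Timeout in seconds applied to every GitHub API request.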
+HTTP_REQUEST_TIMEOUT = 30
diff --git a/startrack/core.py b/startrack/core.py
index 0948582..344c35d 100644
--- a/startrack/core.py
+++ b/startrack/core.py
@@ -1,29 +1,30 @@
 from dataclasses import dataclass
 from enum import Enum
-from typing import List, Dict, Any
+from typing import Any

 import pandas as pd
 import requests

+from startrack.config import HTTP_REQUEST_TIMEOUT
+

 @dataclass
 class RepositoryData:
-    """
-    Data class for storing repository information.
+    """Data class for storing repository information.

     Attributes:
         full_name (str): The name of the repository.
         star_count (int): The number of stars the repository has.
         fork_count (int): The number of forks the repository has.
     """
+
     full_name: str
     star_count: int
     fork_count: int

     @classmethod
-    def from_json(cls, data: Dict[str, Any]) -> 'RepositoryData':
-        """
-        Create a RepositoryData instance from a JSON dictionary.
+    def from_json(cls: type["RepositoryData"], data: dict[str, Any]) -> "RepositoryData":
+        """Create a RepositoryData instance from a JSON dictionary.

         Args:
             data (Dict[str, Any]): A dictionary containing repository data.
@@ -31,15 +32,14 @@ def from_json(cls, data: Dict[str, Any]) -> 'RepositoryData':
         Returns:
             RepositoryData: An instance of RepositoryData.
         """
-        full_name = data['full_name']
-        star_count = data['stargazers_count']
-        fork_count = data['forks_count']
+        full_name = data["full_name"]
+        star_count = data["stargazers_count"]
+        fork_count = data["forks_count"]
         return cls(full_name=full_name, star_count=star_count, fork_count=fork_count)


 class RepositoryType(Enum):
-    """
-    Enum for specifying types of repositories.
+    """Enum for specifying types of repositories.

     Attributes:
         ALL: Represents all types of repositories.
@@ -61,10 +61,9 @@ class RepositoryType(Enum):
 def fetch_all_organization_repositories(
     github_token: str,
     organization_name: str,
-    repository_type: RepositoryType
-) -> List:
-    """
-    Fetch all repositories of a specified organization.
+    repository_type: RepositoryType,
+) -> list:
+    """Fetch all repositories of a specified organization.

     Args:
         github_token (str): The GitHub personal access token for authentication.
@@ -77,28 +76,31 @@ def fetch_all_organization_repositories(
     """
     all_repositories = []
     page = 1
-    while True:
-        repos = fetch_organization_repositories_by_page(
-            github_token=github_token,
-            organization_name=organization_name,
-            repository_type=repository_type,
-            page=page)
-        if not repos:
-            break
-        all_repositories.extend(repos)
-        page += 1
+    with requests.Session() as session:
+        while True:
+            repos = fetch_organization_repositories_by_page(
+                session=session,
+                github_token=github_token,
+                organization_name=organization_name,
+                repository_type=repository_type,
+                page=page,
+            )
+            if not repos:
+                break
+            all_repositories.extend(repos)
+            page += 1

     return all_repositories


 def fetch_organization_repositories_by_page(
+    session: requests.Session,
     github_token: str,
     organization_name: str,
     repository_type: RepositoryType = RepositoryType.ALL,
-    page: int = 1
-) -> List:
-    """
-    Lists the repositories of a specified GitHub organization based on the repository
+    page: int = 1,
+) -> list:
+    """Lists the repositories of a specified GitHub organization based on the repository
     type and page number.

     Args:
@@ -112,29 +114,32 @@
     Returns:
         List: A list containing details of the organization's repositories.
     """
     headers = {
+        "Accept-Encoding": "gzip",
         "Accept": "application/vnd.github+json",
         "Authorization": f"Bearer {github_token}",
-        "X-GitHub-Api-Version": "2022-11-28"
+        "X-GitHub-Api-Version": "2022-11-28",
     }
     params = {
         "type": repository_type.value,
-        "page": page
+        "page": page,
     }

     url = f"https://api.github.com/orgs/{organization_name}/repos"
-    response = requests.get(url, headers=headers, params=params)
+    response = session.get(
+        url, headers=headers, params=params, timeout=HTTP_REQUEST_TIMEOUT
+    )
     return response.json()


-def convert_repositories_to_dataframe(repositories: List[RepositoryData]) -> pd.DataFrame:
-    """
-    Convert a list of RepositoryData objects into a pandas DataFrame.
+def convert_repositories_to_dataframe(
+    repositories: list[RepositoryData],
+) -> pd.DataFrame:
+    """Convert a list of RepositoryData objects into a pandas DataFrame.

-    Args:
+    Args:
         repositories (List[RepositoryData]): A list of RepositoryData objects.

     Returns:
@@ -142,19 +147,17 @@ def convert_repositories_to_dataframe(repositories: List[RepositoryData]) -> pd.
         for the repository's name and star count.
     """
     data = [
-        {'full_name': repository.full_name, 'star_count': repository.star_count}
-        for repository
-        in repositories
+        {"full_name": repository.full_name, "star_count": repository.star_count}
+        for repository in repositories
     ]
     return pd.DataFrame(data)


 def fetch_repository_data_by_full_name(
     github_token: str,
-    repository_full_name: str
-) -> Dict[str, Any]:
-    """
-    Fetch data for a specific repository by its full name.
+    repository_full_name: str,
+) -> dict[str, Any]:
+    """Fetch data for a specific repository by its full name.

     Args:
         github_token (str): The GitHub personal access token for authentication.
@@ -165,12 +168,12 @@
         successful, otherwise None.
     """
     headers = {
+        "Accept-Encoding": "gzip",
         "Accept": "application/vnd.github+json",
         "Authorization": f"Bearer {github_token}",
     }
     url = f"https://api.github.com/repos/{repository_full_name}"
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
+    response = requests.get(url, headers=headers, timeout=HTTP_REQUEST_TIMEOUT)
+    if response.status_code == requests.codes.OK:
         return response.json()
-    else:
-        return None
+    return None