diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..fd1c0f70 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Lint and Format + +on: + push: + branches: + - "main" + - "release-**" + paths: + - '**.py' + - 'pyproject.toml' + - 'requirements*.txt' + - 'tox.ini' + - 'scripts/*.sh' + - '.github/**' + pull_request: + branches: + - "main" + - "release-**" + paths: + - '**.py' + - 'pyproject.toml' + - 'requirements*.txt' + - 'tox.ini' + - 'scripts/*.sh' + - '.github/**' + +env: + PYTHON_VERSION: 3.11 + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + submodules: true + + - name: Setup Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: 3.11 + cache: pip + cache-dependency-path: | + **/pyproject.toml + **/requirements*.txt + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox + + - name: Run Ruff check + run: | + tox -e ruff -- check + + - name: Run linting + if: success() || failure() + run: | + echo "::add-matcher::.github/workflows/matchers/pylint.json" + tox -e lint diff --git a/.github/workflows/matchers/pylint.json b/.github/workflows/matchers/pylint.json new file mode 100644 index 00000000..2b62078c --- /dev/null +++ b/.github/workflows/matchers/pylint.json @@ -0,0 +1,33 @@ +{ + "problemMatcher": [ + { + "owner": "pylint-error", + "severity": "error", + "pattern": [ + { + "regexp": "^(.+):(\\d+):(\\d+):\\s(([EF]\\d{4}):\\s.+)$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + }, + { + "owner": "pylint-warning", + "severity": "warning", + "pattern": [ + { + "regexp": "^(.+):(\\d+):(\\d+):\\s(([CRW]\\d{4}):\\s.+)$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + } + ] + } + \ No newline at end of 
file diff --git a/isort.cfg b/.isort.cfg similarity index 100% rename from isort.cfg rename to .isort.cfg diff --git a/requirements-dev.txt b/requirements-dev.txt index b0e0f77e..6513dc6b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -# TODO: Uncomment below line once requirements.txt is created -# -r requirements.txt +-r requirements.txt pre-commit>=3.0.4,<4.0 pylint>=2.16.2,<4.0 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..f7be0dc6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 +click>=8.1.7,<9.0.0 +httpx>=0.25.0,<1.0.0 +jinja2 +openai>=1.13.3,<2.0.0 +rouge_score +tqdm>=4.66.2,<5.0.0 \ No newline at end of file diff --git a/src/instructlab_sdg/__init__.py b/src/instructlab_sdg/__init__.py index 3ca8d1f0..8eb177f1 100644 --- a/src/instructlab_sdg/__init__.py +++ b/src/instructlab_sdg/__init__.py @@ -1 +1,2 @@ +# First Party from instructlab_sdg.generate_data import generate_data diff --git a/src/instructlab_sdg/generate_data.py b/src/instructlab_sdg/generate_data.py index d9a3a8b3..4206d4c6 100644 --- a/src/instructlab_sdg/generate_data.py +++ b/src/instructlab_sdg/generate_data.py @@ -1,6 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # Standard +from datetime import datetime +from functools import partial +from pathlib import Path +from typing import Optional import json import multiprocessing import os @@ -8,23 +12,22 @@ import re import string import time -from datetime import datetime -from functools import partial -from pathlib import Path -from typing import Optional -import click -import tqdm -# instructlab - All of these need to go away - issue #6 -from instructlab.config import (DEFAULT_MULTIPROCESSING_START_METHOD, - get_model_family) -from instructlab.utils import (chunk_document, max_seed_example_tokens, - num_chars_from_tokens, read_taxonomy) # Third Party +# instructlab - All of these need to go 
away - issue #6 +from instructlab.config import DEFAULT_MULTIPROCESSING_START_METHOD, get_model_family +from instructlab.utils import ( + chunk_document, + max_seed_example_tokens, + num_chars_from_tokens, + read_taxonomy, +) from jinja2 import Template from rouge_score import rouge_scorer +import click +import tqdm -# Local +# First Party from instructlab_sdg import utils DEFAULT_PROMPT_TEMPLATE_MERLINITE = """\ diff --git a/src/instructlab_sdg/utils.py b/src/instructlab_sdg/utils.py index ab5e9041..75528b6d 100644 --- a/src/instructlab_sdg/utils.py +++ b/src/instructlab_sdg/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Standard +from typing import Optional, Sequence, Union import copy import dataclasses import io @@ -9,14 +10,13 @@ import math import os import sys -from typing import Optional, Sequence, Union -import httpx +# Third Party # instructlab - TODO these need to go away, issue #6 from instructlab.config import DEFAULT_API_KEY, DEFAULT_MODEL_OLD from instructlab.utils import get_sysprompt -# Third Party from openai import OpenAI, OpenAIError +import httpx StrOrOpenAIObject = Union[str, object]