
Commit

concurrent catalog_preparation yml
Signed-off-by: dafnapension <[email protected]>
dafnapension committed Nov 12, 2024
1 parent 13aa641 commit 15e2ba2
Showing 6 changed files with 203 additions and 115 deletions.
10 changes: 9 additions & 1 deletion .github/workflows/catalog_preparation.yml
@@ -18,6 +18,9 @@ jobs:
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
TQDM_DISABLE: "True"

strategy:
matrix:
modulo: [0,1,2,3,4]

steps:
- uses: actions/checkout@v4
@@ -31,4 +34,9 @@ jobs:
- run: huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}

- name: Run Tests
run: python -m unittest tests.catalog.test_preparation
run: |
modulo="${{ matrix.modulo }}"
echo "modulo=${modulo}" >> $GITHUB_STEP_SUMMARY
echo "sed -i 's/^modulo = ./modulo = ${modulo}/' tests/catalog/test_preparation.py" > sedit.sh
sh sedit.sh
python -m unittest tests.catalog.test_preparation
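The matrix above runs the catalog preparation tests as five parallel jobs. Each job rewrites the modulo constant in tests/catalog/test_preparation.py via the generated sed script, so each job executes only every fifth preparation file. A minimal sketch of that sharding logic, using hypothetical file names (an illustration, not code from the repository):

    # Sketch of index-modulo sharding across 5 matrix jobs; file names are hypothetical.
    all_preparation_files = sorted([
        "prepare/cards/a.py",
        "prepare/cards/b.py",
        "prepare/metrics/c.py",
        "prepare/tasks/d.py",
        "prepare/cards/e.py",
        "prepare/cards/f.py",
    ])
    for modulo in range(5):
        shard = [f for i, f in enumerate(all_preparation_files) if i % 5 == modulo]
        print(f"modulo={modulo} -> {shard}")

Sorting the list first keeps the assignment of files to shards deterministic across local and CI runs.
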
30 changes: 30 additions & 0 deletions .github/workflows/docs.yml
@@ -0,0 +1,30 @@
name: Test Docs Compilation

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
compile-docs:

runs-on: ubuntu-latest
env:
OS: ubuntu-latest
PYTHONPATH: ./docs

steps:
- uses: actions/checkout@v4

- uses: actions/setup-python@v5
with:
python-version: '3.9'

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system ".[tests,docs]"

- name: Compile Docs
run: make docs


65 changes: 65 additions & 0 deletions .github/workflows/performance.yml
@@ -0,0 +1,65 @@
name: Test Performance

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
run-performance-tests:

runs-on: ubuntu-latest
env:
OS: ubuntu-latest
UNITXT_DEFAULT_VERBOSITY: error
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
TQDM_DISABLE: "True"

steps:
- uses: actions/checkout@v4

- uses: actions/setup-python@v5
with:
python-version: '3.9'

- name: Install Requirements
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
uv pip install --system -e ".[tests]"
- name: Prepare the dirs for performance evaluation in main
run: |
mkdir -p performance_action
mkdir -p performance_action/logs
echo "" > performance_action/__init__.py
echo " " > performance_action/logs/cards_benchmark.prof
echo " " > performance_action/logs/cards_benchmark.json
cp performance/card_profiler.py performance_action/card_profiler.py
cp performance/compare_performance_results.py performance_action/compare_performance_results.py
- name: Checkout main branch
uses: actions/checkout@v4
with:
ref: main
clean: false

- name: Run performance on main branch
run: |
python performance_action/card_profiler.py --output_file performance_action/main_results.json
- name: Checkout PR branch
uses: actions/checkout@v4
with:
ref: ${{ github.head_ref }}
clean: false

- name: Run performance on PR branch
run: |
python performance_action/card_profiler.py --output_file performance_action/pr_results.json
- name: Compare main and PR performance results
run: |
python performance_action/compare_performance_results.py performance_action/main_results.json performance_action/pr_results.json >> $GITHUB_STEP_SUMMARY
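The two profiling runs write main_results.json and pr_results.json, and compare_performance_results.py prints a comparison that is appended to the step summary. A minimal sketch of such a comparison, assuming each results file is a flat {card name: seconds} mapping — the actual schema used by the repository's script may differ:

    import json
    import sys

    # Load the two result files passed on the command line (main first, PR second).
    with open(sys.argv[1]) as f:
        main_results = json.load(f)
    with open(sys.argv[2]) as f:
        pr_results = json.load(f)

    # Print a Markdown table; the workflow redirects stdout to $GITHUB_STEP_SUMMARY.
    print("| card | main (s) | PR (s) | ratio |")
    print("|---|---:|---:|---:|")
    for name in sorted(set(main_results) & set(pr_results)):
        base, new = main_results[name], pr_results[name]
        ratio = new / base if base else float("inf")
        print(f"| {name} | {base:.2f} | {new:.2f} | {ratio:.2f} |")

Note that both checkout steps set clean: false, so the performance_action directory created before the branch switch survives into the later runs.
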
28 changes: 28 additions & 0 deletions .github/workflows/quality.yml
@@ -0,0 +1,28 @@
name: Verify Code Quality & Security

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
verify-pre-commit-executed:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.9'
- name: Install requirements
run: |
pip install ".[dev]"
- name: Install the pre-commit hooks
run: |
pre-commit install
- name: Validate that all pre-commit verifications ran correctly
run: |
pre-commit run --all-files
- name: Send failure message
if: failure() # This step will only run if a previous step failed
run: echo "The quality & security verification failed. This is likely due to not using pre-commit hooks; please run 'pre-commit install' before any commit."
26 changes: 26 additions & 0 deletions .github/workflows/test_helm.yml
@@ -0,0 +1,26 @@
name: Test HELM Integration

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
test-helm:

runs-on: ubuntu-latest
env:
OS: ubuntu-latest

steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.9'
cache: 'pip' # caching pip dependencies
- run: pip install --upgrade 'crfm-helm[unitxt]>=0.5.3'

- name: Test Helm
run: utils/run_helm.sh

159 changes: 45 additions & 114 deletions tests/catalog/test_preparation.py
@@ -1,8 +1,6 @@
import glob
import os
import time
from collections import defaultdict
from typing import Tuple

from huggingface_hub.utils import GatedRepoError
from unitxt.loaders import MissingKaggleCredentialsError
@@ -22,128 +20,61 @@
)
glob_query = os.path.join(project_dir, "prepare", "**", "*.py")
all_preparation_files = glob.glob(glob_query, recursive=True)
# all_preparation_files = [
# "prepare/cards/claim_stance_topic.py",
# "prepare/cards/clinc_oos.py",
# ]
# Make sure the order in which the tests are run is deterministic
# Having a different order for local testing and github testing may cause diffs in results.
all_preparation_files.sort()
# the following should be one of the residues modulo 5: 0, 1, 2, 3, 4
modulo = 1
all_preparation_files = [
file for i, file in enumerate(all_preparation_files) if i % 5 == modulo
]


class TestCatalogPreparation(UnitxtCatalogPreparationTestCase):
def process_one_file(self, file: str) -> Tuple:
messages = defaultdict(list)
exceptions = defaultdict(list)
elapsed_times = defaultdict(list)
try:
messages[file].append(
def test_preparations(self):
logger.info(glob_query)
logger.critical(f"Testing preparation files: {all_preparation_files}")
times = {}
for file in all_preparation_files:
logger.info(
"\n_____________________________________________\n"
f" Testing preparation file:\n {file}."
"\n_____________________________________________\n"
)
start_time = time.time()
try:
import_module_from_file(file)
except (MissingKaggleCredentialsError, GatedRepoError) as e:
messages[file].append(f"Skipping file {file} due to ignored error {e}")
return (messages, exceptions, elapsed_times)
except OSError as e:
if "You are trying to access a gated repo" in str(e):
messages[file].append(
f"Skipping file {file} due to ignored error {e}"
)
return (messages, exceptions, elapsed_times)
exceptions[file].append(e)
return (messages, exceptions, elapsed_times)
messages[file].append(f"Testing preparation file: {file} passed")

elapsed_times[file] = time.time() - start_time
return (messages, exceptions, elapsed_times)
except Exception as exc:
exceptions[file].append(f"Testing preparation file '{file}' failed: {exc}")
return (messages, exceptions, elapsed_times)
start_time = time.time()
with self.subTest(file=file):
try:
import_module_from_file(file)
except (MissingKaggleCredentialsError, GatedRepoError) as e:
logger.info(f"Skipping file {file} due to ignored error {e}")
continue
except OSError as e:
if "You are trying to access a gated repo" in str(e):
logger.info(
f"Skipping file {file} due to ignored error {e}"
)
continue
raise
logger.info(f"Testing preparation file: {file} passed")
self.assertTrue(True)

def format_time(self, elapsed_time: int) -> str:
minutes = int(elapsed_time // 60)
seconds = int(elapsed_time % 60)
return f"{minutes:02}:{seconds:02}"
elapsed_time = time.time() - start_time
minutes = int(elapsed_time // 60)
seconds = int(elapsed_time % 60)
formatted_time = f"{minutes:02}:{seconds:02}"
logger.info(
"\n_____________________________________________\n"
f" Finished testing preparation file:\n {file}."
f" Preparation Time: {formatted_time}"
"\n_____________________________________________\n"
)

def test_preparations(self):
logger.critical(
f"Testing {len(all_preparation_files)} preparation files: {all_preparation_files}"
)
# Make sure the order in which the tests are run is deterministic
# Having a different order for local testing and github testing may cause diffs in results.
all_preparation_files.sort()
all_messages = defaultdict(list)
all_exceptions = defaultdict(list)
all_elapsed_times = {}
overall_start_time = time.time()
times[file.split("prepare")[-1]] = formatted_time
except Exception as e:
logger.critical(f"Testing preparation file '{file}' failed:")
raise e

for file in all_preparation_files:
try:
data = self.process_one_file(file)
all_messages.update(data[0])
all_exceptions.update(data[1])
all_elapsed_times.update(data[2])
except Exception as exc:
logger.critical(f"{file} generated an exception: {exc}")
all_exceptions[file].append(f"{file} generated an exception: {exc}")

# with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# # Start the load operations and mark each future with its URL
# future_to_file = {
# executor.submit(self.process_one_file, file): file
# for file in all_preparation_files
# }
# for future in concurrent.futures.as_completed(future_to_file):
# file = future_to_file[future]
# try:
# data = future.result()
# all_messages.update(data[0])
# all_exceptions.update(data[1])
# all_elapsed_times.update(data[2])
# except Exception as exc:
# logger.critical(f"{file} generated an exception: {exc}")
# all_exceptions[file].append(f"{file} generated an exception: {exc}")

overall_passed_time = time.time() - overall_start_time
logger.critical(
f"Overall end-to-end time passed: {overall_passed_time} seconds = {self.format_time(overall_passed_time)}."
)
total_time = sum(v for k, v in all_elapsed_times.items())
logger.critical(
f"Sum of running time over all prepare files: {total_time} seconds = {self.format_time(total_time)}"
)
total_card_time = sum(v for k, v in all_elapsed_times.items() if "/cards/" in k)

logger.critical(
f"Sum of running time over all 'prepare/cards' files: {total_card_time} seconds = {self.format_time(total_card_time)}"
)
total_metrics_time = sum(
v for k, v in all_elapsed_times.items() if "/metrics/" in k
)
logger.critical(
f"Sum of running time over all 'prepare/metrics' files: {total_metrics_time} seconds = {self.format_time(total_metrics_time)}"
)
started_but_not_recorded = [
file
for file in all_preparation_files
if file not in all_elapsed_times.keys()
]
logger.critical(
f"Started with {len(all_preparation_files)} prepare files. Recorded times for {len(all_elapsed_times)} files. Did not record times for {len(started_but_not_recorded)} prepare_files: {started_but_not_recorded}."
)
logger.critical("Preparation times table:")
times = dict(
sorted(all_elapsed_times.items(), key=lambda item: item[1], reverse=True)
)
times = {k: self.format_time(v) for k, v in times.items()}
times = dict(sorted(times.items(), key=lambda item: item[1], reverse=True))
print_dict(times, log_level="critical")
if len(all_exceptions) > 0:
logger.critical(
f"The following {len(all_exceptions)} prepare files threw exceptions while executing:"
)
for fn in sorted(all_exceptions.keys()):
logger.critical(f"{fn}: {all_exceptions[fn]}")
raise RuntimeError(
f"{len(all_exceptions)} prepare files failed while executing, see the logs for a detailed list"
)
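The rewrite replaces the per-file helper and aggregated bookkeeping with a single test_preparations method that wraps each file in self.subTest, which attributes any failure to the specific file being processed. A condensed sketch of the general subTest pattern, with hypothetical file names:

    import time
    import unittest

    class TestSubTestPattern(unittest.TestCase):
        def test_files(self):
            for file in ["prepare/cards/a.py", "prepare/cards/b.py"]:  # hypothetical
                start_time = time.time()
                with self.subTest(file=file):
                    # A failure inside this block is recorded against `file`,
                    # and the loop proceeds to the next file.
                    self.assertTrue(file.endswith(".py"))
                elapsed = time.time() - start_time
                print(f"{file}: {int(elapsed // 60):02}:{int(elapsed % 60):02}")

    if __name__ == "__main__":
        unittest.main()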
