Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into ragbench
Browse files Browse the repository at this point in the history
  • Loading branch information
elronbandel committed Feb 18, 2025
2 parents 384cd93 + 3971079 commit a57746e
Show file tree
Hide file tree
Showing 43 changed files with 342 additions and 242 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/catalog_consistency.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ jobs:
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: True
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: True

steps:
Expand Down
11 changes: 8 additions & 3 deletions .github/workflows/catalog_preparation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:
concurrency:
group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
cancel-in-progress: true

jobs:
preparation:

Expand All @@ -20,6 +20,8 @@ jobs:
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: "True"

strategy:
Expand All @@ -35,8 +37,11 @@ jobs:

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system ".[tests]"
- run: huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}

- name: Hugging Face Login
run: |
for i in {1..5}; do
huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
done
- name: Run Tests
run: |
modulo="${{ matrix.modulo }}"
Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/examples_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ jobs:
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: "True"
WML_URL: ${{ secrets.WML_URL }}
WML_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }}
Expand All @@ -38,7 +40,12 @@ jobs:

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system ".[tests,watsonx,inference_tests]"
- run: huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}

- name: Hugging Face Login
run: |
for i in {1..5}; do
huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
done
- name: Run Tests
run: python -m unittest discover -s tests/examples -p "test_*.py"
73 changes: 41 additions & 32 deletions .github/workflows/inference_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,37 +11,46 @@ on:
types: [published]

concurrency:
group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
cancel-in-progress: true
group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
cancel-in-progress: true

jobs:
inference:

runs-on: ubuntu-latest
env:
OS: ubuntu-latest
UNITXT_DEFAULT_VERBOSITY: error
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
TQDM_DISABLE: "True"
WML_URL: ${{ secrets.WML_URL }}
WML_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }}
WML_APIKEY: ${{ secrets.WML_APIKEY }}
WX_URL: ${{ secrets.WML_URL }}
WX_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }}
WX_API_KEY: ${{ secrets.WML_APIKEY }}
GENAI_KEY: ${{ secrets.GENAI_KEY }}
steps:
- uses: actions/checkout@v4

- uses: actions/setup-python@v5
with:
python-version: '3.10'

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system ".[tests,watsonx,inference-tests]"
- run: huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}

- name: Run Tests
run: python -m unittest discover -s tests/inference -p "test_*.py"
inference:
runs-on: ubuntu-latest
env:
OS: ubuntu-latest
UNITXT_DEFAULT_VERBOSITY: error
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
TQDM_DISABLE: "True"
WML_URL: ${{ secrets.WML_URL }}
WML_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }}
WML_APIKEY: ${{ secrets.WML_APIKEY }}
WX_URL: ${{ secrets.WX_URL }}
WX_PROJECT_ID: ${{ secrets.WX_PROJECT_ID }}
WX_API_KEY: ${{ secrets.WX_API_KEY }}
GENAI_KEY: ${{ secrets.GENAI_KEY }}
steps:
- uses: actions/checkout@v4

- uses: actions/setup-python@v5
with:
python-version: '3.10'

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system ".[tests,watsonx,inference-tests]"
- name: Hugging Face Login
run: |
for i in {1..5}; do
huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
done
- name: Install Ollama
run: curl -fsSL https://ollama.com/install.sh | sh
- name: Pull Llama 3.2:1b model
run: ollama pull llama3.2:1b
- name: Start serving the model
run: nohup ollama serve --model llama3.2:1b --port 5000 &

- name: Run Tests
run: python -m unittest discover -s tests/inference -p "test_*.py"
2 changes: 2 additions & 0 deletions .github/workflows/library_eager_execution_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ jobs:
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: "True"

steps:
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/library_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ jobs:
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: "True"

steps:
Expand Down
11 changes: 7 additions & 4 deletions .github/workflows/performance.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
name: Test Performance

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

Expand All @@ -21,6 +19,8 @@ jobs:
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: "True"
steps:
- uses: actions/checkout@v4
Expand All @@ -35,8 +35,11 @@ jobs:
uv pip install --system ".[tests,watsonx,inference-tests]"
uv pip install --system litellm
uv pip install --system diskcache
huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
- name: Hugging Face Login
run: |
for i in {1..5}; do
huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
done
- name: Prepare the dirs for performance evaluation in main
run: |
mkdir -p performance_action
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,6 @@ src/unitxt/catalog/temp_recipe_name.json
prod_env/*
benchmark_output/*
.litellm_cache

src.lock
docs/_static/data.js
cache
2 changes: 1 addition & 1 deletion performance/compare_benchmark_performance_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
print(line1 + line2 + line3 + line4 + line5 + line6 + line7)
print("\n\n")
# Performance degradation check (15% threshold)
if ratio1 > 1.15:
print("\n**Warning**: Performance degradation in Dataset Generation exceeds 15%!")
print(
"Explore branch performance via 'python performance/bluebench_profiler.py --output_file=<path to json file>',"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl",
},
data_classification_policy = ["public"],
),
preprocess_steps=[
# region Question file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
},
data_classification_policy = ["public"]
),
preprocess_steps=[
"operators.arena_hard_hf_space_processing_steps",
Expand Down Expand Up @@ -69,7 +70,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=15000)
add_to_catalog(
card,
"cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_gpt_4_judge",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
},
data_classification_policy = ["public"]
),
preprocess_steps=[
"operators.arena_hard_hf_space_processing_steps",
Expand Down Expand Up @@ -54,7 +55,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=15000)
add_to_catalog(
card,
"cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_mean_judgment_gpt4_judge",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
},
data_classification_policy = ["public"]
),
preprocess_steps=[
"operators.arena_hard_hf_space_processing_steps",
Expand Down Expand Up @@ -47,7 +48,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=15000)
add_to_catalog(
card,
"cards.arena_hard.response_assessment.pairwise_comparative_rating.first_game_only_gpt_4_judge",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadFromHFSpace
from unitxt.operators import (
Fillna,
FilterByCondition,
InterleaveListsToDialogOperator,
MapInstanceValues,
Expand All @@ -20,10 +21,12 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
data_classification_policy = ["public"],
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
FilterByCondition(values={"turn": 2}, condition="eq"),
Fillna(field="reference", value=None),
FilterByCondition(values={"reference": None}, condition="eq"),
FilterByCondition(
values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
Expand Down Expand Up @@ -55,7 +58,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=15000)
add_to_catalog(
card,
"cards.mt_bench.response_assessment.pairwise_comparison.multi_turn_gpt4_judgement",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadFromHFSpace
from unitxt.operators import (
Fillna,
FilterByCondition,
InterleaveListsToDialogOperator,
MapInstanceValues,
Expand All @@ -20,10 +21,12 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
data_classification_policy = ["public"]
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
FilterByCondition(values={"turn": 2}, condition="eq"),
Fillna(field="reference", value=None),
FilterByCondition(values={"reference": None}, condition="ne"),
FilterByCondition(
values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,7 @@
)
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadFromHFSpace
from unitxt.operators import (
Copy,
FilterByCondition,
MapInstanceValues,
Rename,
)
from unitxt.operators import Copy, Fillna, FilterByCondition, MapInstanceValues, Rename
from unitxt.test_utils.card import test_card

card = TaskCard(
Expand All @@ -20,10 +15,13 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
data_classification_policy = ["public"],

),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
FilterByCondition(values={"turn": 1}, condition="eq"),
Fillna(field="reference", value=None),
FilterByCondition(values={"reference": None}, condition="eq"),
FilterByCondition(
values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,7 @@
)
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadFromHFSpace
from unitxt.operators import (
Copy,
FilterByCondition,
MapInstanceValues,
Rename,
)
from unitxt.operators import Copy, Fillna, FilterByCondition, MapInstanceValues, Rename
from unitxt.test_utils.card import test_card

card = TaskCard(
Expand All @@ -20,10 +15,12 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
data_classification_policy = ["public"]
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
FilterByCondition(values={"turn": 1}, condition="eq"),
Fillna(field="reference", value=None),
FilterByCondition(values={"reference": None}, condition="ne"),
FilterByCondition(
values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadFromHFSpace
from unitxt.operators import (
Fillna,
FilterByCondition,
InterleaveListsToDialogOperator,
Rename,
Expand All @@ -19,10 +20,13 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
},
data_classification_policy = ["public"],

),
preprocess_steps=[
"operators.mt_bench.rating_hf_space_processing_steps",
FilterByCondition(values={"turn": 2}, condition="eq"),
Fillna(field="reference", value=None),
FilterByCondition(values={"reference": None}, condition="eq"),
Rename(field_to_field={"score": "rating", "category": "group"}),
InterleaveListsToDialogOperator(
Expand Down
Loading

0 comments on commit a57746e

Please sign in to comment.