Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into ragbench
Browse files Browse the repository at this point in the history
  • Loading branch information
elronbandel committed Feb 18, 2025
2 parents 384cd93 + 3971079 commit a57746e
Show file tree
Hide file tree
Showing 43 changed files with 342 additions and 242 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/catalog_consistency.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ jobs:
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: True
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: True

steps:
Expand Down
11 changes: 8 additions & 3 deletions .github/workflows/catalog_preparation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:
concurrency:
group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
cancel-in-progress: true

jobs:
preparation:

Expand All @@ -20,6 +20,8 @@ jobs:
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: "True"

strategy:
Expand All @@ -35,8 +37,11 @@ jobs:

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system ".[tests]"
- run: huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}

- name: Hugging Face Login
run: |
for i in {1..5}; do
huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
done
- name: Run Tests
run: |
modulo="${{ matrix.modulo }}"
Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/examples_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ jobs:
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: "True"
WML_URL: ${{ secrets.WML_URL }}
WML_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }}
Expand All @@ -38,7 +40,12 @@ jobs:

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system ".[tests,watsonx,inference_tests]"
- run: huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}

- name: Hugging Face Login
run: |
for i in {1..5}; do
huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
done
- name: Run Tests
run: python -m unittest discover -s tests/examples -p "test_*.py"
73 changes: 41 additions & 32 deletions .github/workflows/inference_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,37 +11,46 @@ on:
types: [published]

concurrency:
group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
cancel-in-progress: true
group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
cancel-in-progress: true

jobs:
inference:

runs-on: ubuntu-latest
env:
OS: ubuntu-latest
UNITXT_DEFAULT_VERBOSITY: error
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
TQDM_DISABLE: "True"
WML_URL: ${{ secrets.WML_URL }}
WML_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }}
WML_APIKEY: ${{ secrets.WML_APIKEY }}
WX_URL: ${{ secrets.WML_URL }}
WX_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }}
WX_API_KEY: ${{ secrets.WML_APIKEY }}
GENAI_KEY: ${{ secrets.GENAI_KEY }}
steps:
- uses: actions/checkout@v4

- uses: actions/setup-python@v5
with:
python-version: '3.10'

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system ".[tests,watsonx,inference-tests]"
- run: huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}

- name: Run Tests
run: python -m unittest discover -s tests/inference -p "test_*.py"
inference:
runs-on: ubuntu-latest
env:
OS: ubuntu-latest
UNITXT_DEFAULT_VERBOSITY: error
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
TQDM_DISABLE: "True"
WML_URL: ${{ secrets.WML_URL }}
WML_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }}
WML_APIKEY: ${{ secrets.WML_APIKEY }}
WX_URL: ${{ secrets.WX_URL }}
WX_PROJECT_ID: ${{ secrets.WX_PROJECT_ID }}
WX_API_KEY: ${{ secrets.WX_API_KEY }}
GENAI_KEY: ${{ secrets.GENAI_KEY }}
steps:
- uses: actions/checkout@v4

- uses: actions/setup-python@v5
with:
python-version: '3.10'

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system ".[tests,watsonx,inference-tests]"
- name: Hugging Face Login
run: |
for i in {1..5}; do
huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
done
- name: Install Ollama
run: curl -fsSL https://ollama.com/install.sh | sh
- name: Pull Llama 3.2:1b model
run: ollama pull llama3.2:1b
- name: Start serving the model
run: nohup ollama serve --model llama3.2:1b --port 5000 &

- name: Run Tests
run: python -m unittest discover -s tests/inference -p "test_*.py"
2 changes: 2 additions & 0 deletions .github/workflows/library_eager_execution_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ jobs:
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: "True"

steps:
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/library_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ jobs:
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: "True"

steps:
Expand Down
11 changes: 7 additions & 4 deletions .github/workflows/performance.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
name: Test Performance

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

Expand All @@ -21,6 +19,8 @@ jobs:
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: "True"
steps:
- uses: actions/checkout@v4
Expand All @@ -35,8 +35,11 @@ jobs:
uv pip install --system ".[tests,watsonx,inference-tests]"
uv pip install --system litellm
uv pip install --system diskcache
huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
- name: Hugging Face Login
run: |
for i in {1..5}; do
huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
done
- name: Prepare the dirs for performance evaluation in main
run: |
mkdir -p performance_action
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,6 @@ src/unitxt/catalog/temp_recipe_name.json
prod_env/*
benchmark_output/*
.litellm_cache

src.lock
docs/_static/data.js
cache
2 changes: 1 addition & 1 deletion performance/compare_benchmark_performance_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
print(line1 + line2 + line3 + line4 + line5 + line6 + line7)
print("\n\n")
# Performance degradation check (15% threshold)
if ratio1 > 1.15:
print("\n**Warning**: Performance degradation in Dataset Generation exceeds 15%!")
print(
"Explore branch performance via 'python performance/bluebench_profiler.py --output_file=<path to json file>',"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl",
},
data_classification_policy = ["public"],
),
preprocess_steps=[
# region Question file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
},
data_classification_policy = ["public"]
),
preprocess_steps=[
"operators.arena_hard_hf_space_processing_steps",
Expand Down Expand Up @@ -69,7 +70,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=15000)
add_to_catalog(
card,
"cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_gpt_4_judge",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
},
data_classification_policy = ["public"]
),
preprocess_steps=[
"operators.arena_hard_hf_space_processing_steps",
Expand Down Expand Up @@ -54,7 +55,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=15000)
add_to_catalog(
card,
"cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_mean_judgment_gpt4_judge",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
"judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
},
data_classification_policy = ["public"]
),
preprocess_steps=[
"operators.arena_hard_hf_space_processing_steps",
Expand Down Expand Up @@ -47,7 +48,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=15000)
add_to_catalog(
card,
"cards.arena_hard.response_assessment.pairwise_comparative_rating.first_game_only_gpt_4_judge",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadFromHFSpace
from unitxt.operators import (
Fillna,
FilterByCondition,
InterleaveListsToDialogOperator,
MapInstanceValues,
Expand All @@ -20,10 +21,12 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
data_classification_policy = ["public"],
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
FilterByCondition(values={"turn": 2}, condition="eq"),
Fillna(field="reference", value=None),
FilterByCondition(values={"reference": None}, condition="eq"),
FilterByCondition(
values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
Expand Down Expand Up @@ -55,7 +58,7 @@
],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
test_card(card, demos_taken_from="test", strict=False, loader_limit=15000)
add_to_catalog(
card,
"cards.mt_bench.response_assessment.pairwise_comparison.multi_turn_gpt4_judgement",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadFromHFSpace
from unitxt.operators import (
Fillna,
FilterByCondition,
InterleaveListsToDialogOperator,
MapInstanceValues,
Expand All @@ -20,10 +21,12 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
data_classification_policy = ["public"]
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
FilterByCondition(values={"turn": 2}, condition="eq"),
Fillna(field="reference", value=None),
FilterByCondition(values={"reference": None}, condition="ne"),
FilterByCondition(
values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,7 @@
)
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadFromHFSpace
from unitxt.operators import (
Copy,
FilterByCondition,
MapInstanceValues,
Rename,
)
from unitxt.operators import Copy, Fillna, FilterByCondition, MapInstanceValues, Rename
from unitxt.test_utils.card import test_card

card = TaskCard(
Expand All @@ -20,10 +15,13 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
data_classification_policy = ["public"],

),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
FilterByCondition(values={"turn": 1}, condition="eq"),
Fillna(field="reference", value=None),
FilterByCondition(values={"reference": None}, condition="eq"),
FilterByCondition(
values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,7 @@
)
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadFromHFSpace
from unitxt.operators import (
Copy,
FilterByCondition,
MapInstanceValues,
Rename,
)
from unitxt.operators import Copy, Fillna, FilterByCondition, MapInstanceValues, Rename
from unitxt.test_utils.card import test_card

card = TaskCard(
Expand All @@ -20,10 +15,12 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
},
data_classification_policy = ["public"]
),
preprocess_steps=[
"operators.mt_bench.pairwise_hf_space_processing_steps",
FilterByCondition(values={"turn": 1}, condition="eq"),
Fillna(field="reference", value=None),
FilterByCondition(values={"reference": None}, condition="ne"),
FilterByCondition(
values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadFromHFSpace
from unitxt.operators import (
Fillna,
FilterByCondition,
InterleaveListsToDialogOperator,
Rename,
Expand All @@ -19,10 +20,13 @@
"model_answer": "data/mt_bench/model_answer/*.jsonl",
"judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
},
data_classification_policy = ["public"],

),
preprocess_steps=[
"operators.mt_bench.rating_hf_space_processing_steps",
FilterByCondition(values={"turn": 2}, condition="eq"),
Fillna(field="reference", value=None),
FilterByCondition(values={"reference": None}, condition="eq"),
Rename(field_to_field={"score": "rating", "category": "group"}),
InterleaveListsToDialogOperator(
Expand Down
Loading

0 comments on commit a57746e

Please sign in to comment.