Merge branch 'main' into add-non-verify-option-to-api-loader

IBM · Feb 17, 2025 · cd379b3 · cd379b3
2 parents 762d1ef + fe79da3
commit cd379b3
Show file tree

Hide file tree

Showing 34 changed files with 249 additions and 188 deletions.
diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
@@ -9,7 +9,7 @@ on:
 concurrency:
     group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
     cancel-in-progress: true
-    
+
 jobs:
   preparation:
 
@@ -35,8 +35,11 @@ jobs:
 
     - run: curl -LsSf https://astral.sh/uv/install.sh | sh
     - run: uv pip install --system ".[tests]"
-    - run: huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
-
+    - name:  Hugging Face Login  # up to 5 login attempts, sleeping 2**i seconds after each failure
+      run: |
+        for i in {1..5}; do
+          huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
+        done  # NOTE(review): the loop's last command is sleep, so the step exits 0 even if every attempt failed
     - name: Run Tests
       run: |
         modulo="${{ matrix.modulo }}"

diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml
@@ -38,7 +38,12 @@ jobs:
 
      - run: curl -LsSf https://astral.sh/uv/install.sh | sh
      - run: uv pip install --system ".[tests,watsonx,inference_tests]"
-     - run: huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
+
+     - name: Hugging Face Login  # up to 5 login attempts, sleeping 2**i seconds after each failure
+       run: |
+         for i in {1..5}; do
+           huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
+         done  # NOTE(review): the loop's last command is sleep, so the step exits 0 even if every attempt failed
 
      - name: Run Tests
        run: python -m unittest discover -s tests/examples -p "test_*.py"
diff --git a/.github/workflows/inference_tests.yml b/.github/workflows/inference_tests.yml
@@ -41,7 +41,10 @@ jobs:
 
      - run: curl -LsSf https://astral.sh/uv/install.sh | sh
      - run: uv pip install --system ".[tests,watsonx,inference-tests]"
-     - run: huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
-
+     - name: Hugging Face Login  # up to 5 login attempts, sleeping 2**i seconds after each failure
+       run: |
+         for i in {1..5}; do
+           huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
+         done  # NOTE(review): the loop's last command is sleep, so the step exits 0 even if every attempt failed
      - name: Run Tests
        run: python -m unittest discover -s tests/inference -p "test_*.py"
diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml
@@ -1,8 +1,6 @@
 name: Test Performance
 
 on:
-  push:
-    branches: [ main ]
   pull_request:
     branches: [ main ]
 
@@ -35,8 +33,11 @@ jobs:
         uv pip install --system ".[tests,watsonx,inference-tests]"
         uv pip install --system litellm
         uv pip install --system diskcache
-        huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
-
+    - name:  Hugging Face Login  # up to 5 login attempts, sleeping 2**i seconds after each failure
+      run: |
+        for i in {1..5}; do
+          huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
+        done  # NOTE(review): the loop's last command is sleep, so the step exits 0 even if every attempt failed
     - name: Prepare the dirs for performance evaluation in main
       run: |
         mkdir -p performance_action

diff --git a/.gitignore b/.gitignore
@@ -158,6 +158,6 @@ src/unitxt/catalog/temp_recipe_name.json
 prod_env/*
 benchmark_output/*
 .litellm_cache
-
+src.lock
 docs/_static/data.js
 cache
diff --git a/prepare/cards/arena_hard/generation/english_gpt-4-0314_reference.py b/prepare/cards/arena_hard/generation/english_gpt-4-0314_reference.py
@@ -21,6 +21,7 @@
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl",
         },
+        data_classification_policy = ["public"],
     ),
     preprocess_steps=[
         # region Question file

diff --git a/...cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py b/...cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py
@@ -22,6 +22,7 @@
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
             "judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
         },
+        data_classification_policy = ["public"]
     ),
     preprocess_steps=[
         "operators.arena_hard_hf_space_processing_steps",
@@ -69,7 +70,7 @@
     ],
 )
 
-test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
+test_card(card, demos_taken_from="test", strict=False, loader_limit=15000)
 add_to_catalog(
     card,
     "cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_gpt_4_judge",

diff --git a/...rd/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py b/...rd/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py
@@ -23,6 +23,7 @@
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
             "judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
         },
+        data_classification_policy = ["public"]
     ),
     preprocess_steps=[
         "operators.arena_hard_hf_space_processing_steps",
@@ -54,7 +55,7 @@
     ],
 )
 
-test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
+test_card(card, demos_taken_from="test", strict=False, loader_limit=15000)
 add_to_catalog(
     card,
     "cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_mean_judgment_gpt4_judge",

diff --git a/.../arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py b/.../arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py
@@ -20,6 +20,7 @@
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
             "judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl",
         },
+        data_classification_policy = ["public"]
     ),
     preprocess_steps=[
         "operators.arena_hard_hf_space_processing_steps",
@@ -47,7 +48,7 @@
     ],
 )
 
-test_card(card, demos_taken_from="test", strict=False, loader_limit=100000)
+test_card(card, demos_taken_from="test", strict=False, loader_limit=15000)
 add_to_catalog(
     card,
     "cards.arena_hard.response_assessment.pairwise_comparative_rating.first_game_only_gpt_4_judge",

diff --git a/prepare/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_gpt4_judgement.py b/prepare/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_gpt4_judgement.py
@@ -4,6 +4,7 @@
 from unitxt.catalog import add_to_catalog
 from unitxt.loaders import LoadFromHFSpace
 from unitxt.operators import (
+    Fillna,
     FilterByCondition,
     InterleaveListsToDialogOperator,
     MapInstanceValues,
@@ -20,10 +21,12 @@
             "model_answer": "data/mt_bench/model_answer/*.jsonl",
             "judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
         },
+        data_classification_policy = ["public"],
     ),
     preprocess_steps=[
         "operators.mt_bench.pairwise_hf_space_processing_steps",
         FilterByCondition(values={"turn": 2}, condition="eq"),
+        Fillna(field="reference", value=None),
         FilterByCondition(values={"reference": None}, condition="eq"),
         FilterByCondition(
             values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
@@ -55,7 +58,7 @@
     ],
 )
 
-test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
+test_card(card, demos_taken_from="test", strict=False, loader_limit=15000)
 add_to_catalog(
     card,
     "cards.mt_bench.response_assessment.pairwise_comparison.multi_turn_gpt4_judgement",

diff --git a/...bench/response_assessment/pairwise_comparison/multi_turn_with_reference_gpt4_judgement.py b/...bench/response_assessment/pairwise_comparison/multi_turn_with_reference_gpt4_judgement.py
@@ -4,6 +4,7 @@
 from unitxt.catalog import add_to_catalog
 from unitxt.loaders import LoadFromHFSpace
 from unitxt.operators import (
+    Fillna,
     FilterByCondition,
     InterleaveListsToDialogOperator,
     MapInstanceValues,
@@ -20,10 +21,12 @@
             "model_answer": "data/mt_bench/model_answer/*.jsonl",
             "judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
         },
+        data_classification_policy = ["public"]
     ),
     preprocess_steps=[
         "operators.mt_bench.pairwise_hf_space_processing_steps",
         FilterByCondition(values={"turn": 2}, condition="eq"),
+        Fillna(field="reference", value=None),
         FilterByCondition(values={"reference": None}, condition="ne"),
         FilterByCondition(
             values={"winner": ["model_1", "tie", "model_2"]}, condition="in"

diff --git a/prepare/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_gpt4_judgement.py b/prepare/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_gpt4_judgement.py
@@ -3,12 +3,7 @@
 )
 from unitxt.catalog import add_to_catalog
 from unitxt.loaders import LoadFromHFSpace
-from unitxt.operators import (
-    Copy,
-    FilterByCondition,
-    MapInstanceValues,
-    Rename,
-)
+from unitxt.operators import Copy, Fillna, FilterByCondition, MapInstanceValues, Rename
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
@@ -20,10 +15,13 @@
             "model_answer": "data/mt_bench/model_answer/*.jsonl",
             "judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
         },
+        data_classification_policy = ["public"],
+
     ),
     preprocess_steps=[
         "operators.mt_bench.pairwise_hf_space_processing_steps",
         FilterByCondition(values={"turn": 1}, condition="eq"),
+        Fillna(field="reference", value=None),
         FilterByCondition(values={"reference": None}, condition="eq"),
         FilterByCondition(
             values={"winner": ["model_1", "tie", "model_2"]}, condition="in"

diff --git a/...ench/response_assessment/pairwise_comparison/single_turn_with_reference_gpt4_judgement.py b/...ench/response_assessment/pairwise_comparison/single_turn_with_reference_gpt4_judgement.py
@@ -3,12 +3,7 @@
 )
 from unitxt.catalog import add_to_catalog
 from unitxt.loaders import LoadFromHFSpace
-from unitxt.operators import (
-    Copy,
-    FilterByCondition,
-    MapInstanceValues,
-    Rename,
-)
+from unitxt.operators import Copy, Fillna, FilterByCondition, MapInstanceValues, Rename
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
@@ -20,10 +15,12 @@
             "model_answer": "data/mt_bench/model_answer/*.jsonl",
             "judgment": "data/mt_bench/model_judgment/gpt-4_pair.jsonl",
         },
+        data_classification_policy = ["public"]
     ),
     preprocess_steps=[
         "operators.mt_bench.pairwise_hf_space_processing_steps",
         FilterByCondition(values={"turn": 1}, condition="eq"),
+        Fillna(field="reference", value=None),
         FilterByCondition(values={"reference": None}, condition="ne"),
         FilterByCondition(
             values={"winner": ["model_1", "tie", "model_2"]}, condition="in"

diff --git a/prepare/cards/mt_bench/response_assessment/rating/multi_turn_gpt4_judgement.py b/prepare/cards/mt_bench/response_assessment/rating/multi_turn_gpt4_judgement.py
@@ -4,6 +4,7 @@
 from unitxt.catalog import add_to_catalog
 from unitxt.loaders import LoadFromHFSpace
 from unitxt.operators import (
+    Fillna,
     FilterByCondition,
     InterleaveListsToDialogOperator,
     Rename,
@@ -19,10 +20,13 @@
             "model_answer": "data/mt_bench/model_answer/*.jsonl",
             "judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
         },
+        data_classification_policy = ["public"],
+
     ),
     preprocess_steps=[
         "operators.mt_bench.rating_hf_space_processing_steps",
         FilterByCondition(values={"turn": 2}, condition="eq"),
+        Fillna(field="reference", value=None),
         FilterByCondition(values={"reference": None}, condition="eq"),
         Rename(field_to_field={"score": "rating", "category": "group"}),
         InterleaveListsToDialogOperator(

diff --git a/...are/cards/mt_bench/response_assessment/rating/multi_turn_with_reference_gpt4_judgement.py b/...are/cards/mt_bench/response_assessment/rating/multi_turn_with_reference_gpt4_judgement.py
@@ -4,6 +4,7 @@
 from unitxt.catalog import add_to_catalog
 from unitxt.loaders import LoadFromHFSpace
 from unitxt.operators import (
+    Fillna,
     FilterByCondition,
     InterleaveListsToDialogOperator,
     Rename,
@@ -19,10 +20,12 @@
             "model_answer": "data/mt_bench/model_answer/*.jsonl",
             "judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
         },
+        data_classification_policy = ["public"]
     ),
     preprocess_steps=[
         "operators.mt_bench.rating_hf_space_processing_steps",
         FilterByCondition(values={"turn": 2}, condition="eq"),
+        Fillna(field="reference", value=None),
         FilterByCondition(values={"reference": None}, condition="ne"),
         Rename(field_to_field={"score": "rating", "category": "group"}),
         InterleaveListsToDialogOperator(

diff --git a/prepare/cards/mt_bench/response_assessment/rating/single_turn_gpt4_judgement.py b/prepare/cards/mt_bench/response_assessment/rating/single_turn_gpt4_judgement.py
@@ -3,7 +3,7 @@
 )
 from unitxt.catalog import add_to_catalog
 from unitxt.loaders import LoadFromHFSpace
-from unitxt.operators import Copy, FilterByCondition, Rename
+from unitxt.operators import Copy, Fillna, FilterByCondition, Rename
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
@@ -15,10 +15,12 @@
             "model_answer": "data/mt_bench/model_answer/*.jsonl",
             "judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
         },
+        data_classification_policy = ["public"],
     ),
     preprocess_steps=[
         "operators.mt_bench.rating_hf_space_processing_steps",
         FilterByCondition(values={"turn": 1}, condition="eq"),
+        Fillna(field="reference", value=None),
         FilterByCondition(values={"reference": None}, condition="eq"),
         Rename(
             field_to_field={

diff --git a/...re/cards/mt_bench/response_assessment/rating/single_turn_with_reference_gpt4_judgement.py b/...re/cards/mt_bench/response_assessment/rating/single_turn_with_reference_gpt4_judgement.py
@@ -3,11 +3,7 @@
 )
 from unitxt.catalog import add_to_catalog
 from unitxt.loaders import LoadFromHFSpace
-from unitxt.operators import (
-    Copy,
-    FilterByCondition,
-    Rename,
-)
+from unitxt.operators import Copy, Fillna, FilterByCondition, Rename
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
@@ -19,10 +15,12 @@
             "model_answer": "data/mt_bench/model_answer/*.jsonl",
             "judgment": "data/mt_bench/model_judgment/gpt-4_single.jsonl",
         },
+        data_classification_policy = ["public"]
     ),
     preprocess_steps=[
         "operators.mt_bench.rating_hf_space_processing_steps",
         FilterByCondition(values={"turn": 1}, condition="eq"),
+        Fillna(field="reference", value=None),
         FilterByCondition(values={"reference": None}, condition="ne"),
         Rename(
             field_to_field={

diff --git a/src/unitxt/catalog/cards/arena_hard/generation/english_gpt_4_0314_reference.json b/src/unitxt/catalog/cards/arena_hard/generation/english_gpt_4_0314_reference.json
@@ -7,7 +7,10 @@
         "data_files": {
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
-        }
+        },
+        "data_classification_policy": [
+            "public"
+        ]
     },
     "preprocess_steps": [
         {

diff --git a/...ds/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json b/...ds/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json
@@ -8,7 +8,10 @@
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
             "judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl"
-        }
+        },
+        "data_classification_policy": [
+            "public"
+        ]
     },
     "preprocess_steps": [
         "operators.arena_hard_hf_space_processing_steps",

diff --git a/.../response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json b/.../response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json
@@ -8,7 +8,10 @@
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
             "judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl"
-        }
+        },
+        "data_classification_policy": [
+            "public"
+        ]
     },
     "preprocess_steps": [
         "operators.arena_hard_hf_space_processing_steps",

diff --git a/...ena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json b/...ena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json
@@ -8,7 +8,10 @@
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
             "judgment": "data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/*.jsonl"
-        }
+        },
+        "data_classification_policy": [
+            "public"
+        ]
     },
     "preprocess_steps": [
         "operators.arena_hard_hf_space_processing_steps",