Merge branch 'main' into new-text2sql-metrics-scores
elronbandel authored Feb 18, 2025
2 parents e694a16 + b47f4fe commit 3fe6200
Showing 403 changed files with 1,298 additions and 461 deletions.
34 changes: 34 additions & 0 deletions examples/evaluate_torr.py
@@ -0,0 +1,34 @@
from unitxt import evaluate, load_dataset, settings
from unitxt.inference import (
    CrossProviderInferenceEngine,
)

with settings.context(
    allow_unverified_code=True,
    mock_inference_mode=True,
):
    test_dataset = load_dataset(
        "benchmarks.torr",
        split="test",
        use_cache=True,
    )
    # Infer
    model = CrossProviderInferenceEngine(
        model="llama-3-8b-instruct",
        max_tokens=30,
    )
    """
    We are using a CrossProviderInferenceEngine that supplies API access to providers such as
    watsonx, bam, openai, azure, aws, and more.
    For the arguments these inference engines can receive, please refer to the classes' documentation,
    or read about the OpenAI API arguments that CrossProviderInferenceEngine follows.
    """

    predictions = model(test_dataset)
    results = evaluate(predictions=predictions, data=test_dataset)

    print("Global scores:")
    print(results.global_scores.summary)
    print("Subsets scores:")
    print(results.subsets_scores.summary)
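Note: the example above runs with mock_inference_mode=True, so it needs no provider credentials and the predictions are mocked rather than produced by a real model. Below is a minimal sketch of the same flow against a live provider (assuming provider credentials are configured for unitxt; the explicit provider choice is illustrative and not part of this commit):

from unitxt import evaluate, load_dataset, settings
from unitxt.inference import CrossProviderInferenceEngine

with settings.context(allow_unverified_code=True):  # no mock_inference_mode, so real calls are made
    test_dataset = load_dataset("benchmarks.torr", split="test", use_cache=True)
    model = CrossProviderInferenceEngine(
        model="llama-3-8b-instruct",
        max_tokens=30,
        provider="watsonx",  # assumption: any provider supported by the engine works here
    )
    predictions = model(test_dataset)
    results = evaluate(predictions=predictions, data=test_dataset)
    print(results.global_scores.summary)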
@@ -10,12 +10,12 @@
tables_benchmark_dir = os.path.join(
    constants.catalog_dir,
    "recipes",
-    "tables_benchmark",
+    "torr",
)


# Recursive function to build nested benchmarks
-def build_nested_benchmark(dir_path, prefix="recipes.tables_benchmark"):
+def build_nested_benchmark(dir_path, prefix="recipes.torr"):
    nested_scenarios = OrderedDict()

    for entry in sorted(os.listdir(dir_path)):
@@ -44,7 +44,7 @@ def build_nested_benchmark(dir_path, prefix="recipes.tables_benchmark"):
benchmark = Benchmark(
    tables_benchmark_scenarios.subsets,
    __description__=(
-        "TablesBenchmark is an open-source benchmark developed by domain experts to evaluate various table-related tasks and capabilities.\n\n"
+        "Torr is an open-source benchmark developed by domain experts to evaluate various table-related tasks and capabilities.\n\n"
        ".. image:: https://raw.githubusercontent.com/IBM/unitxt/main/assets/catalog/tables_benchmark.png\n"
        " :alt: Optional alt text\n"
        " :width: 30%\n"
@@ -53,4 +53,4 @@ def build_nested_benchmark(dir_path, prefix="recipes.tables_benchmark"):
"It encompasses diverse domains and evaluates a range of capabilities, with additional tasks and domains integrated over time."
),
)
add_to_catalog(benchmark, "benchmarks.tables_benchmark", overwrite=True)
add_to_catalog(benchmark, "benchmarks.torr", overwrite=True)
41 changes: 41 additions & 0 deletions prepare/cards/ragbench_faithfulness.py
@@ -0,0 +1,41 @@
from unitxt import add_to_catalog
from unitxt.blocks import (
    LoadHF,
    TaskCard,
)
from unitxt.operators import Copy, ExecuteExpression
from unitxt.templates import NullTemplate

for subset in [
    "covidqa",
    "cuad",
    "delucionqa",
    "emanual",
    "expertqa",
    "finqa",
    "hagrid",
    "hotpotqa",
    "msmarco",
    "pubmedqa",
    "tatqa",
    "techqa",
]:
    card = TaskCard(
        loader=LoadHF(
            path="rungalileo/ragbench",
            name=subset,
            split="test"
        ),
        preprocess_steps=[
            Copy(field="response", to_field="answer"),
            Copy(field="documents", to_field="contexts"),
            ExecuteExpression(expression="int(adherence_score)", to_field="number_val"),
            ExecuteExpression(expression="['yes' if adherence_score else 'no']", to_field="is_faithful"),
        ],
        task="tasks.rag_eval.faithfulness.binary",
        templates={"default": NullTemplate()},
    )

    add_to_catalog(
        card, f"cards.rag_eval.faithfulness.ragbench.{subset}", overwrite=True
    )
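For reference, the preprocess steps in this card amount to a per-example field mapping over the ragbench records. A plain-Python sketch of the equivalent transformation (illustrative only, using the field names from the card above, not unitxt internals):

def to_faithfulness_instance(example: dict) -> dict:
    # Copy(field="response", to_field="answer") and Copy(field="documents", to_field="contexts")
    example["answer"] = example["response"]
    example["contexts"] = example["documents"]
    # ExecuteExpression evaluates a Python expression over the instance's fields
    adherence = example["adherence_score"]
    example["number_val"] = int(adherence)
    example["is_faithful"] = ["yes" if adherence else "no"]
    return example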
1 change: 1 addition & 0 deletions prepare/engines/classification/classification_engines.py
@@ -6,6 +6,7 @@
)

model_names_to_provider = {
+    "mistral-large-instruct": ["watsonx", "rits"],
    "llama-3-3-70b-instruct": ["watsonx", "rits"],
    "llama-3-1-70b-instruct": ["watsonx", "rits"],
    "gpt-4o": ["open-ai"],
2 changes: 2 additions & 0 deletions prepare/metrics/llm_as_judge/rag_judge.py
@@ -102,6 +102,8 @@ def get_prediction_field(metric_type):
"llama_3_3_70b_instruct_watsonx": "engines.classification.llama_3_3_70b_instruct_watsonx",
"llama_3_3_70b_instruct_rits": "engines.classification.llama_3_3_70b_instruct_rits",
"gpt_4o_azure": "engines.classification.gpt_4o_2024_08_06_azure_openai",
"mistral_large_instruct_watsonx": "engines.classification.mistral_large_watsonx",
"mistral_large_instruct_rits": "engines.classification.mistral_large_instruct_2407_rits",
generic_engine_label: GenericInferenceEngine(),
}

@@ -51,7 +51,7 @@

add_to_catalog(
    DatasetRecipe(**kwargs),
-    f"recipes.tables_benchmark.{card}.{serializer}."
+    f"recipes.torr.{card}.{serializer}."
    + (",".join(augment).split("[")[0] if augment else "no")
    + f"_augmentation_{num_demos}_demos",
    overwrite=True,
2 changes: 1 addition & 1 deletion prepare/templates/rag_eval/rag_eval_numeric.py
@@ -53,7 +53,7 @@ def add_rag_templates(

correctness_input_format = """Question: {question}\n
Ground-truth answer: {ground_truths}\n
-Prediction: {answer}
+Prediction: {answer}.\n
"""

correctness_reference_based_with_context_input_format = """Question: {question}\n
