Merge branch 'main' into new-text2sql-metrics-scores
elronbandel authored Feb 18, 2025
2 parents e694a16 + b47f4fe commit 3fe6200
Showing 403 changed files with 1,298 additions and 461 deletions.
34 changes: 34 additions & 0 deletions examples/evaluate_torr.py
@@ -0,0 +1,34 @@
from unitxt import evaluate, load_dataset, settings
from unitxt.inference import (
    CrossProviderInferenceEngine,
)

with settings.context(
    allow_unverified_code=True,
    mock_inference_mode=True,
):
    test_dataset = load_dataset(
        "benchmarks.torr",
        split="test",
        use_cache=True,
    )
    # Infer
    model = CrossProviderInferenceEngine(
        model="llama-3-8b-instruct",
        max_tokens=30,
    )
    """
    We are using a CrossProviderInferenceEngine that supplies API access to providers such as
    watsonx, bam, openai, azure, aws, and more.
    For the arguments these inference engines can receive, please refer to the classes' documentation,
    or read about the OpenAI API arguments that CrossProviderInferenceEngine follows.
    """

    predictions = model(test_dataset)
    results = evaluate(predictions=predictions, data=test_dataset)

    print("Global scores:")
    print(results.global_scores.summary)
    print("Subsets scores:")
    print(results.subsets_scores.summary)
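Note: the example above runs with mock_inference_mode=True, so it needs no provider credentials and the predictions are mocked rather than produced by a real model. Below is a minimal sketch of the same flow against a live provider (assuming provider credentials are configured for unitxt; the explicit provider choice is illustrative and not part of this commit):

from unitxt import evaluate, load_dataset, settings
from unitxt.inference import CrossProviderInferenceEngine

with settings.context(allow_unverified_code=True):  # no mock_inference_mode, so real calls are made
    test_dataset = load_dataset("benchmarks.torr", split="test", use_cache=True)
    model = CrossProviderInferenceEngine(
        model="llama-3-8b-instruct",
        max_tokens=30,
        provider="watsonx",  # assumption: any provider supported by the engine works here
    )
    predictions = model(test_dataset)
    results = evaluate(predictions=predictions, data=test_dataset)
    print(results.global_scores.summary)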
@@ -10,12 +10,12 @@
tables_benchmark_dir = os.path.join(
    constants.catalog_dir,
    "recipes",
-    "tables_benchmark",
+    "torr",
)


# Recursive function to build nested benchmarks
-def build_nested_benchmark(dir_path, prefix="recipes.tables_benchmark"):
+def build_nested_benchmark(dir_path, prefix="recipes.torr"):
    nested_scenarios = OrderedDict()

    for entry in sorted(os.listdir(dir_path)):
@@ -44,7 +44,7 @@ def build_nested_benchmark(dir_path, prefix="recipes.tables_benchmark"):
benchmark = Benchmark(
    tables_benchmark_scenarios.subsets,
    __description__=(
-        "TablesBenchmark is an open-source benchmark developed by domain experts to evaluate various table-related tasks and capabilities.\n\n"
+        "Torr is an open-source benchmark developed by domain experts to evaluate various table-related tasks and capabilities.\n\n"
        ".. image:: https://raw.githubusercontent.com/IBM/unitxt/main/assets/catalog/tables_benchmark.png\n"
        " :alt: Optional alt text\n"
        " :width: 30%\n"
@@ -53,4 +53,4 @@ def build_nested_benchmark(dir_path, prefix="recipes.tables_benchmark"):
"It encompasses diverse domains and evaluates a range of capabilities, with additional tasks and domains integrated over time."
),
)
add_to_catalog(benchmark, "benchmarks.tables_benchmark", overwrite=True)
add_to_catalog(benchmark, "benchmarks.torr", overwrite=True)
41 changes: 41 additions & 0 deletions prepare/cards/ragbench_faithfulness.py
@@ -0,0 +1,41 @@
from unitxt import add_to_catalog
from unitxt.blocks import (
    LoadHF,
    TaskCard,
)
from unitxt.operators import Copy, ExecuteExpression
from unitxt.templates import NullTemplate

for subset in [
    "covidqa",
    "cuad",
    "delucionqa",
    "emanual",
    "expertqa",
    "finqa",
    "hagrid",
    "hotpotqa",
    "msmarco",
    "pubmedqa",
    "tatqa",
    "techqa",
]:
    card = TaskCard(
        loader=LoadHF(
            path="rungalileo/ragbench",
            name=subset,
            split="test"
        ),
        preprocess_steps=[
            Copy(field="response", to_field="answer"),
            Copy(field="documents", to_field="contexts"),
            ExecuteExpression(expression="int(adherence_score)", to_field="number_val"),
            ExecuteExpression(expression="['yes' if adherence_score else 'no']", to_field="is_faithful"),
        ],
        task="tasks.rag_eval.faithfulness.binary",
        templates={"default": NullTemplate()},
    )

    add_to_catalog(
        card, f"cards.rag_eval.faithfulness.ragbench.{subset}", overwrite=True
    )
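For reference, the preprocess steps in this card amount to a per-example field mapping over the ragbench records. A plain-Python sketch of the equivalent transformation (illustrative only, using the field names from the card above, not unitxt internals):

def to_faithfulness_instance(example: dict) -> dict:
    # Copy(field="response", to_field="answer") and Copy(field="documents", to_field="contexts")
    example["answer"] = example["response"]
    example["contexts"] = example["documents"]
    # ExecuteExpression evaluates a Python expression over the instance's fields
    adherence = example["adherence_score"]
    example["number_val"] = int(adherence)
    example["is_faithful"] = ["yes" if adherence else "no"]
    return example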
1 change: 1 addition & 0 deletions prepare/engines/classification/classification_engines.py
@@ -6,6 +6,7 @@
)

model_names_to_provider = {
+    "mistral-large-instruct": ["watsonx", "rits"],
    "llama-3-3-70b-instruct": ["watsonx", "rits"],
    "llama-3-1-70b-instruct": ["watsonx", "rits"],
    "gpt-4o": ["open-ai"],
2 changes: 2 additions & 0 deletions prepare/metrics/llm_as_judge/rag_judge.py
@@ -102,6 +102,8 @@ def get_prediction_field(metric_type):
"llama_3_3_70b_instruct_watsonx": "engines.classification.llama_3_3_70b_instruct_watsonx",
"llama_3_3_70b_instruct_rits": "engines.classification.llama_3_3_70b_instruct_rits",
"gpt_4o_azure": "engines.classification.gpt_4o_2024_08_06_azure_openai",
"mistral_large_instruct_watsonx": "engines.classification.mistral_large_watsonx",
"mistral_large_instruct_rits": "engines.classification.mistral_large_instruct_2407_rits",
generic_engine_label: GenericInferenceEngine(),
}

@@ -51,7 +51,7 @@

add_to_catalog(
    DatasetRecipe(**kwargs),
-    f"recipes.tables_benchmark.{card}.{serializer}."
+    f"recipes.torr.{card}.{serializer}."
    + (",".join(augment).split("[")[0] if augment else "no")
    + f"_augmentation_{num_demos}_demos",
    overwrite=True,
2 changes: 1 addition & 1 deletion prepare/templates/rag_eval/rag_eval_numeric.py
@@ -53,7 +53,7 @@ def add_rag_templates(

correctness_input_format = """Question: {question}\n
Ground-truth answer: {ground_truths}\n
-Prediction: {answer}
+Prediction: {answer}.\n
"""

correctness_reference_based_with_context_input_format = """Question: {question}\n
