Commit
Merge branch 'main' into ragbench
elronbandel authored Feb 18, 2025
2 parents a57746e + 3dd1f9c commit 69f7e1a
Showing 3 changed files with 93 additions and 4 deletions.
12 changes: 11 additions & 1 deletion docs/docs/examples.rst
@@ -52,13 +52,23 @@ These examples demonstrate how to evaluate datasets of different tasks when pr

Related documentation: :ref:`Evaluating datasets <evaluating_datasets>`

Evaluate a custom dataset with a custom metric
===================================================

This example demonstrates how to add a custom metric. It adds a referenceless metric that checks whether
the model output is valid JSON, for an extraction task.

`Example code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_with_custom_metric.py>`__

Related documentation: :ref:`Add new metric tutorial <adding_metric>`
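
For orientation, a minimal sketch of such a referenceless metric, closely following the
``IsValidJson`` metric from the linked example code, looks like this:

.. code-block:: python

    import json
    from typing import Dict, List

    from unitxt.metrics import InstanceMetric

    class IsValidJson(InstanceMetric):
        main_score = "valid_json"
        reduction_map = {"mean": ["valid_json"]}
        ci_scores = ["valid_json"]
        prediction_type = str

        def compute(
            self, references: List[str], prediction: str, task_data: List[Dict]
        ) -> dict:
            try:
                json.loads(prediction)
                return {self.main_score: 1.0}
            except Exception as e:
                return {self.main_score: 0.0, "error": str(e)}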

Evaluate a Named Entity Recognition (NER) dataset
===================================================

This example demonstrates how to evaluate a named entity recognition task.
The ground truth entities are provided as spans within the provided texts,
and the model is prompted to identify these entities.
-Classifical f1_micro, f1_macro, and per-entity-type f1 metrics are reported.
+Classical f1_micro, f1_macro, and per-entity-type f1 metrics are reported.

`Example code <https://github.com/IBM/unitxt/blob/main/examples/ner_evaluation.py>`__

78 changes: 78 additions & 0 deletions examples/evaluate_with_custom_metric.py
@@ -0,0 +1,78 @@
from typing import Dict, List

from unitxt import get_logger
from unitxt.api import create_dataset, evaluate
from unitxt.blocks import Task
from unitxt.inference import (
CrossProviderInferenceEngine,
)
from unitxt.metrics import InstanceMetric
from unitxt.templates import InputOutputTemplate

logger = get_logger()

# Set up the input texts for the extraction task
data = [
{"text": "John paid Apple $100 dollars."},
{"text": "IBM was paid 200 dollars by Phil"},
]

class IsValidJson(InstanceMetric):

    main_score = "valid_json"  # name of the main score
    reduction_map = {"mean": ["valid_json"]}  # the global score is the mean of the instance scores
    ci_scores = ["valid_json"]  # confidence intervals are calculated for this score
    prediction_type = str  # the metric expects the prediction as a str

    def compute(
        self, references: List[str], prediction: str, task_data: List[Dict]
    ) -> dict:
        import json

        try:
            json.loads(prediction)
            return {self.main_score: 1.0, "error": "no errors. successfully parsed json."}
        except Exception as e:
            return {self.main_score: 0.0, "error": str(e)}


# Define the extraction task
task = Task(
input_fields={"text": str},
reference_fields={},
prediction_type=str,
metrics=[IsValidJson()],
)
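# The custom IsValidJson metric is attached to the task, so evaluate() will compute it
# per instance and aggregate the instance scores according to its reduction_map (mean).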


# Create a simple template that passes the input text through,
# with an instruction to return the extracted fields as JSON.

template = InputOutputTemplate(
    instruction="Extract the company name and amount as a json with two keys COMPANY_NAME and AMOUNT. Return only a valid json that can be parsed, without any explanations or prefixes and suffixes",
input_format="{text}",
output_format="",
)
# Verbalize the dataset using the template
dataset = create_dataset(
task=task, test_set=data, template=template, format="formats.chat_api", split="test"
)
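# With format="formats.chat_api", each instance's source is rendered as a list of
# chat messages, which chat-completion style inference engines can consume directly.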


# To infer locally with SmolLM2 via the HF pipeline, use instead:
# from unitxt.inference import HFPipelineBasedInferenceEngine
# model = HFPipelineBasedInferenceEngine(
#     model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct", max_new_tokens=32
# )
# Here we infer with an external API:
model = CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam", "rits"]
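# Note: running against an external provider (e.g. watsonx) is assumed to require the
# corresponding API credentials to be configured in your environment.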


predictions = model(dataset)
results = evaluate(predictions=predictions, data=dataset)

print("Instance Results:")
print(results.instance_scores)

print("Global Results:")
print(results.global_scores.summary)
7 changes: 4 additions & 3 deletions examples/standalone_qa_evaluation.py
@@ -43,15 +43,16 @@
)
# Change to this to infer with external APIs:
# from unitxt.inference import CrossProviderInferenceEngine
-# engine = CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
+# model = CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam", "rits"]


predictions = model(dataset)
results = evaluate(predictions=predictions, data=dataset)

print("Instance Results:")
print(results.instance_scores)

print("Global Results:")
print(results.global_scores.summary)

print("Instance Results:")
print(results.instance_scores.summary)
