Commit
Merge branch 'main' into ragbench
elronbandel authored Feb 18, 2025
2 parents a57746e + 3dd1f9c commit 69f7e1a
Showing 3 changed files with 93 additions and 4 deletions.
12 changes: 11 additions & 1 deletion docs/docs/examples.rst
@@ -52,13 +52,23 @@ These examples demonstrate how to evaluate datasets of different tasks when pr

Related documentation: :ref:`Evaluating datasets <evaluating_datasets>`

Evaluate a custom dataset with a custom metric
===================================================

This example demonstrates how to add a custom metric. It adds a referenceless metric that checks whether
the model output is valid JSON, for an extraction task.

`Example code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_with_custom_metric.py>`__

Related documentation: :ref:`Add new metric tutorial <adding_metric>`
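
For orientation, a minimal sketch of such a referenceless metric, closely following the
``IsValidJson`` metric from the linked example code, looks like this:

.. code-block:: python

    import json
    from typing import Dict, List

    from unitxt.metrics import InstanceMetric

    class IsValidJson(InstanceMetric):
        main_score = "valid_json"
        reduction_map = {"mean": ["valid_json"]}
        ci_scores = ["valid_json"]
        prediction_type = str

        def compute(
            self, references: List[str], prediction: str, task_data: List[Dict]
        ) -> dict:
            try:
                json.loads(prediction)
                return {self.main_score: 1.0}
            except Exception as e:
                return {self.main_score: 0.0, "error": str(e)}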

Evaluate a Named Entity Recognition (NER) dataset
===================================================

This example demonstrates how to evaluate a named entity recognition task.
The ground truth entities are provided as spans within the provided texts,
and the model is prompted to identify these entities.
-Classifical f1_micro, f1_macro, and per-entity-type f1 metrics are reported.
+Classical f1_micro, f1_macro, and per-entity-type f1 metrics are reported.

`Example code <https://github.com/IBM/unitxt/blob/main/examples/ner_evaluation.py>`__

78 changes: 78 additions & 0 deletions examples/evaluate_with_custom_metric.py
@@ -0,0 +1,78 @@
from typing import Dict, List

from unitxt import get_logger
from unitxt.api import create_dataset, evaluate
from unitxt.blocks import Task
from unitxt.inference import (
CrossProviderInferenceEngine,
)
from unitxt.metrics import InstanceMetric
from unitxt.templates import InputOutputTemplate

logger = get_logger()

# Set up the input texts for the extraction task
data = [
{"text": "John paid Apple $100 dollars."},
{"text": "IBM was paid 200 dollars by Phil"},
]

class IsValidJson(InstanceMetric):

    main_score = "valid_json"  # name of the main score
    reduction_map = {"mean": ["valid_json"]}  # the global score is the mean of the instance scores
    ci_scores = ["valid_json"]  # confidence intervals are calculated for this score
    prediction_type = str  # the metric expects the prediction as a str

    def compute(
        self, references: List[str], prediction: str, task_data: List[Dict]
    ) -> dict:
        import json

        try:
            json.loads(prediction)
            return {self.main_score: 1.0, "error": "no errors. successfully parsed json."}
        except Exception as e:
            return {self.main_score: 0.0, "error": str(e)}


# Define the extraction task
task = Task(
input_fields={"text": str},
reference_fields={},
prediction_type=str,
metrics=[IsValidJson()],
)
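# The custom IsValidJson metric is attached to the task, so evaluate() will compute it
# per instance and aggregate the instance scores according to its reduction_map (mean).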


# Create a simple template that passes the input text through,
# with an instruction to return the extracted fields as JSON.

template = InputOutputTemplate(
    instruction="Extract the company name and amount as a json with two keys COMPANY_NAME and AMOUNT. Return only a valid json that can be parsed, without any explanations or prefixes and suffixes",
input_format="{text}",
output_format="",
)
# Verbalize the dataset using the template
dataset = create_dataset(
task=task, test_set=data, template=template, format="formats.chat_api", split="test"
)
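# With format="formats.chat_api", each instance's source is rendered as a list of
# chat messages, which chat-completion style inference engines can consume directly.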


# To infer locally with SmolLM2 via the HF pipeline, use instead:
# from unitxt.inference import HFPipelineBasedInferenceEngine
# model = HFPipelineBasedInferenceEngine(
#     model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct", max_new_tokens=32
# )
# Here we infer with an external API:
model = CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam", "rits"]
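# Note: running against an external provider (e.g. watsonx) is assumed to require the
# corresponding API credentials to be configured in your environment.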


predictions = model(dataset)
results = evaluate(predictions=predictions, data=dataset)

print("Instance Results:")
print(results.instance_scores)

print("Global Results:")
print(results.global_scores.summary)
7 changes: 4 additions & 3 deletions examples/standalone_qa_evaluation.py
@@ -43,15 +43,16 @@
)
# Change to this to infer with external APIs:
# from unitxt.inference import CrossProviderInferenceEngine
-# engine = CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
+# model = CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam", "rits"]


predictions = model(dataset)
results = evaluate(predictions=predictions, data=dataset)

print("Instance Results:")
print(results.instance_scores)

print("Global Results:")
print(results.global_scores.summary)

print("Instance Results:")
print(results.instance_scores.summary)
