Merge branch 'main' into update_rag_metrics
elronbandel authored Jan 19, 2025
2 parents e96bb97 + b17ac7c commit 7481cf4
Showing 66 changed files with 1,773 additions and 244 deletions.
61 changes: 21 additions & 40 deletions examples/evaluate_existing_dataset_by_llm_as_judge_direct.py
@@ -5,7 +5,6 @@
from unitxt.inference import (
CrossProviderInferenceEngine,
)
-from unitxt.text_utils import print_dict

logger = get_logger()
settings = get_settings()
@@ -16,15 +15,14 @@
metrics = [
"metrics.llm_as_judge.direct.rits.llama3_1_70b"
"[context_fields=[context,question],"
f"criteria=metrics.llm_as_judge.direct.criterias.{criteria},"
f"score_prefix={criteria}_]"
f"criteria=metrics.llm_as_judge.direct.criterias.{criteria}]"
for criteria in criterias
]
dataset = load_dataset(
card="cards.squad",
metrics=metrics,
-    loader_limit=10,
-    max_test_instances=10,
+    loader_limit=20,
+    max_test_instances=20,
split="test",
)

@@ -48,37 +46,20 @@
evaluated_predictions = evaluate(predictions=predictions, data=dataset)
evaluated_gold_answers = evaluate(predictions=gold_answers, data=dataset)

-print_dict(
-    evaluated_predictions[0],
-    keys_to_print=[
-        "source",
-        "score",
-    ],
-)
-print_dict(
-    evaluated_gold_answers[0],
-    keys_to_print=[
-        "source",
-        "score",
-    ],
-)

for criteria in criterias:
logger.info(f"Scores for criteria '{criteria}'")
gold_answer_scores = [
instance["score"]["instance"][f"{criteria}_llm_as_a_judge_score"]
for instance in evaluated_gold_answers
instance["score"]["instance"][criteria] for instance in evaluated_gold_answers
]
gold_answer_position_bias = [
int(instance["score"]["instance"][f"{criteria}_positional_bias"])
instance["score"]["instance"][f"{criteria}_positional_bias"]
for instance in evaluated_gold_answers
]
prediction_scores = [
instance["score"]["instance"][f"{criteria}_llm_as_a_judge_score"]
for instance in evaluated_predictions
instance["score"]["instance"][criteria] for instance in evaluated_predictions
]
-    prediction_position_bias = [
-        int(instance["score"]["instance"][f"{criteria}_positional_bias"])
+    prediction_scores_position_bias = [
+        instance["score"]["instance"][f"{criteria}_positional_bias"]
for instance in evaluated_predictions
]

@@ -92,27 +73,27 @@
f"Positional bias occurrence on gold answers: {statistics.mean(gold_answer_position_bias)}"
)
logger.info(
f"Positional bias occurrence on predicted answers: {statistics.mean(prediction_position_bias)}\n"
f"Positional bias occurrence on predicted answers: {statistics.mean(prediction_scores_position_bias)}\n"
)

"""
-Output with 100 examples
+Output with 20 examples
Scores for criteria 'answer_relevance'
-Scores of gold answers: 0.9625 +/- 0.14811526360619054
-Scores of predicted answers: 0.5125 +/- 0.4638102516061385
-Positional bias occurrence on gold answers: 0.03
-Positional bias occurrence on predicted answers: 0.12
+Scores of gold answers: 0.8875 +/- 0.18978866362906205
+Scores of predicted answers: 0.7625 +/- 0.3390679950439998
+Positional bias occurrence on gold answers: 0.25
+Positional bias occurrence on predicted answers: 0.25
Scores for criteria 'coherence'
-Scores of gold answers: 0.159 +/- 0.15689216524464028
-Scores of predicted answers: 0.066 +/- 0.11121005695384194
-Positional bias occurrence on gold answers: 0.16
-Positional bias occurrence on predicted answers: 0.07
+Scores of gold answers: 0.8125 +/- 0.2910394257972982
+Scores of predicted answers: 0.6875 +/- 0.39632356531129037
+Positional bias occurrence on gold answers: 0.3
+Positional bias occurrence on predicted answers: 0.3
Scores for criteria 'conciseness'
Scores of gold answers: 1.0 +/- 0.0
-Scores of predicted answers: 0.34 +/- 0.47609522856952335
-Positional bias occurrence on gold answers: 0.03
-Positional bias occurrence on predicted answers: 0.01
+Scores of predicted answers: 0.6 +/- 0.5026246899500346
+Positional bias occurrence on gold answers: 0
+Positional bias occurrence on predicted answers: 0.05
"""
89 changes: 89 additions & 0 deletions examples/evaluate_existing_dataset_by_llm_as_judge_pairwise.py
@@ -0,0 +1,89 @@
import json

from unitxt import get_logger, get_settings, load_dataset
from unitxt.api import evaluate
from unitxt.inference import (
CrossProviderInferenceEngine,
)
from unitxt.templates import NullTemplate

logger = get_logger()
settings = get_settings()

num_test_instances = 10

# Use the HF load_dataset API to load the squad QA dataset using the standard template in the catalog.
# We limit the number of instances to reduce download time.

dataset = load_dataset(
card="cards.squad",
loader_limit=num_test_instances,
max_test_instances=num_test_instances,
split="test",
)

# Run inference with three models to get their predictions.
inference_model_1 = CrossProviderInferenceEngine(
model="llama-3-2-1b-instruct", provider="watsonx"
)

inference_model_2 = CrossProviderInferenceEngine(
model="llama-3-8b-instruct", provider="watsonx"
)

inference_model_3 = CrossProviderInferenceEngine(
model="llama-3-70b-instruct", provider="watsonx"
)

"""
We are using a CrossProviderInferenceEngine, which supplies API access to providers such as
watsonx, bam, openai, azure, aws, and more.
For the arguments these inference engines can receive, please refer to the class documentation or read
about the OpenAI API arguments that the CrossProviderInferenceEngine follows.
"""
predictions_1 = inference_model_1.infer(dataset)
predictions_2 = inference_model_2.infer(dataset)
predictions_3 = inference_model_3.infer(dataset)

gold_answers = [d[0] for d in dataset["references"]]

# Assemble, per instance, the systems to compare: the gold answer plus the three model predictions.
predictions = [
list(t)
for t in list(zip(gold_answers, predictions_1, predictions_2, predictions_3))
]

print(json.dumps(predictions, indent=4))

criterias = ["factually_consistent"]
metrics = [
"metrics.llm_as_judge.pairwise.rits.llama3_1_405b"
f"[criteria=metrics.llm_as_judge.pairwise.criterias.{criteria},"
"context_fields=[context,question]]"
for criteria in criterias
]
dataset = load_dataset(
card="cards.squad",
loader_limit=num_test_instances,
max_test_instances=num_test_instances,
metrics=metrics,
template=NullTemplate(),
split="test",
)

evaluated_predictions = evaluate(predictions=predictions, data=dataset)

prediction_scores_by_system = {
f"system_{system}": {
"per_instance_winrate": [
instance["score"]["instance"][f"{system}_winrate"]
for instance in evaluated_predictions
],
"mean_winrate": evaluated_predictions[0]["score"]["global"][
f"{system}_winrate"
],
}
for system in range(1, len(predictions[0]) + 1)
}
print(json.dumps(prediction_scores_by_system, indent=4))
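As a possible follow-up (not part of this commit), the winrate dictionary built above could be used to rank the compared systems. A minimal sketch, assuming only the keys shown in the snippet:

# Hypothetical follow-up: pick the system with the highest mean winrate.
best_system = max(
    prediction_scores_by_system,
    key=lambda name: prediction_scores_by_system[name]["mean_winrate"],
)
print(f"Best system by mean winrate: {best_system}")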
@@ -2,7 +2,7 @@

from unitxt import evaluate, load_dataset
from unitxt.blocks import Task, TaskCard
-from unitxt.llm_as_judge_operators import CreateYesNoCriteriaFromString
+from unitxt.llm_as_judge import CreateYesNoCriteriaFromString
from unitxt.loaders import LoadFromDictionary

data = {
@@ -30,4 +30,4 @@
print(results.global_scores.summary)

print("Instance Scores:")
-print(results.instance_scores.summary)
+print(results.instance_scores)
@@ -59,4 +59,4 @@
print(results.global_scores.summary)

print("Instance Scores:")
-print(results.instance_scores.summary)
+print(results.instance_scores)
@@ -2,9 +2,7 @@

from unitxt import evaluate, load_dataset
from unitxt.blocks import Task, TaskCard
-from unitxt.llm_as_judge_operators import (
-    CreateCriteriaFromString,
-)
+from unitxt.llm_as_judge import CreateCriteriaFromString
from unitxt.loaders import LoadFromDictionary
from unitxt.templates import NullTemplate

@@ -2,7 +2,7 @@

from unitxt import evaluate, load_dataset
from unitxt.blocks import Task, TaskCard
-from unitxt.llm_as_judge_operators import LoadCriteria
+from unitxt.llm_as_judge import LoadCriteria
from unitxt.loaders import LoadFromDictionary
from unitxt.templates import NullTemplate

@@ -4,8 +4,7 @@
from unitxt.api import evaluate, load_dataset
from unitxt.card import Task, TaskCard
from unitxt.inference import CrossProviderInferenceEngine
-from unitxt.llm_as_judge import LLMJudgePairwise
-from unitxt.llm_as_judge_operators import CreateCriteriaFromDict
+from unitxt.llm_as_judge import CreateCriteriaFromDict, LLMJudgePairwise
from unitxt.loaders import LoadFromDictionary
from unitxt.templates import NullTemplate

46 changes: 25 additions & 21 deletions prepare/metrics/llm_as_judge/llm_as_judge.py
@@ -71,25 +71,29 @@ def get_evaluator(

logger.debug("Registering evaluators...")
for evaluator_metadata in EVALUATORS_METADATA:
-    for provider in evaluator_metadata.providers:
-        for evaluator_type in [
-            EvaluatorTypeEnum.DIRECT,
-            EvaluatorTypeEnum.PAIRWISE,
-        ]:
-            evaluator = get_evaluator(
-                name=evaluator_metadata.name,
-                evaluator_type=evaluator_type,
-                provider=provider,
-            )
+    if evaluator_metadata.name not in [
+        EvaluatorNameEnum.GRANITE_GUARDIAN_2B,
+        EvaluatorNameEnum.GRANITE_GUARDIAN_8B,
+    ]:
+        for provider in evaluator_metadata.providers:
+            for evaluator_type in [
+                EvaluatorTypeEnum.DIRECT,
+                EvaluatorTypeEnum.PAIRWISE,
+            ]:
+                evaluator = get_evaluator(
+                    name=evaluator_metadata.name,
+                    evaluator_type=evaluator_type,
+                    provider=provider,
+                )

-            metric_name = (
-                evaluator_metadata.name.value.lower()
-                .replace("-", "_")
-                .replace(".", "_")
-                .replace(" ", "_")
-            )
-            add_to_catalog(
-                evaluator,
-                f"metrics.llm_as_judge.{evaluator_type.value}.{provider.value.lower()}.{metric_name}",
-                overwrite=True,
-            )
+                metric_name = (
+                    evaluator_metadata.name.value.lower()
+                    .replace("-", "_")
+                    .replace(".", "_")
+                    .replace(" ", "_")
+                )
+                add_to_catalog(
+                    evaluator,
+                    f"metrics.llm_as_judge.{evaluator_type.value}.{provider.value.lower()}.{metric_name}",
+                    overwrite=True,
+                )
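A small illustration of how the registration loop above derives catalog names; the enum string values used here are assumptions for the sketch and do not appear in this diff:

# Sketch of the name normalization performed above (assumed enum values).
name_value = "Llama3.1-70b"      # stand-in for evaluator_metadata.name.value
provider_value = "RITS"          # stand-in for provider.value
evaluator_type_value = "direct"  # stand-in for evaluator_type.value

metric_name = (
    name_value.lower()
    .replace("-", "_")
    .replace(".", "_")
    .replace(" ", "_")
)
print(f"metrics.llm_as_judge.{evaluator_type_value}.{provider_value.lower()}.{metric_name}")
# -> metrics.llm_as_judge.direct.rits.llama3_1_70b (the name used in the first example above)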
19 changes: 13 additions & 6 deletions src/unitxt/benchmark.py
@@ -1,9 +1,9 @@
from abc import abstractmethod
-from typing import Dict, Union
+from typing import Dict, List, Optional, Union

from .dataclass import NonPositionalField
from .formats import Format
-from .fusion import FixedFusion, WeightedFusion
+from .fusion import FixedFusion
from .operator import SourceOperator
from .standard import DatasetRecipe
from .stream import MultiStream
@@ -15,6 +15,10 @@ class BaseBenchmark(SourceOperator):
num_demos: int = NonPositionalField(default=None)
system_prompt: SystemPrompt = NonPositionalField(default=None)
loader_limit: int = NonPositionalField(default=None)
+    splits: List[str] = NonPositionalField(
+        default_factory=lambda: ["train", "validation", "test"]
+    )
+    subset: Optional[str] = NonPositionalField(default=None)

@abstractmethod
def reset(self):
@@ -65,14 +69,17 @@ def prepare(self):
def process(
self,
) -> MultiStream:
+        if self.subset is not None:
+            subsets = {self.subset: self.subsets[self.subset]}
+        else:
+            subsets = self.subsets
if self.max_total_samples is None:
operator = FixedFusion(
-                subsets=self.subsets,
+                subsets=subsets,
max_instances_per_subset=self.max_samples_per_subset,
+                include_splits=self.splits,
)
else:
-            operator = WeightedFusion(
-                subsets=self.subsets, max_total_samples=self.max_total_samples
-            )
+            raise NotImplementedError()

return operator()
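The new splits and subset fields shown above default to the three standard splits and to no subset filtering. A plain-Python sketch of the behaviour they add (not the unitxt API itself):

# Plain-Python sketch of the subset filtering introduced above.
from typing import Dict, List, Optional

def select_subsets(subsets: Dict[str, object], subset: Optional[str]) -> Dict[str, object]:
    # Mirrors: {self.subset: self.subsets[self.subset]} when a single subset is requested.
    return subsets if subset is None else {subset: subsets[subset]}

splits: List[str] = ["train", "validation", "test"]  # the new default for `splits`
print(select_subsets({"qa": "recipe_a", "rag": "recipe_b"}, "rag"), splits)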
@@ -2,10 +2,10 @@
"__type__": "llm_judge_direct",
"inference_engine": {
"__type__": "lite_llm_inference_engine",
"model": "watsonx/ibm/granite-guardian-3-2b",
"model": "gpt-4o-2024-08-06",
"max_tokens": 1024,
"seed": 42
},
"evaluator_name": "GRANITE_GUARDIAN_2B",
"evaluator_name": "GPT4",
"generate_summaries": false
}
@@ -2,10 +2,10 @@
"__type__": "llm_judge_direct",
"inference_engine": {
"__type__": "lite_llm_inference_engine",
"model": "watsonx/ibm/granite-guardian-3-8b",
"model": "o1-mini-2024-09-12",
"max_tokens": 1024,
"seed": 42
},
"evaluator_name": "GRANITE_GUARDIAN_8B",
"evaluator_name": "O1_MINI",
"generate_summaries": false
}
@@ -0,0 +1,11 @@
{
"__type__": "llm_judge_direct",
"inference_engine": {
"__type__": "lite_llm_inference_engine",
"model": "o1-preview-2024-09-12",
"max_tokens": 1024,
"seed": 42
},
"evaluator_name": "O1_PREVIEW",
"generate_summaries": false
}
