diff --git a/examples/evaluate_rag_end_to_end_dataset_with_given_predictions.py b/examples/evaluate_rag_end_to_end_dataset_with_given_predictions.py
index 2d7e3a5098..ac6399bae8 100644
--- a/examples/evaluate_rag_end_to_end_dataset_with_given_predictions.py
+++ b/examples/evaluate_rag_end_to_end_dataset_with_given_predictions.py
@@ -45,11 +45,21 @@
     },
 ]
 
+# Select the recommended metrics according to your available resources.
+metrics = [
+    "metrics.rag.end_to_end.recommended.cpu_only.all",
+    # "metrics.rag.end_to_end.recommended.small_llm.all",
+    # "metrics.rag.end_to_end.recommended.llmaj_watsonx.all",
+    # "metrics.rag.end_to_end.recommended.llmaj_rits.all",
+    # "metrics.rag.end_to_end.recommended.llmaj_azure.all",
+]
+
 dataset = create_dataset(
     task="tasks.rag.end_to_end",
     test_set=dataset,
     split="test",
     postprocessors=[],
+    metrics=metrics,
 )
 
 results = evaluate(predictions, dataset)
diff --git a/examples/evaluate_rag_response_generation.py b/examples/evaluate_rag_response_generation.py
index 01249b5b0b..1856f61a40 100644
--- a/examples/evaluate_rag_response_generation.py
+++ b/examples/evaluate_rag_response_generation.py
@@ -58,6 +58,15 @@
     ),
 )
 
+# Select the recommended metrics according to your available resources.
+metrics = [
+    "metrics.rag.response_generation.recommended.cpu_only.all",
+    # "metrics.rag.response_generation.recommended.small_llm.all",
+    # "metrics.rag.response_generation.recommended.llmaj_watsonx.all",
+    # "metrics.rag.response_generation.recommended.llmaj_rits.all",
+    # "metrics.rag.response_generation.recommended.llmaj_azure.all",
+]
+
 # Verbalize the dataset using the template
 dataset = load_dataset(
     card=card,
@@ -65,6 +74,7 @@
     format="formats.chat_api",
     split="test",
     max_test_instances=10,
+    metrics=metrics,
 )