-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fixes and adjustment in rag metrics and related inference engines (#1466
) * add new classification engines and remove deleted llama-3-70b-instruct in bam Signed-off-by: lilacheden <[email protected]> * allow proprietary data on RITSInferenceEngine Signed-off-by: lilacheden <[email protected]> * propagate score_prefix from metricPipeline to its metric Signed-off-by: lilacheden <[email protected]> * Adjust autorag metrics to unitxt flow with rag.response_generation task Signed-off-by: lilacheden <[email protected]> * Adjust granite_guardian to unitxt flow Signed-off-by: lilacheden <[email protected]> * Add AzureOpenAIInferenceEngine Signed-off-by: lilacheden <[email protected]> * lowercase engine label Signed-off-by: lilacheden <[email protected]> * move to azure openai classification engines Signed-off-by: lilacheden <[email protected]> * set chat_api format for openai Signed-off-by: lilacheden <[email protected]> * update secrets Signed-off-by: lilacheden <[email protected]> * delete old inference engine Signed-off-by: lilacheden <[email protected]> * update secrets Signed-off-by: lilacheden <[email protected]> * add numeric and verbal postprocessors Signed-off-by: lilacheden <[email protected]> * fix rag judge example Signed-off-by: lilacheden <[email protected]> * add numeric and verbal rag judge templates Signed-off-by: lilacheden <[email protected]> * add rag judges that use the new templates Signed-off-by: lilacheden <[email protected]> * fix import Signed-off-by: lilacheden <[email protected]> * rename metrics with correct template Signed-off-by: lilacheden <[email protected]> * avoid import from prepare Signed-off-by: lilacheden <[email protected]> * remove old metrics Signed-off-by: lilacheden <[email protected]> * add token overlap based context relevance and answer relevance metrics Signed-off-by: lilacheden <[email protected]> * add postprocessors tests Signed-off-by: lilacheden <[email protected]> * keep only recommended rag llmaj, deprecate old path to metrics Signed-off-by: lilacheden <[email protected]> * update secret 
Signed-off-by: lilacheden <[email protected]> * update secret again Signed-off-by: lilacheden <[email protected]> * fix typo Signed-off-by: lilacheden <[email protected]> * remove gen_ai from inference test Signed-off-by: lilacheden <[email protected]> * comment out input_tokens test Signed-off-by: lilacheden <[email protected]>
- Loading branch information
1 parent
6e7284b
commit 1350d56
Showing
101 changed files
with
1,274 additions
and
226 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
from unitxt import add_to_catalog | ||
from unitxt.artifact import UnitxtArtifactNotFoundError, fetch_artifact | ||
from unitxt.inference import GenericInferenceEngine | ||
from unitxt.llm_as_judge import ( | ||
TaskBasedLLMasJudge, | ||
) | ||
|
||
# Maps each RAG metric type to the judge templates available for it.
# Inner keys are short labels for the template's input fields; they become part
# of the catalog name. Inner values are the template names, which are inserted
# into "templates.rag_eval.<metric_type>.<template_name><suffix>".
metric_type_to_template_dict = {
    "faithfulness": {
        "q_c_a": "judge_with_question_simplified",
        "c_a": "judge_no_question_simplified",
    },
    "context_relevance": {"q_c_ares": "judge_context_relevance_ares"},
    "correctness_holistic": {"q_c_a": "judge_correctness_simple"},
    "answer_correctness": {"q_a_gt_loose": "judge_loose_match_no_context"},
    "answer_relevance": {"q_a": "judge_answer_relevance"},
}

# Additional template-name suffix tried for each metric type, on top of the
# plain ("") and "_logprobs" variants. Must have one entry per key of
# metric_type_to_template_dict, since the registration loop indexes it by
# metric type.
metric_type_to_realization = {
    "faithfulness": "_verbal",
    "context_relevance": "_numeric",
    "correctness_holistic": "_numeric",
    "answer_correctness": "_numeric",
    "answer_relevance": "_numeric",
}
|
||
# Label of the engine-agnostic judge entry; the registration loop compares
# against it to skip the logprobs variants for this engine.
generic_engine_label = "generic_inference_engine"

# Judge engines to register metrics for, keyed by the label used in catalog
# names. A value is either a catalog reference string or an engine instance.
inference_models = {
    "llama_3_1_70b_instruct_wml": "engines.classification.llama_3_1_70b_instruct_wml",
    generic_engine_label: GenericInferenceEngine(),
}
|
||
|
||
def get_prediction_field(metric_type):
    """Return the name of the field holding the prediction for *metric_type*.

    Args:
        metric_type: Metric type name, e.g. ``"faithfulness"``.

    Returns:
        ``None`` for ``"context_relevance"``, and ``"answer"`` for every other
        metric type.
    """
    return None if metric_type == "context_relevance" else "answer"
|
||
|
||
# Register one TaskBasedLLMasJudge metric in the catalog for every combination
# of (metric type, template, template variant, inference engine) whose template
# actually exists in the catalog.
for metric_type, template_dict in metric_type_to_template_dict.items():
    # The task depends only on the metric type, so build its name once.
    task_name = f"tasks.rag_eval.{metric_type}.binary"
    for template_short_name, template_name in template_dict.items():
        # Variants tried per template: plain, logprobs-based, and the
        # realization suffix configured for this metric type.
        for logprobs_label in [
            "",
            "_logprobs",
            metric_type_to_realization[metric_type],
        ]:
            use_logprobs = logprobs_label == "_logprobs"
            template = (
                f"templates.rag_eval.{metric_type}.{template_name}{logprobs_label}"
            )
            # Only probe for existence; the metric refers to the template by
            # name, so the fetched artifact itself is not needed.
            try:
                fetch_artifact(template)
            except UnitxtArtifactNotFoundError:
                continue

            metric_label = f"{metric_type}_{template_short_name}{logprobs_label}"

            for inf_label, inference_model in inference_models.items():
                if (
                    use_logprobs and inf_label == generic_engine_label
                ):  # engine GenericInferenceEngine does not support logprobs
                    continue

                # Arguments shared by the metric and its deprecated alias.
                judge_kwargs = {
                    "inference_model": inference_model,
                    "template": template,
                    "task": task_name,
                    "format": None,
                    "main_score": metric_label,
                    "prediction_field": get_prediction_field(metric_type),
                    "infer_log_probs": use_logprobs,
                }

                new_catalog_name = f"metrics.rag.{metric_type}.{inf_label}_{template_short_name}{logprobs_label}"

                add_to_catalog(
                    TaskBasedLLMasJudge(**judge_kwargs),
                    new_catalog_name,
                    overwrite=True,
                )

                if logprobs_label in ("_logprobs", ""):
                    # for backwards compatibility: keep also legacy path to metrics
                    add_to_catalog(
                        TaskBasedLLMasJudge(
                            **judge_kwargs,
                            __deprecated_msg__=f"This metric should be replaced with {new_catalog_name}",
                        ),
                        f"metrics.llm_as_judge.binary.{inf_label}_{metric_label}",
                        overwrite=True,
                    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
Oops, something went wrong.