From 1ad5553c226cec18829431a6a141894d2cdd031f Mon Sep 17 00:00:00 2001
From: Jason Dai
Date: Thu, 2 Nov 2023 17:41:54 -0700
Subject: [PATCH] feat(components): add LLM Eval pipeline parameter for customizing eval dataset reference ground truth field

PiperOrigin-RevId: 579029514
---
 components/google-cloud/RELEASE.md                   |  1 +
 .../model_evaluation/llm_evaluation/component.py     |  2 +-
 .../evaluation_llm_text_generation_pipeline.py       | 11 ++++++-----
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/components/google-cloud/RELEASE.md b/components/google-cloud/RELEASE.md
index d65b7c4246b6..608a9c520285 100644
--- a/components/google-cloud/RELEASE.md
+++ b/components/google-cloud/RELEASE.md
@@ -1,5 +1,6 @@
 ## Upcoming release
 * Bump supported KFP versions to kfp>=2.0.0b10,<=2.4.0
+* Add LLM Eval pipeline parameter for customizing eval dataset reference ground truth field
 
 ## Release 2.5.0
 * Upload tensorboard metrics from `preview.llm.rlhf_pipeline` if a `tensorboard_resource_id` is provided at runtime.

diff --git a/components/google-cloud/google_cloud_pipeline_components/_implementation/model_evaluation/llm_evaluation/component.py b/components/google-cloud/google_cloud_pipeline_components/_implementation/model_evaluation/llm_evaluation/component.py
index 972e752097eb..99f931666750 100644
--- a/components/google-cloud/google_cloud_pipeline_components/_implementation/model_evaluation/llm_evaluation/component.py
+++ b/components/google-cloud/google_cloud_pipeline_components/_implementation/model_evaluation/llm_evaluation/component.py
@@ -32,7 +32,7 @@ def model_evaluation_text_generation(
     project: str,
     location: str,
     evaluation_task: str = 'text-generation',
-    target_field_name: str = 'instance.ground_truth',
+    target_field_name: str = 'instance.output_text',
     prediction_field_name: str = 'predictions.content',
     predictions_format: str = 'jsonl',
     joined_predictions_gcs_source: dsl.Input[Artifact] = None,

diff --git a/components/google-cloud/google_cloud_pipeline_components/preview/model_evaluation/evaluation_llm_text_generation_pipeline.py b/components/google-cloud/google_cloud_pipeline_components/preview/model_evaluation/evaluation_llm_text_generation_pipeline.py
index cae0f06a133c..51acd49b1763 100644
--- a/components/google-cloud/google_cloud_pipeline_components/preview/model_evaluation/evaluation_llm_text_generation_pipeline.py
+++ b/components/google-cloud/google_cloud_pipeline_components/preview/model_evaluation/evaluation_llm_text_generation_pipeline.py
@@ -30,11 +30,12 @@ def evaluation_llm_text_generation_pipeline(  # pylint: disable=dangerous-defaul
     location: str,
     batch_predict_gcs_source_uris: List[str],
     batch_predict_gcs_destination_output_uri: str,
-    batch_predict_model_parameters: Dict[str, str] = {},
     model_name: str = 'publishers/google/models/text-bison@001',
     evaluation_task: str = 'text-generation',
+    target_field_name: str = 'output_text',
     batch_predict_instances_format: str = 'jsonl',
     batch_predict_predictions_format: str = 'jsonl',
+    batch_predict_model_parameters: Dict[str, str] = {},
     machine_type: str = 'e2-highmem-16',
     service_account: str = '',
     network: str = '',
@@ -52,10 +53,11 @@ def evaluation_llm_text_generation_pipeline(  # pylint: disable=dangerous-defaul
   Args:
     project: The GCP project that runs the pipeline components.
     location: The GCP region that runs the pipeline components.
-    batch_predict_gcs_source_uris: Google Cloud Storage URI(-s) to your instances data to run batch prediction on. The instances data should also contain the ground truth (target) data, used for evaluation. May contain wildcards. For more information on wildcards, see https://cloud.google.com/storage/docs/gsutil/addlhelp/WildcardNames. For more details about this input config, see https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig.
-    batch_predict_gcs_destination_output_uri: The Google Cloud Storage location of the directory where the output is to be written to.
+    batch_predict_gcs_source_uris: Google Cloud Storage URI(-s) to your eval dataset instances to run batch prediction on. The instances data should also contain the ground truth (target) data, used for evaluation. May contain wildcards. For more information on wildcards, see https://cloud.google.com/storage/docs/gsutil/addlhelp/WildcardNames. For more details about this input config, see https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig.
+    batch_predict_gcs_destination_output_uri: The Google Cloud Storage location of the directory where the eval pipeline output is to be written.
     model_name: The Model name used to run evaluation. Must be a publisher Model or a managed Model sharing the same ancestor location. Starting this job has no impact on any existing deployments of the Model and their resources.
     evaluation_task: The task that the large language model will be evaluated on. The evaluation component computes a set of metrics relevant to that specific task. Currently supported tasks are: `summarization`, `question-answering`, `text-generation`.
+    target_field_name: The field name of the eval dataset instance that contains an example reference text response, also referred to as the ground truth (or ground_truth_column) field. If not set, defaults to `output_text`.
     batch_predict_instances_format: The format in which instances are given, must be one of the Model's supportedInputStorageFormats. Only "jsonl" is currently supported. For more details about this input config, see https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig.
     batch_predict_predictions_format: The format in which Vertex AI gives the predictions. Must be one of the Model's supportedOutputStorageFormats. Only "jsonl" is currently supported. For more details about this output config, see https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#OutputConfig.
     batch_predict_model_parameters: A map of parameters that govern the predictions. Some acceptable parameters include: maxOutputTokens, topK, topP, and temperature.
@@ -102,8 +104,7 @@ def evaluation_llm_text_generation_pipeline(  # pylint: disable=dangerous-defaul
       project=project,
      location=location,
       evaluation_task=evaluation_task,
-      target_field_name='instance.ground_truth',
-      prediction_field_name='predictions.content',
+      target_field_name=f'instance.{target_field_name}',
       predictions_format=batch_predict_predictions_format,
       joined_predictions_gcs_source=batch_predict_task.outputs[
           'gcs_output_directory'
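
Usage sketch: a minimal example of driving the new `target_field_name` parameter end to end, assuming a JSONL eval dataset. The project, bucket, JSONL path, and the `reference` field name are hypothetical placeholders, and the `preview.model_evaluation` import plus the `aiplatform.PipelineJob` submission flow are one plausible way to run the compiled pipeline, not something this patch prescribes.

# eval_with_custom_target_field.py -- illustrative sketch only; resource
# names ('my-project', 'gs://my-bucket/...') are placeholders.
from google.cloud import aiplatform
from kfp import compiler

from google_cloud_pipeline_components.preview.model_evaluation import (
    evaluation_llm_text_generation_pipeline,
)

# Compile the pipeline function to a local pipeline spec.
compiler.Compiler().compile(
    pipeline_func=evaluation_llm_text_generation_pipeline,
    package_path='eval_pipeline.json',
)

# Each JSONL eval instance carries the prompt plus the ground-truth field.
# With target_field_name='reference', the evaluator reads the target from
# 'instance.reference' instead of the default 'instance.output_text', e.g.:
#   {"prompt": "Summarize: ...", "reference": "A short gold summary."}
aiplatform.init(project='my-project', location='us-central1')
job = aiplatform.PipelineJob(
    display_name='llm-text-generation-eval',
    template_path='eval_pipeline.json',
    parameter_values={
        'project': 'my-project',
        'location': 'us-central1',
        'batch_predict_gcs_source_uris': ['gs://my-bucket/eval.jsonl'],
        'batch_predict_gcs_destination_output_uri': 'gs://my-bucket/eval-out',
        'evaluation_task': 'summarization',
        # New in this change; previously hardcoded to 'instance.ground_truth'.
        'target_field_name': 'reference',
    },
)
job.submit()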