diff --git a/python/src/aiconfig/eval/api/__init__.py b/python/src/aiconfig/eval/api/__init__.py
index 2ccf6e81a..4c82bf8b1 100644
--- a/python/src/aiconfig/eval/api/__init__.py
+++ b/python/src/aiconfig/eval/api/__init__.py
@@ -9,18 +9,22 @@
     TestSuiteWithInputsSettings,
 )
 """
-from .. import common, metrics
+from .. import test_suite_common, test_suite_metrics
 
 # pyright: reportWildcardImportFromLibrary=false
-from ..lib import (
+from ..test_suite_lib import (
     TestSuiteWithInputsSettings,
     run_test_suite_outputs_only,
     run_test_suite_with_inputs,
 )
+from ..test_suite_metrics import TestSuiteMetric, brevity, substring_match
 
 __all__ = [
-    "common",
-    "metrics",
+    "TestSuiteMetric",
+    "test_suite_common",
+    "test_suite_metrics",
+    "brevity",
+    "substring_match",
     "run_test_suite_outputs_only",
     "run_test_suite_with_inputs",
     "TestSuiteWithInputsSettings",
diff --git a/python/src/aiconfig/eval/batch_common.py b/python/src/aiconfig/eval/batch_common.py
new file mode 100644
index 000000000..9122b8d05
--- /dev/null
+++ b/python/src/aiconfig/eval/batch_common.py
@@ -0,0 +1,31 @@
+from abc import abstractmethod
+from typing import Protocol, Sequence, TypeVar
+
+from aiconfig.eval import batch_common, common
+
+T_Ref = TypeVar("T_Ref")
+T_Ref_contra = TypeVar("T_Ref_contra", contravariant=True)
+
+
+class BatchEvaluationFunctionWithReference(
+    Protocol[
+        common.T_Evaluable, batch_common.T_Ref_contra, common.T_MetricValue_inv
+    ]
+):
+    @abstractmethod
+    async def __call__(
+        self,
+        data: Sequence[common.T_Evaluable],
+        ref: Sequence[batch_common.T_Ref_contra],
+    ) -> list[common.T_MetricValue_inv]:
+        pass
+
+
+class BatchEvaluationFunctionWithoutReference(
+    Protocol[common.T_Evaluable, common.T_MetricValue_inv]
+):
+    @abstractmethod
+    async def __call__(
+        self, data: Sequence[common.T_Evaluable]
+    ) -> list[common.T_MetricValue_inv]:
+        pass
diff --git a/python/src/aiconfig/eval/batch_lib.py b/python/src/aiconfig/eval/batch_lib.py
new file mode 100644
index 000000000..82b247576
--- /dev/null
+++ b/python/src/aiconfig/eval/batch_lib.py
@@ -0,0 +1,402 @@
+import asyncio
+import logging
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Coroutine, Generic, Sequence, cast
+
+import lastmile_utils.lib.core.api as core_utils
+import pandas as pd
+import result
+from aiconfig.eval import batch_common, batch_metrics, common
+from result import Result
+
+logging.basicConfig(format=core_utils.LOGGER_FMT)
+LOGGER = logging.getLogger(__name__)
+
+# Types
+
+
+@dataclass(frozen=True)
+class BatchEvalGeneralSettings:
+    eval_fn_timeout_s: int = 5
+
+
+@dataclass(frozen=True)
+class EvaluableTableWithReference(
+    Generic[common.T_Evaluable, batch_common.T_Ref]
+):
+    df: pd.DataFrame
+
+    @staticmethod
+    def make(
+        # At this point, don't care about the type of input_data. It's display-only now.
+        input_data: Sequence[Any] | None,
+        evaluable: Sequence[common.T_Evaluable],
+        ref_data: Sequence[batch_common.T_Ref],
+    ) -> Result[
+        "EvaluableTableWithReference[common.T_Evaluable, batch_common.T_Ref]",
+        str,
+    ]:
+        # make_df is untyped, but it's safe to cast it here because the types are annotated in this function signature.
+        # We can clearly see here that the output df types will match the input types, so it's safe to cast the output.
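# Illustrative sketch, not part of the patch: the cast used just below only narrows
# the static type of the Ok value; there is no runtime conversion. The same pattern,
# with hypothetical data:
#
#     untyped = common.make_df({"evaluable": ["a", "bb"]})  # Result[pd.DataFrame, str]
#     typed = cast(
#         Result["EvaluableTableWithReference[str, str]", str],
#         untyped.map(EvaluableTableWithReference),
#     )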
+ df = common.make_df( + { + "input_data": input_data, + "ref_data": ref_data, + "evaluable": evaluable, + } + ) + out: Result[ + EvaluableTableWithReference[ + common.T_Evaluable, batch_common.T_Ref + ], + str, + ] = cast( + # + Result[ + EvaluableTableWithReference[ + common.T_Evaluable, batch_common.T_Ref + ], + str, + ], + df.map(EvaluableTableWithReference), + ) + return out + + async def calculate( + self, + metric: batch_metrics.BatchMetricWithReference[ + common.T_Evaluable, batch_common.T_Ref, common.T_MetricValue + ], + ) -> Result["ResultTable[common.T_Evaluable, common.T_MetricValue]", str]: + evaluable: Sequence[common.T_Evaluable] = cast( + # + Sequence[common.T_Evaluable], + self.df.evaluable, # type: ignore[pandas] + ) + + ref_data: Sequence[batch_common.T_Ref] = cast( + # + Sequence[batch_common.T_Ref], + self.df.ref_data, # type: ignore[pandas] + ) + + @core_utils.exception_to_err_with_traceback_async + async def _run(): + return await metric.evaluation_fn(evaluable, ref_data) + + def _make( + values_ok: list[common.T_MetricValue], + ) -> Result[ + ResultTable[common.T_Evaluable, common.T_MetricValue], str + ]: + # safe annotation, we know what's in the df. + out: Result[ + ResultTable[common.T_Evaluable, common.T_MetricValue], str + ] = ResultTable.make(self.df, values_ok) + return out + + values = await _run() + out = values.and_then(_make) + return out + + +@dataclass(frozen=True) +class EvaluableTableWithoutRef(Generic[common.T_Evaluable]): + df: pd.DataFrame + + @staticmethod + def make( + # At this point, I don't care about the type of input_data. It's display-only now. + input_data: Sequence[Any] | None, + evaluable: Sequence[common.T_Evaluable], + ) -> Result["EvaluableTableWithoutRef[common.T_Evaluable]", str]: + # make_df is untyped, but it's safe to cast it here because the types are annotated in this function signature. + # We can clearly see here that the output df types will match the input types, so it's safe to cast the output. + df = common.make_df({"input_data": input_data, "evaluable": evaluable}) + out: Result[EvaluableTableWithoutRef[common.T_Evaluable], str] = cast( + # + Result[EvaluableTableWithoutRef[common.T_Evaluable], str], + df.map(EvaluableTableWithoutRef), + ) + return out + + async def calculate( + self, + metric: batch_metrics.BatchMetricWithoutReference[ + common.T_Evaluable, common.T_MetricValue + ], + ) -> Result["ResultTable[common.T_Evaluable, common.T_MetricValue]", str]: + evaluable: Sequence[common.T_Evaluable] = cast( + # + Sequence[common.T_Evaluable], + self.df.evaluable, # type: ignore[pandas] + ) + + @core_utils.exception_to_err_with_traceback_async + async def _run(): + return await metric.evaluation_fn(evaluable) + + def _make( + values_ok: list[common.T_MetricValue], + ) -> Result[ + ResultTable[common.T_Evaluable, common.T_MetricValue], str + ]: + # safe annotation, we know what's in the df. 
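# Illustrative sketch, not part of the patch: `metric.evaluation_fn` in this method is
# a batch function. It receives the whole evaluable column at once and returns one value
# per row, unlike the per-datum test-suite EvaluationFunction. A minimal conforming
# function and metric (hypothetical names):
#
#     async def char_count_batch(data: Sequence[str]) -> list[int]:
#         # One metric value per evaluable, in order.
#         return [len(x) for x in data]
#
#     metric = batch_metrics.BatchMetricWithoutReference(
#         evaluation_fn=char_count_batch,
#         metric_metadata=common.EvaluationMetricMetadata(
#             name="char_count",
#             description="Absolute text length",
#         ),
#     )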
+ out: Result[ + ResultTable[common.T_Evaluable, common.T_MetricValue], str + ] = ResultTable.make(self.df, values_ok) + return out + + values = await _run() + out = values.and_then(_make) + return out + + +@dataclass(frozen=True) +class ResultTable(Generic[common.T_Evaluable, common.T_MetricValue]): + df: pd.DataFrame + + @staticmethod + def make( + df_evaluable: pd.DataFrame, + metric_values: Sequence[common.T_MetricValue], + ) -> Result["ResultTable[common.T_Evaluable, common.T_MetricValue]", str]: + if len(df_evaluable) != len(metric_values): + return result.Err( + f"len(df_evaluable) != len(metric_values): {len(df_evaluable)} != {len(metric_values)}" + ) + else: + return result.Ok(ResultTable(df_evaluable.assign(metric_values=metric_values))) # type: ignore[pandas] + + @staticmethod + def concatenate_tables( + tables: Sequence[ + "ResultTable[common.T_Evaluable, common.T_MetricValue]" + ], + ) -> Result["ResultTable[common.T_Evaluable, common.T_MetricValue]", str]: + dfs = [table.df for table in tables if len(table.df) > 0] + df = pd.concat(dfs) # type: ignore[pandas] + return result.Ok(ResultTable(df)) + + +# API + + +async def run_evaluation( + # + evaluable: Sequence[str], + reference: Sequence[str] | None, + metrics: batch_metrics.BatchMetrics[str, str, common.T_MetricValue], + settings: BatchEvalGeneralSettings | None = None, +) -> pd.DataFrame: + settings_ = settings or BatchEvalGeneralSettings() + res_table = await _evaluable_to_result_table( + None, evaluable, reference, metrics, settings_ + ) + return res_table.map(_process_result_table_to_df).unwrap_or_raise( + ValueError + ) + + +async def _evaluable_to_result_table( + # Intentional any. Inputs is display-only + inputs: Sequence[Any] | None, + evaluable: Sequence[str], + reference: Sequence[str] | None, + metrics: batch_metrics.BatchMetrics[str, str, common.T_MetricValue], + settings: BatchEvalGeneralSettings, +): + match metrics: + case batch_metrics.BatchMetricsWithReference(metrics=metrics_): + if not reference: + raise ValueError( + "got BatchMetricsWithReference, reference cannot be None" + ) + else: + table = EvaluableTableWithReference.make( + inputs, evaluable, reference + ) + res = await result.do_async( + # + await _run_evaluation_helper_with_ref( + table_ok, metrics_, settings + ) + for table_ok in table + ) + return res + case batch_metrics.BatchMetricsWithoutReference(metrics=metrics_): + if reference: + raise ValueError( + "got BatchMetricsWithoutReference, reference must be None" + ) + else: + table = EvaluableTableWithoutRef.make(inputs, evaluable) + res = await result.do_async( + # + await _run_evaluation_helper_without_ref( + table_ok, metrics_, settings + ) + for table_ok in table + ) + return res + + +async def run_aiconfig_and_evaluation( + # + aiconfig_path: str, + prompt_name: str, + aiconfig_params: Sequence[common.TextBasedInputDatum], + reference: Sequence[str] | None, + metrics: batch_metrics.BatchMetrics[str, str, common.T_MetricValue], + settings: BatchEvalGeneralSettings | None = None, +) -> pd.DataFrame: + settings_ = settings or BatchEvalGeneralSettings() + evaluable = await _run_aiconfig_batch_helper( + aiconfig_path, prompt_name, aiconfig_params + ) + + res_table = await result.do_async( + await _evaluable_to_result_table( + aiconfig_params, evaluable_ok, reference, metrics, settings_ + ) + for evaluable_ok in evaluable + ) + return res_table.map(_process_result_table_to_df).unwrap_or_raise( + ValueError + ) + + +# Implementation + + +async def _run_aiconfig_batch_helper( + # + 
aiconfig_path: str, + prompt_name: str, + params_seq: Sequence[common.TextBasedInputDatum], +) -> result.Result[list[common.TextOutput], str]: + aiconfig = common.load_aiconfig_runtime(aiconfig_path) + + out = await result.do_async( + await common.batch_run_aiconfig_on_text_based_input( + # + aiconfig_ok, + prompt_name, + params_seq, + ) + for aiconfig_ok in aiconfig + ) + return out + + +async def _run_evaluation_helper_with_ref( + evaluable_with_ref: EvaluableTableWithReference[ + common.T_Evaluable, batch_common.T_Ref + ], + metrics: Sequence[ + batch_metrics.BatchMetricWithReference[ + common.T_Evaluable, batch_common.T_Ref, common.T_MetricValue + ] + ], + settings: BatchEvalGeneralSettings, +) -> result.Result[ResultTable[common.T_Evaluable, common.T_MetricValue], str]: + timeout_s = settings.eval_fn_timeout_s + + async def _calculate( + metric: batch_metrics.BatchMetricWithReference[ + common.T_Evaluable, batch_common.T_Ref, common.T_MetricValue + ], + ): + async def _thunk() -> Result[ + ResultTable[common.T_Evaluable, common.T_MetricValue], str + ]: + return await evaluable_with_ref.calculate(metric) + + values = await async_thunk_with_timeout(_thunk(), timeout_s=timeout_s) + return values + + res = await core_utils.result_reduce_list_all_ok_async( + map( + partial(_calculate), + metrics, + ) + ) + + match res: + case result.Ok(res_): + list_results = core_utils.result_reduce_list_all_ok(res_) + match list_results: + case result.Ok(list_results_ok): + all_results = ResultTable.concatenate_tables( + list_results_ok + ) + return all_results + case result.Err(err): + return result.Err(err) + case result.Err(err): + return result.Err(err) + + +async def _run_evaluation_helper_without_ref( + evaluable_without_ref: EvaluableTableWithoutRef[common.T_Evaluable], + metrics: Sequence[ + batch_metrics.BatchMetricWithoutReference[ + common.T_Evaluable, common.T_MetricValue + ] + ], + settings: BatchEvalGeneralSettings, +) -> result.Result[ResultTable[common.T_Evaluable, common.T_MetricValue], str]: + timeout_s = settings.eval_fn_timeout_s + + async def _calculate( + metric: batch_metrics.BatchMetricWithoutReference[ + common.T_Evaluable, common.T_MetricValue + ], + ): + async def _thunk() -> Result[ + ResultTable[common.T_Evaluable, common.T_MetricValue], str + ]: + return await evaluable_without_ref.calculate(metric) + + values = await async_thunk_with_timeout(_thunk(), timeout_s=timeout_s) + return values + + res = await core_utils.result_reduce_list_all_ok_async( + map( + partial(_calculate), + metrics, + ) + ) + + match res: + case result.Ok(res_): + list_results = core_utils.result_reduce_list_all_ok(res_) + match list_results: + case result.Ok(list_results_ok): + all_results = ResultTable.concatenate_tables( + list_results_ok + ) + return all_results + case result.Err(err): + return result.Err(err) + case result.Err(err): + return result.Err(err) + + +async def async_thunk_with_timeout( + thunk: Coroutine[None, None, common.T_cov], timeout_s: int +) -> result.Result[common.T_cov, str]: + task = asyncio.create_task(thunk) + try: + res = await asyncio.wait_for(task, timeout=timeout_s) + return result.Ok(res) + except asyncio.TimeoutError: + task.cancel() + return result.Err( + f"async_thunk_with_timeout, {thunk.__name__} timed out after {timeout_s}s" + ) + + +def _process_result_table_to_df(eval_res: ResultTable[common.T_Evaluable, common.T_MetricValue]) -> pd.DataFrame: # type: ignore[pandas untyped] + raise NotImplementedError diff --git a/python/src/aiconfig/eval/batch_metrics.py 
b/python/src/aiconfig/eval/batch_metrics.py new file mode 100644 index 000000000..a8367005b --- /dev/null +++ b/python/src/aiconfig/eval/batch_metrics.py @@ -0,0 +1,80 @@ +from dataclasses import dataclass +from typing import Generic, Sequence + +from aiconfig.eval import batch_common, common + + +@dataclass(frozen=True) +class BatchMetricWithReference( + Generic[common.T_Evaluable, batch_common.T_Ref, common.T_MetricValue] +): + """See metrics.py for examples.""" + + evaluation_fn: batch_common.BatchEvaluationFunctionWithReference[ + common.T_Evaluable, batch_common.T_Ref, common.T_MetricValue + ] + metric_metadata: common.EvaluationMetricMetadata[ + common.T_Evaluable, common.T_MetricValue + ] + + async def __call__( + self, + data: Sequence[common.T_Evaluable], + ref: Sequence[batch_common.T_Ref], + ) -> list[common.T_MetricValue]: + """ + For convenience, make a Metric callable. + Similar to torch Module `forward()`. + """ + return await self.evaluation_fn(data, ref) + + +@dataclass(frozen=True) +class BatchMetricWithoutReference( + Generic[common.T_Evaluable, common.T_MetricValue] +): + """See metrics.py for examples.""" + + evaluation_fn: batch_common.BatchEvaluationFunctionWithoutReference[ + common.T_Evaluable, common.T_MetricValue + ] + metric_metadata: common.EvaluationMetricMetadata[ + common.T_Evaluable, common.T_MetricValue + ] + + async def __call__( + self, data: Sequence[common.T_Evaluable] + ) -> list[common.T_MetricValue]: + """ + For convenience, make a Metric callable. + Similar to torch Module `forward()`. + """ + return await self.evaluation_fn(data) + + +@dataclass +class BatchMetricsWithReference( + Generic[common.T_Evaluable, batch_common.T_Ref, common.T_MetricValue] +): + metrics: Sequence[ + BatchMetricWithReference[ + common.T_Evaluable, batch_common.T_Ref, common.T_MetricValue + ] + ] + + +@dataclass +class BatchMetricsWithoutReference( + Generic[common.T_Evaluable, common.T_MetricValue] +): + metrics: Sequence[ + BatchMetricWithoutReference[common.T_Evaluable, common.T_MetricValue] + ] + + +BatchMetrics = ( + BatchMetricsWithReference[ + common.T_Evaluable, batch_common.T_Ref, common.T_MetricValue + ] + | BatchMetricsWithoutReference[common.T_Evaluable, common.T_MetricValue] +) diff --git a/python/src/aiconfig/eval/common.py b/python/src/aiconfig/eval/common.py index 666f8b783..15eb16bea 100644 --- a/python/src/aiconfig/eval/common.py +++ b/python/src/aiconfig/eval/common.py @@ -1,22 +1,25 @@ import json -from abc import ABC, abstractmethod +from abc import ABC from dataclasses import dataclass -from typing import Any, Generic, NewType, Protocol, Type, TypeVar +from typing import Any, Generic, NewType, Sequence, TypeVar import lastmile_utils.lib.core.api as core_utils -import result +import pandas as pd from aiconfig.Config import AIConfigRuntime -from pydantic import BaseModel +from aiconfig.eval import common +from frozendict import frozendict from result import Result +T_cov = TypeVar("T_cov", covariant=True) +U_cov = TypeVar("U_cov", covariant=True) + T_InputDatum = TypeVar("T_InputDatum", contravariant=True) T_OutputDatum = TypeVar("T_OutputDatum", contravariant=True) -T_Evaluable = TypeVar("T_Evaluable", contravariant=True) - -T_BaseModel = TypeVar("T_BaseModel", bound=BaseModel) -SerializedJSON = NewType("SerializedJSON", str) +# NOTE: it's probably better to avoid NewType in the future, because it doesn't +# ... create a ... new type. For example, you can't pattern match against it. 
+TextOutput = NewType("TextOutput", str) @dataclass(frozen=True) @@ -33,26 +36,15 @@ class CustomMetricValue(ABC): """ +T_Evaluable = TypeVar("T_Evaluable", contravariant=True) + + T_MetricValue = TypeVar( "T_MetricValue", int, float, str, bool, CustomMetricValue, covariant=True ) - - -class CompletionTextToSerializedJSON(Protocol): - @abstractmethod - def __call__(self, output_datum: str) -> Result[SerializedJSON, str]: - pass - - -@dataclass(frozen=True) -class CustomMetricPydanticObject(CustomMetricValue, Generic[T_BaseModel]): - data: T_BaseModel - - -class EvaluationFunction(Protocol, Generic[T_Evaluable, T_MetricValue]): - @abstractmethod - async def __call__(self, datum: T_Evaluable) -> T_MetricValue: - pass +T_MetricValue_inv = TypeVar( + "T_MetricValue_inv", int, float, str, bool, CustomMetricValue +) class EvaluationMetricMetadata( @@ -63,7 +55,7 @@ class EvaluationMetricMetadata( to ensure that numbers are interpreted as intended. - Assumptions: + Assumptions:t * If the best and worst values are not None, then the metric is assumed to be ordered. In this case (if the metric is ordered) then the comparison operators <, <=, >, and >= must be implemented (see CustomMetricValue). @@ -95,8 +87,8 @@ def _serialize_extra_metadata(self) -> str: name: str description: str - best_value: T_MetricValue | None = None - worst_value: T_MetricValue | None = None + best_value: common.T_MetricValue | None = None + worst_value: common.T_MetricValue | None = None # e.g. {"substring": "hello", "case_sensitive": False} extra_metadata: dict[str, Any] = {} @@ -108,74 +100,13 @@ def __repr__(self) -> str: @dataclass(frozen=True) -class SampleMetricValue(Generic[T_Evaluable, T_MetricValue]): - # `None` is used to signal that there was an error during calculation. - # In this case, error information is written to stderr (see lib.py:_evaluate_for_sample()). - value: T_MetricValue | None - metric_metadata: EvaluationMetricMetadata[T_Evaluable, T_MetricValue] - - def __post_init__(self) -> None: - metric_metadata = self.metric_metadata - worst_value, best_value = ( - metric_metadata.worst_value, - metric_metadata.best_value, - ) - value = self.value - if worst_value is None and best_value is None: - # fine - return - elif worst_value is None or best_value is None: - raise ValueError( - f""" - [{metric_metadata.name}] - {metric_metadata.description} - - You must define both worst_value and best_value, or neither. - You defined worst_value = {worst_value} and best_value = {best_value}. - """ - ) - elif worst_value == best_value: - raise ValueError("best_value and worst_value cannot be equal") - elif value is not None and worst_value < best_value and not worst_value <= value <= best_value: # type: ignore[fixme] - raise ValueError( - f""" - [{metric_metadata.name}] - {metric_metadata.description} - - Value {value} is not in range [{worst_value}, {best_value}]. - You defined worst_value = {worst_value} and best_value = {best_value}, - but got value outside that range. - """ - ) - elif value is not None and worst_value > best_value and not worst_value >= value >= best_value: # type: ignore[fixme] - raise ValueError( - f""" - [{metric_metadata.name}] - {metric_metadata.description} - - Value {value} is not in range [{worst_value}, {best_value}]. - You defined worst_value = {worst_value} and best_value = {best_value}, - but got value outside that range. 
- """ - ) - +class TextBasedInputDatum: + value: str | frozendict[str, str] -class TextRatingsData(core_utils.Record): - conciseness_rating: int - conciseness_confidence: float - conciseness_reasoning: str - -def get_llm_structured_response( - input_text: str, - chat_completion_create: CompletionTextToSerializedJSON, - basemodel_type: Type[T_BaseModel], -) -> Result[T_BaseModel, str]: - return result.do( - core_utils.safe_model_validate_json(response_ok, basemodel_type) - # get the serialized JSON response - for response_ok in chat_completion_create(input_text) - ) +@core_utils.exception_to_err_with_traceback +def load_aiconfig_runtime(aiconfig_path: str) -> AIConfigRuntime: + return AIConfigRuntime.load(aiconfig_path) @core_utils.exception_to_err_with_traceback_async @@ -186,3 +117,53 @@ async def run_aiconfig_get_output_text( run_with_dependencies: bool, ): return await aiconfig.run_and_get_output_text(prompt_name, params, run_with_dependencies=run_with_dependencies) # type: ignore + + +async def run_aiconfig_on_text_based_input( + runtime: AIConfigRuntime, + prompt_name: str, + params: common.TextBasedInputDatum, +) -> Result[str, str]: + def _get_params_for_aiconfig( + params: common.TextBasedInputDatum, + ) -> dict[str, str]: + match params.value: + case str(input_text): + return {"the_query": input_text} + case frozendict(): + return dict(params.value) + + params_for_aiconfig = _get_params_for_aiconfig(params) + return await run_aiconfig_get_output_text( + runtime, prompt_name, params_for_aiconfig, run_with_dependencies=True + ) + + +async def batch_run_aiconfig_on_text_based_input( + aiconfig: AIConfigRuntime, + prompt_name: str, + params_seq: Sequence[common.TextBasedInputDatum], +) -> Result[list[TextOutput], str]: + async def _run( + input_datum: common.TextBasedInputDatum, + ) -> Result[TextOutput, str]: + return ( + await run_aiconfig_on_text_based_input( + aiconfig, prompt_name, input_datum + ) + ).map(TextOutput) + + # TODO: fix the race condition and then use gather + # https://github.com/lastmile-ai/aiconfig/issues/434 + res_outputs_: list[Result[TextOutput, str]] = [] + for input_datum in params_seq: + res_outputs_.append(await _run(input_datum)) + res_outputs = core_utils.result_reduce_list_all_ok(res_outputs_) + # res_outputs = await core_utils.result_reduce_list_all_ok_async(list(map(_run, all_inputs))) + + return res_outputs + + +@core_utils.exception_to_err_with_traceback +def make_df(data: Any) -> pd.DataFrame: + return pd.DataFrame(data) diff --git a/python/src/aiconfig/eval/openai.py b/python/src/aiconfig/eval/openai.py index 9d5adc3fc..1caa3da75 100644 --- a/python/src/aiconfig/eval/openai.py +++ b/python/src/aiconfig/eval/openai.py @@ -5,7 +5,7 @@ import lastmile_utils.lib.core.api as core_utils import openai import openai.types.chat as openai_types -from aiconfig.eval import common +from aiconfig.eval import test_suite_common from result import Err, Ok, Result @@ -44,13 +44,17 @@ def default_openai_chat_completion_create( def extract_json_from_chat_completion( chat_completion: openai_types.ChatCompletion, -) -> Result[common.SerializedJSON, str]: +) -> Result[test_suite_common.SerializedJSON, str]: choice = chat_completion.choices[0] message = choice.message if message.tool_calls is None: return Err("No tool calls found") - return Ok(common.SerializedJSON(message.tool_calls[0].function.arguments)) + return Ok( + test_suite_common.SerializedJSON( + message.tool_calls[0].function.arguments + ) + ) def make_fn_completion_text_to_serialized_json( @@ -58,10 +62,10 @@ 
def make_fn_completion_text_to_serialized_json( properties: dict[str, dict[str, str]], required: list[str], openai_chat_completion_create: OpenAIChatCompletionCreate, -) -> common.CompletionTextToSerializedJSON: +) -> test_suite_common.CompletionTextToSerializedJSON: def _chat_completion_create( output_datum: str, - ) -> Result[common.SerializedJSON, str]: + ) -> Result[test_suite_common.SerializedJSON, str]: openai_chat_completion_params = _make_openai_completion_params( output_datum, eval_llm_name, properties, required ) @@ -69,7 +73,9 @@ def _chat_completion_create( openai_chat_completion_params ).and_then(extract_json_from_chat_completion) - out: common.CompletionTextToSerializedJSON = _chat_completion_create + out: test_suite_common.CompletionTextToSerializedJSON = ( + _chat_completion_create + ) return out diff --git a/python/src/aiconfig/eval/test_suite_common.py b/python/src/aiconfig/eval/test_suite_common.py new file mode 100644 index 000000000..0bce3fd69 --- /dev/null +++ b/python/src/aiconfig/eval/test_suite_common.py @@ -0,0 +1,109 @@ +from abc import abstractmethod +from dataclasses import dataclass +from typing import Generic, NewType, Protocol, Type, TypeVar + +import lastmile_utils.lib.core.api as core_utils +import result +from aiconfig.eval import common +from pydantic import BaseModel +from result import Result + +T_BaseModel = TypeVar("T_BaseModel", bound=BaseModel) + +SerializedJSON = NewType("SerializedJSON", str) + + +class CompletionTextToSerializedJSON(Protocol): + @abstractmethod + def __call__(self, output_datum: str) -> Result[SerializedJSON, str]: + pass + + +@dataclass(frozen=True) +class CustomMetricPydanticObject( + common.CustomMetricValue, Generic[T_BaseModel] +): + data: T_BaseModel + + +class EvaluationFunction( + Protocol, Generic[common.T_Evaluable, common.T_MetricValue] +): + @abstractmethod + async def __call__( + self, datum: common.T_Evaluable + ) -> common.T_MetricValue: + pass + + +@dataclass(frozen=True) +class SampleMetricValue(Generic[common.T_Evaluable, common.T_MetricValue]): + # `None` is used to signal that there was an error during calculation. + # In this case, error information is written to stderr (see lib.py:_evaluate_for_sample()). + value: common.T_MetricValue | None + metric_metadata: common.EvaluationMetricMetadata[ + common.T_Evaluable, common.T_MetricValue + ] + + def __post_init__(self) -> None: + metric_metadata = self.metric_metadata + worst_value, best_value = ( + metric_metadata.worst_value, + metric_metadata.best_value, + ) + value = self.value + if worst_value is None and best_value is None: + # fine + return + elif worst_value is None or best_value is None: + raise ValueError( + f""" + [{metric_metadata.name}] + {metric_metadata.description} + + You must define both worst_value and best_value, or neither. + You defined worst_value = {worst_value} and best_value = {best_value}. + """ + ) + elif worst_value == best_value: + raise ValueError("best_value and worst_value cannot be equal") + elif value is not None and worst_value < best_value and not worst_value <= value <= best_value: # type: ignore[fixme] + raise ValueError( + f""" + [{metric_metadata.name}] + {metric_metadata.description} + + Value {value} is not in range [{worst_value}, {best_value}]. + You defined worst_value = {worst_value} and best_value = {best_value}, + but got value outside that range. 
+ """ + ) + elif value is not None and worst_value > best_value and not worst_value >= value >= best_value: # type: ignore[fixme] + raise ValueError( + f""" + [{metric_metadata.name}] + {metric_metadata.description} + + Value {value} is not in range [{worst_value}, {best_value}]. + You defined worst_value = {worst_value} and best_value = {best_value}, + but got value outside that range. + """ + ) + + +class TextRatingsData(core_utils.Record): + conciseness_rating: int + conciseness_confidence: float + conciseness_reasoning: str + + +def get_llm_structured_response( + input_text: str, + chat_completion_create: CompletionTextToSerializedJSON, + basemodel_type: Type[T_BaseModel], +) -> Result[T_BaseModel, str]: + return result.do( + core_utils.safe_model_validate_json(response_ok, basemodel_type) + # get the serialized JSON response + for response_ok in chat_completion_create(input_text) + ) diff --git a/python/src/aiconfig/eval/examples/travel/travel_aiconfig_test_suite_settings.json b/python/src/aiconfig/eval/test_suite_examples/travel/travel_aiconfig_test_suite_settings.json similarity index 100% rename from python/src/aiconfig/eval/examples/travel/travel_aiconfig_test_suite_settings.json rename to python/src/aiconfig/eval/test_suite_examples/travel/travel_aiconfig_test_suite_settings.json diff --git a/python/src/aiconfig/eval/examples/travel/travel_eval.ipynb b/python/src/aiconfig/eval/test_suite_examples/travel/travel_eval.ipynb similarity index 78% rename from python/src/aiconfig/eval/examples/travel/travel_eval.ipynb rename to python/src/aiconfig/eval/test_suite_examples/travel/travel_eval.ipynb index 92d936fd4..846399f86 100644 --- a/python/src/aiconfig/eval/examples/travel/travel_eval.ipynb +++ b/python/src/aiconfig/eval/test_suite_examples/travel/travel_eval.ipynb @@ -39,7 +39,7 @@ "text": [ "\u001b[33mWARNING: Ignoring invalid distribution -etuptools (/opt/homebrew/Caskroom/miniconda/base/envs/aiconfig/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0mCollecting lastmile-utils\n", - " Using cached lastmile_utils-0.0.13-py3-none-any.whl.metadata (901 bytes)\n", + " Using cached lastmile_utils-0.0.21-py3-none-any.whl.metadata (901 bytes)\n", "Collecting black==23.11.0 (from lastmile-utils)\n", " Using cached black-23.11.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (66 kB)\n", "Collecting chardet==5.2.0 (from lastmile-utils)\n", @@ -62,12 +62,12 @@ " Using cached pytest-7.4.3-py3-none-any.whl.metadata (7.9 kB)\n", "Collecting python-dotenv==1.0.0 (from lastmile-utils)\n", " Using cached python_dotenv-1.0.0-py3-none-any.whl (19 kB)\n", - "Collecting result==0.15.0 (from lastmile-utils)\n", - " Using cached result-0.15.0-py3-none-any.whl.metadata (12 kB)\n", + "Collecting result==0.16.0 (from lastmile-utils)\n", + " Using cached result-0.16.0-py3-none-any.whl.metadata (857 bytes)\n", "Collecting autoflake==2.2.1 (from lastmile-utils)\n", " Using cached autoflake-2.2.1-py3-none-any.whl.metadata (7.3 kB)\n", "Collecting pyflakes>=3.0.0 (from autoflake==2.2.1->lastmile-utils)\n", - " Using cached pyflakes-3.1.0-py2.py3-none-any.whl.metadata (3.5 kB)\n", + " Downloading pyflakes-3.2.0-py2.py3-none-any.whl.metadata (3.5 kB)\n", "Collecting tomli>=2.0.1 (from autoflake==2.2.1->lastmile-utils)\n", " Using cached tomli-2.0.1-py3-none-any.whl (12 kB)\n", "Collecting click>=8.0.0 (from black==23.11.0->lastmile-utils)\n", @@ -86,16 +86,18 @@ " Using cached mccabe-0.7.0-py2.py3-none-any.whl (7.3 kB)\n", "Collecting pycodestyle<2.12.0,>=2.11.0 (from flake8==6.1.0->lastmile-utils)\n", " 
Using cached pycodestyle-2.11.1-py2.py3-none-any.whl.metadata (4.5 kB)\n", + "Collecting pyflakes>=3.0.0 (from autoflake==2.2.1->lastmile-utils)\n", + " Using cached pyflakes-3.1.0-py2.py3-none-any.whl.metadata (3.5 kB)\n", "Collecting json-spec (from jsoncomment==0.4.2->lastmile-utils)\n", " Using cached json_spec-0.11.0-py3-none-any.whl (41 kB)\n", "Collecting numpy<2,>=1.22.4 (from pandas==2.1.2->lastmile-utils)\n", - " Using cached numpy-1.26.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (61 kB)\n", + " Using cached numpy-1.26.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (61 kB)\n", "Collecting python-dateutil>=2.8.2 (from pandas==2.1.2->lastmile-utils)\n", " Using cached python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n", "Collecting pytz>=2020.1 (from pandas==2.1.2->lastmile-utils)\n", " Using cached pytz-2023.3.post1-py2.py3-none-any.whl.metadata (22 kB)\n", "Collecting tzdata>=2022.1 (from pandas==2.1.2->lastmile-utils)\n", - " Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)\n", + " Using cached tzdata-2023.4-py2.py3-none-any.whl.metadata (1.4 kB)\n", "Collecting annotated-types>=0.4.0 (from pydantic==2.4.2->lastmile-utils)\n", " Using cached annotated_types-0.6.0-py3-none-any.whl.metadata (12 kB)\n", "Collecting pydantic-core==2.10.1 (from pydantic==2.4.2->lastmile-utils)\n", @@ -115,14 +117,14 @@ "Collecting exceptiongroup>=1.0.0rc8 (from pytest==7.4.3->lastmile-utils)\n", " Using cached exceptiongroup-1.2.0-py3-none-any.whl.metadata (6.6 kB)\n", "Collecting setuptools (from nodeenv>=1.6.0->pyright==1.1.335->lastmile-utils)\n", - " Using cached setuptools-69.0.2-py3-none-any.whl.metadata (6.3 kB)\n", + " Using cached setuptools-69.0.3-py3-none-any.whl.metadata (6.3 kB)\n", "Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas==2.1.2->lastmile-utils)\n", " Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)\n", "Collecting importlib-metadata<6.0.0,>=5.0.0 (from json-spec->jsoncomment==0.4.2->lastmile-utils)\n", " Using cached importlib_metadata-5.2.0-py3-none-any.whl (21 kB)\n", "Collecting zipp>=0.5 (from importlib-metadata<6.0.0,>=5.0.0->json-spec->jsoncomment==0.4.2->lastmile-utils)\n", " Using cached zipp-3.17.0-py3-none-any.whl.metadata (3.7 kB)\n", - "Using cached lastmile_utils-0.0.13-py3-none-any.whl (14 kB)\n", + "Using cached lastmile_utils-0.0.21-py3-none-any.whl (15 kB)\n", "Using cached autoflake-2.2.1-py3-none-any.whl (32 kB)\n", "Using cached black-23.11.0-cp310-cp310-macosx_11_0_arm64.whl (1.4 MB)\n", "Using cached chardet-5.2.0-py3-none-any.whl (199 kB)\n", @@ -132,7 +134,7 @@ "Using cached pylint-3.0.2-py3-none-any.whl (510 kB)\n", "Using cached pyright-1.1.335-py3-none-any.whl (17 kB)\n", "Using cached pytest-7.4.3-py3-none-any.whl (325 kB)\n", - "Using cached result-0.15.0-py3-none-any.whl (10 kB)\n", + "Using cached result-0.16.0-py3-none-any.whl (6.8 kB)\n", "Using cached pydantic_core-2.10.1-cp310-cp310-macosx_11_0_arm64.whl (1.7 MB)\n", "Using cached annotated_types-0.6.0-py3-none-any.whl (12 kB)\n", "Using cached astroid-3.0.2-py3-none-any.whl (275 kB)\n", @@ -140,7 +142,7 @@ "Using cached dill-0.3.7-py3-none-any.whl (115 kB)\n", "Using cached exceptiongroup-1.2.0-py3-none-any.whl (16 kB)\n", "Using cached nodeenv-1.8.0-py2.py3-none-any.whl (22 kB)\n", - "Using cached numpy-1.26.2-cp310-cp310-macosx_11_0_arm64.whl (14.0 MB)\n", + "Using cached numpy-1.26.3-cp310-cp310-macosx_11_0_arm64.whl (14.0 MB)\n", "Using cached packaging-23.2-py3-none-any.whl (53 kB)\n", "Using cached pathspec-0.12.1-py3-none-any.whl (31 kB)\n", "Using cached 
platformdirs-4.1.0-py3-none-any.whl (17 kB)\n", @@ -150,7 +152,8 @@ "Using cached pytz-2023.3.post1-py2.py3-none-any.whl (502 kB)\n", "Using cached tomlkit-0.12.3-py3-none-any.whl (37 kB)\n", "Using cached typing_extensions-4.9.0-py3-none-any.whl (32 kB)\n", - "Using cached setuptools-69.0.2-py3-none-any.whl (819 kB)\n", + "Using cached tzdata-2023.4-py2.py3-none-any.whl (346 kB)\n", + "Using cached setuptools-69.0.3-py3-none-any.whl (819 kB)\n", "Using cached zipp-3.17.0-py3-none-any.whl (7.4 kB)\n", "\u001b[33mWARNING: Ignoring invalid distribution -etuptools (/opt/homebrew/Caskroom/miniconda/base/envs/aiconfig/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0mInstalling collected packages: pytz, zipp, tzdata, typing-extensions, tomlkit, tomli, six, setuptools, result, python-dotenv, pyflakes, pycodestyle, pluggy, platformdirs, pathspec, packaging, numpy, mypy-extensions, mccabe, isort, iniconfig, exceptiongroup, dill, click, chardet, annotated-types, python-dateutil, pytest, pydantic-core, nodeenv, importlib-metadata, flake8, black, autoflake, astroid, pyright, pylint, pydantic, pandas, json-spec, jsoncomment, lastmile-utils\n", @@ -163,9 +166,9 @@ " Uninstalling zipp-3.17.0:\n", " Successfully uninstalled zipp-3.17.0\n", " Attempting uninstall: tzdata\n", - " Found existing installation: tzdata 2023.3\n", - " Uninstalling tzdata-2023.3:\n", - " Successfully uninstalled tzdata-2023.3\n", + " Found existing installation: tzdata 2023.4\n", + " Uninstalling tzdata-2023.4:\n", + " Successfully uninstalled tzdata-2023.4\n", " Attempting uninstall: typing-extensions\n", " Found existing installation: typing_extensions 4.9.0\n", " Uninstalling typing_extensions-4.9.0:\n", @@ -183,13 +186,13 @@ " Uninstalling six-1.16.0:\n", " Successfully uninstalled six-1.16.0\n", " Attempting uninstall: setuptools\n", - " Found existing installation: setuptools 69.0.2\n", - " Uninstalling setuptools-69.0.2:\n", - " Successfully uninstalled setuptools-69.0.2\n", + " Found existing installation: setuptools 69.0.3\n", + " Uninstalling setuptools-69.0.3:\n", + " Successfully uninstalled setuptools-69.0.3\n", " Attempting uninstall: result\n", - " Found existing installation: result 0.15.0\n", - " Uninstalling result-0.15.0:\n", - " Successfully uninstalled result-0.15.0\n", + " Found existing installation: result 0.16.0\n", + " Uninstalling result-0.16.0:\n", + " Successfully uninstalled result-0.16.0\n", " Attempting uninstall: python-dotenv\n", " Found existing installation: python-dotenv 1.0.0\n", " Uninstalling python-dotenv-1.0.0:\n", @@ -219,9 +222,9 @@ " Uninstalling packaging-23.2:\n", " Successfully uninstalled packaging-23.2\n", " Attempting uninstall: numpy\n", - " Found existing installation: numpy 1.26.2\n", - " Uninstalling numpy-1.26.2:\n", - " Successfully uninstalled numpy-1.26.2\n", + " Found existing installation: numpy 1.26.3\n", + " Uninstalling numpy-1.26.3:\n", + " Successfully uninstalled numpy-1.26.3\n", " Attempting uninstall: mypy-extensions\n", " Found existing installation: mypy-extensions 1.0.0\n", " Uninstalling mypy-extensions-1.0.0:\n", @@ -319,14 +322,16 @@ " Uninstalling jsoncomment-0.4.2:\n", " Successfully uninstalled jsoncomment-0.4.2\n", " Attempting uninstall: lastmile-utils\n", - " Found existing installation: lastmile-utils 0.0.13\n", - " Uninstalling lastmile-utils-0.0.13:\n", - " Successfully uninstalled lastmile-utils-0.0.13\n", + " Found existing installation: lastmile_utils 0.0.21\n", + " Uninstalling lastmile_utils-0.0.21:\n", + " Successfully 
uninstalled lastmile_utils-0.0.21\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "python-aiconfig 1.1.7 requires lastmile-utils==0.0.10, but you have lastmile-utils 0.0.13 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed annotated-types-0.6.0 astroid-3.0.2 autoflake-2.2.1 black-23.11.0 chardet-5.2.0 click-8.1.7 dill-0.3.7 exceptiongroup-1.2.0 flake8-6.1.0 importlib-metadata-5.2.0 iniconfig-2.0.0 isort-5.12.0 json-spec-0.11.0 jsoncomment-0.4.2 lastmile-utils-0.0.13 mccabe-0.7.0 mypy-extensions-1.0.0 nodeenv-1.8.0 numpy-1.26.2 packaging-23.2 pandas-2.1.2 pathspec-0.12.1 platformdirs-4.1.0 pluggy-1.3.0 pycodestyle-2.11.1 pydantic-2.4.2 pydantic-core-2.10.1 pyflakes-3.1.0 pylint-3.0.2 pyright-1.1.335 pytest-7.4.3 python-dateutil-2.8.2 python-dotenv-1.0.0 pytz-2023.3.post1 result-0.15.0 setuptools-69.0.2 six-1.16.0 tomli-2.0.1 tomlkit-0.12.3 typing-extensions-4.9.0 tzdata-2023.3 zipp-3.17.0\n", + "fastapi 0.105.0 requires anyio<4.0.0,>=3.7.1, but you have anyio 4.2.0 which is incompatible.\n", + "langchain 0.0.339 requires anyio<4.0, but you have anyio 4.2.0 which is incompatible.\n", + "python-aiconfig 1.1.9 requires lastmile-utils==0.0.20, but you have lastmile-utils 0.0.21 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed annotated-types-0.6.0 astroid-3.0.2 autoflake-2.2.1 black-23.11.0 chardet-5.2.0 click-8.1.7 dill-0.3.7 exceptiongroup-1.2.0 flake8-6.1.0 importlib-metadata-5.2.0 iniconfig-2.0.0 isort-5.12.0 json-spec-0.11.0 jsoncomment-0.4.2 lastmile-utils-0.0.21 mccabe-0.7.0 mypy-extensions-1.0.0 nodeenv-1.8.0 numpy-1.26.3 packaging-23.2 pandas-2.1.2 pathspec-0.12.1 platformdirs-4.1.0 pluggy-1.3.0 pycodestyle-2.11.1 pydantic-2.4.2 pydantic-core-2.10.1 pyflakes-3.1.0 pylint-3.0.2 pyright-1.1.335 pytest-7.4.3 python-dateutil-2.8.2 python-dotenv-1.0.0 pytz-2023.3.post1 result-0.16.0 setuptools-69.0.3 six-1.16.0 tomli-2.0.1 tomlkit-0.12.3 typing-extensions-4.9.0 tzdata-2023.4 zipp-3.17.0\n", "\u001b[33mWARNING: Ignoring invalid distribution -etuptools (/opt/homebrew/Caskroom/miniconda/base/envs/aiconfig/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n", - "\u001b[0mlastmile-utils 0.0.13\n" + "\u001b[0m" ] } ], @@ -401,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -413,21 +418,11 @@ " on our data along with some off-the-shelf metrics.\n", " \n" ] - }, - { - "data": { - "text/plain": [ - "3" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "from typing import Literal\n", - "from aiconfig.eval.api import common as common, metrics as metrics\n", + "from aiconfig.eval.api import test_suite_common as common, test_suite_metrics as metrics\n", "import lastmile_utils.lib.core.api as core_utils\n", "\n", "print(\n", @@ -438,11 +433,11 @@ ")\n", "\n", "# 1. 
Helper function to construct a Metric that counts a specific letter.\n", - "def make_letter_count_metric(letter_to_count: str) -> metrics.Metric[str, int]:\n", + "def make_letter_count_metric(letter_to_count: str) -> metrics.TestSuiteMetric[str, int]:\n", " async def letter_count_metric(datum: str):\n", " return datum.count(letter_to_count)\n", " \n", - " output_metric = metrics.Metric(\n", + " output_metric = metrics.TestSuiteMetric(\n", " evaluation_fn=letter_count_metric,\n", " metric_metadata=common.EvaluationMetricMetadata(\n", " name=\"letter_count\",\n", @@ -489,7 +484,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -558,7 +553,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -570,34 +565,38 @@ "Test input:\n", " different kinds of cuisines \n", "Function:\n", - " Metric(evaluation_fn=, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._construct..evaluation_fn at 0x2bc1e8940>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"brevity\",\n", " \"description\": \"Absolute text length\",\n", " \"best_value\": 1,\n", " \"worst_value\": 9223372036854775807,\n", - " \"extra_metadata\": {},\n", - " \"id\": \"24952ce05ce6dcbd370ccc3b39d410edeab8e1cf420130a83cf9388df6bcfdc3\"\n", + " \"extra_metadata\": {\n", + " \"args\": []\n", + " },\n", + " \"id\": \"5b29b6ba68aeeadc42b7333015f4b158f7514f68c05fef79a702e98cf9983085\"\n", "}))\n", "\n", "Test input:\n", " different kinds of cuisines \n", "Function:\n", - " Metric(evaluation_fn=._fn at 0x1428140d0>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._construct..evaluation_fn at 0x2bc355360>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"substring_match\",\n", " \"description\": \"True (pass) if contains given substring\",\n", " \"best_value\": true,\n", " \"worst_value\": false,\n", " \"extra_metadata\": {\n", - " \"substring\": \"Magnolia Bakery\",\n", + " \"args\": [\n", + " \"Magnolia Bakery\"\n", + " ],\n", " \"case_sensitive\": false\n", " },\n", - " \"id\": \"0c461362f44884023dda5537ce88263ba20d555562bac8abc05bcde0ce1aacf6\"\n", + " \"id\": \"12b2b88421a53f87fa1502c48a3bfa8b84aa22af3528178f0ec8d699db041d8d\"\n", "}))\n", "\n", "Test input:\n", " different kinds of cuisines \n", "Function:\n", - " Metric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x132f3c430>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x2bc1e8d30>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"text_ratings\",\n", " \"description\": \"Text ratings\",\n", " \"best_value\": null,\n", @@ -613,7 +612,7 @@ "Test input:\n", " different kinds of cuisines \n", "Function:\n", - " Metric(evaluation_fn=.letter_count_metric at 0x142815510>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=.letter_count_metric at 0x2bc1eb880>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"letter_count\",\n", " \"description\": \"Counts the number of times the given letter appears in the text\",\n", " \"best_value\": null,\n", @@ -627,7 +626,7 @@ "Test input:\n", " different kinds of cuisines \n", "Function:\n", - " Metric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x142817760>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x2bc355120>, 
metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"emotional_valence\",\n", " \"description\": \"Emotional valence\",\n", " \"best_value\": null,\n", @@ -643,34 +642,38 @@ "Test input:\n", " iconic midtown skyscrapers \n", "Function:\n", - " Metric(evaluation_fn=, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._construct..evaluation_fn at 0x2bc1e8940>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"brevity\",\n", " \"description\": \"Absolute text length\",\n", " \"best_value\": 1,\n", " \"worst_value\": 9223372036854775807,\n", - " \"extra_metadata\": {},\n", - " \"id\": \"24952ce05ce6dcbd370ccc3b39d410edeab8e1cf420130a83cf9388df6bcfdc3\"\n", + " \"extra_metadata\": {\n", + " \"args\": []\n", + " },\n", + " \"id\": \"5b29b6ba68aeeadc42b7333015f4b158f7514f68c05fef79a702e98cf9983085\"\n", "}))\n", "\n", "Test input:\n", " iconic midtown skyscrapers \n", "Function:\n", - " Metric(evaluation_fn=._fn at 0x142816b90>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._construct..evaluation_fn at 0x2bc3553f0>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"substring_match\",\n", " \"description\": \"True (pass) if contains given substring\",\n", " \"best_value\": true,\n", " \"worst_value\": false,\n", " \"extra_metadata\": {\n", - " \"substring\": \"Empire State Building\",\n", + " \"args\": [\n", + " \"Empire State Building\"\n", + " ],\n", " \"case_sensitive\": false\n", " },\n", - " \"id\": \"53e4c7163f49fdc7727286e638ff07bcb570faaa334456775c616c2f4ad3eb3f\"\n", + " \"id\": \"17bb1efe1fb306bce98240f3534f5d29c68564e4e7c1c0db17198247d19754e3\"\n", "}))\n", "\n", "Test input:\n", " iconic midtown skyscrapers \n", "Function:\n", - " Metric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x132f3c430>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x2bc1e8d30>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"text_ratings\",\n", " \"description\": \"Text ratings\",\n", " \"best_value\": null,\n", @@ -686,7 +689,7 @@ "Test input:\n", " iconic midtown skyscrapers \n", "Function:\n", - " Metric(evaluation_fn=.letter_count_metric at 0x142815510>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=.letter_count_metric at 0x2bc1eb880>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"letter_count\",\n", " \"description\": \"Counts the number of times the given letter appears in the text\",\n", " \"best_value\": null,\n", @@ -700,7 +703,7 @@ "Test input:\n", " iconic midtown skyscrapers \n", "Function:\n", - " Metric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x142817760>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x2bc355120>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"emotional_valence\",\n", " \"description\": \"Emotional valence\",\n", " \"best_value\": null,\n", @@ -724,7 +727,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -770,13 +773,11 @@ " \n", " 0\n", " different kinds of cuisines\n", - " Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \n", - "\n", - "Day 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. 
\n", - "\n", - "Day 3: Evening of Spanish tapas with flamenco performances in NYC.\n", - " 218\n", - " 24952ce05ce6dcbd370ccc3b39d410edeab8e1cf420130a83cf9388df6bcfdc3\n", + " 1. Explore Chelsea Market's international food stalls.\n", + "2. Guided Manhattan Chinatown food tour.\n", + "3. Experience Italian heritage and cuisine in Little Italy.\n", + " 155\n", + " 5b29b6ba68aeeadc42b7333015f4b158f7514f68c05fef79a702e98cf9983085\n", " brevity\n", " Absolute text length\n", " 1\n", @@ -785,13 +786,11 @@ " \n", " 1\n", " different kinds of cuisines\n", - " Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \n", - "\n", - "Day 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \n", - "\n", - "Day 3: Evening of Spanish tapas with flamenco performances in NYC.\n", + " 1. Explore Chelsea Market's international food stalls.\n", + "2. Guided Manhattan Chinatown food tour.\n", + "3. Experience Italian heritage and cuisine in Little Italy.\n", " False\n", - " 0c461362f44884023dda5537ce88263ba20d555562bac8abc05bcde0ce1aacf6\n", + " 12b2b88421a53f87fa1502c48a3bfa8b84aa22af3528178f0ec8d699db041d8d\n", " substring_match\n", " True (pass) if contains given substring\n", " True\n", @@ -800,12 +799,10 @@ " \n", " 2\n", " different kinds of cuisines\n", - " Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \n", - "\n", - "Day 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \n", - "\n", - "Day 3: Evening of Spanish tapas with flamenco performances in NYC.\n", - " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text is concise and provides clear information about the activities planned for each day.\"\\n})\n", + " 1. Explore Chelsea Market's international food stalls.\n", + "2. Guided Manhattan Chinatown food tour.\n", + "3. Experience Italian heritage and cuisine in Little Italy.\n", + " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text is concise and provides clear information about three different food-related experiences in New York City.\"\\n})\n", " 300b32bb8a01befd5e729eaf73506bdba01f910c0db0c8f70136dd2e48e298a7\n", " text_ratings\n", " Text ratings\n", @@ -815,12 +812,10 @@ " \n", " 3\n", " different kinds of cuisines\n", - " Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \n", - "\n", - "Day 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \n", - "\n", - "Day 3: Evening of Spanish tapas with flamenco performances in NYC.\n", - " 4\n", + " 1. Explore Chelsea Market's international food stalls.\n", + "2. Guided Manhattan Chinatown food tour.\n", + "3. Experience Italian heritage and cuisine in Little Italy.\n", + " 0\n", " 855b84d49dadc258f82d949bf3d57a100c788e6e093e4615e8b4e03567f1ffc9\n", " letter_count\n", " Counts the number of times the given letter appears in the text\n", @@ -830,11 +825,9 @@ " \n", " 4\n", " different kinds of cuisines\n", - " Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \n", - "\n", - "Day 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \n", - "\n", - "Day 3: Evening of Spanish tapas with flamenco performances in NYC.\n", + " 1. Explore Chelsea Market's international food stalls.\n", + "2. Guided Manhattan Chinatown food tour.\n", + "3. 
Experience Italian heritage and cuisine in Little Italy.\n", " CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n})\n", " a351b0b7ab1639eb32695430b3e1bb65c96d11b528730c103d5879234a3bd8bb\n", " emotional_valence\n", @@ -845,11 +838,11 @@ " \n", " 5\n", " iconic midtown skyscrapers\n", - " 1. Visit Empire State Building, explore exhibits. \n", - "2. Proceed to Top of the Rock, photograph city views. \n", - "3. Explore New York Public Library Schwarzman Building, see exhibitions.\n", - " 178\n", - " 24952ce05ce6dcbd370ccc3b39d410edeab8e1cf420130a83cf9388df6bcfdc3\n", + " Day 1: Empire State Building, Skyride.\n", + "Day 2: Rockefeller Center, Top of the Rock.\n", + "Day 3: One World Trade Center, 9/11 Memorial & Museum.\n", + " 137\n", + " 5b29b6ba68aeeadc42b7333015f4b158f7514f68c05fef79a702e98cf9983085\n", " brevity\n", " Absolute text length\n", " 1\n", @@ -858,11 +851,11 @@ " \n", " 6\n", " iconic midtown skyscrapers\n", - " 1. Visit Empire State Building, explore exhibits. \n", - "2. Proceed to Top of the Rock, photograph city views. \n", - "3. Explore New York Public Library Schwarzman Building, see exhibitions.\n", + " Day 1: Empire State Building, Skyride.\n", + "Day 2: Rockefeller Center, Top of the Rock.\n", + "Day 3: One World Trade Center, 9/11 Memorial & Museum.\n", " True\n", - " 53e4c7163f49fdc7727286e638ff07bcb570faaa334456775c616c2f4ad3eb3f\n", + " 17bb1efe1fb306bce98240f3534f5d29c68564e4e7c1c0db17198247d19754e3\n", " substring_match\n", " True (pass) if contains given substring\n", " True\n", @@ -871,10 +864,10 @@ " \n", " 7\n", " iconic midtown skyscrapers\n", - " 1. Visit Empire State Building, explore exhibits. \n", - "2. Proceed to Top of the Rock, photograph city views. \n", - "3. Explore New York Public Library Schwarzman Building, see exhibitions.\n", - " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 4,\\n \"conciseness_confidence\": 0.8,\\n \"conciseness_reasoning\": \"The text is concise and provides clear instructions for visiting three different attractions in New York City.\"\\n})\n", + " Day 1: Empire State Building, Skyride.\n", + "Day 2: Rockefeller Center, Top of the Rock.\n", + "Day 3: One World Trade Center, 9/11 Memorial & Museum.\n", + " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text provides a clear and concise itinerary for three days in New York City, mentioning the main attractions to visit each day.\"\\n})\n", " 300b32bb8a01befd5e729eaf73506bdba01f910c0db0c8f70136dd2e48e298a7\n", " text_ratings\n", " Text ratings\n", @@ -884,10 +877,10 @@ " \n", " 8\n", " iconic midtown skyscrapers\n", - " 1. Visit Empire State Building, explore exhibits. \n", - "2. Proceed to Top of the Rock, photograph city views. \n", - "3. Explore New York Public Library Schwarzman Building, see exhibitions.\n", - " 1\n", + " Day 1: Empire State Building, Skyride.\n", + "Day 2: Rockefeller Center, Top of the Rock.\n", + "Day 3: One World Trade Center, 9/11 Memorial & Museum.\n", + " 0\n", " 855b84d49dadc258f82d949bf3d57a100c788e6e093e4615e8b4e03567f1ffc9\n", " letter_count\n", " Counts the number of times the given letter appears in the text\n", @@ -897,10 +890,10 @@ " \n", " 9\n", " iconic midtown skyscrapers\n", - " 1. Visit Empire State Building, explore exhibits. \n", - "2. Proceed to Top of the Rock, photograph city views. \n", - "3. 
Explore New York Public Library Schwarzman Building, see exhibitions.\n", - " CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n})\n", + " Day 1: Empire State Building, Skyride.\n", + "Day 2: Rockefeller Center, Top of the Rock.\n", + "Day 3: One World Trade Center, 9/11 Memorial & Museum.\n", + " CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"neutral\",\\n \"confidence_probability\": 0.9\\n})\n", " a351b0b7ab1639eb32695430b3e1bb65c96d11b528730c103d5879234a3bd8bb\n", " emotional_valence\n", " Emotional valence\n", @@ -924,73 +917,58 @@ "8 iconic midtown skyscrapers \n", "9 iconic midtown skyscrapers \n", "\n", - " aiconfig_output \\\n", - "0 Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \n", - "\n", - "Day 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \n", - "\n", - "Day 3: Evening of Spanish tapas with flamenco performances in NYC.\n", - " \n", - "1 Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \n", - "\n", - "Day 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \n", - "\n", - "Day 3: Evening of Spanish tapas with flamenco performances in NYC.\n", - " \n", - "2 Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \n", - "\n", - "Day 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \n", - "\n", - "Day 3: Evening of Spanish tapas with flamenco performances in NYC.\n", - " \n", - "3 Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \n", - "\n", - "Day 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \n", + " aiconfig_output \\\n", + "0 1. Explore Chelsea Market's international food stalls.\n", + "2. Guided Manhattan Chinatown food tour.\n", + "3. Experience Italian heritage and cuisine in Little Italy. \n", + "1 1. Explore Chelsea Market's international food stalls.\n", + "2. Guided Manhattan Chinatown food tour.\n", + "3. Experience Italian heritage and cuisine in Little Italy. \n", + "2 1. Explore Chelsea Market's international food stalls.\n", + "2. Guided Manhattan Chinatown food tour.\n", + "3. Experience Italian heritage and cuisine in Little Italy. \n", + "3 1. Explore Chelsea Market's international food stalls.\n", + "2. Guided Manhattan Chinatown food tour.\n", + "3. Experience Italian heritage and cuisine in Little Italy. \n", + "4 1. Explore Chelsea Market's international food stalls.\n", + "2. Guided Manhattan Chinatown food tour.\n", + "3. Experience Italian heritage and cuisine in Little Italy. \n", + "5 Day 1: Empire State Building, Skyride.\n", + "Day 2: Rockefeller Center, Top of the Rock.\n", + "Day 3: One World Trade Center, 9/11 Memorial & Museum. \n", + "6 Day 1: Empire State Building, Skyride.\n", + "Day 2: Rockefeller Center, Top of the Rock.\n", + "Day 3: One World Trade Center, 9/11 Memorial & Museum. \n", + "7 Day 1: Empire State Building, Skyride.\n", + "Day 2: Rockefeller Center, Top of the Rock.\n", + "Day 3: One World Trade Center, 9/11 Memorial & Museum. \n", + "8 Day 1: Empire State Building, Skyride.\n", + "Day 2: Rockefeller Center, Top of the Rock.\n", + "Day 3: One World Trade Center, 9/11 Memorial & Museum. \n", + "9 Day 1: Empire State Building, Skyride.\n", + "Day 2: Rockefeller Center, Top of the Rock.\n", + "Day 3: One World Trade Center, 9/11 Memorial & Museum. 
\n", "\n", - "Day 3: Evening of Spanish tapas with flamenco performances in NYC.\n", - " \n", - "4 Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \n", - "\n", - "Day 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \n", - "\n", - "Day 3: Evening of Spanish tapas with flamenco performances in NYC.\n", - " \n", - "5 1. Visit Empire State Building, explore exhibits. \n", - "2. Proceed to Top of the Rock, photograph city views. \n", - "3. Explore New York Public Library Schwarzman Building, see exhibitions. \n", - "6 1. Visit Empire State Building, explore exhibits. \n", - "2. Proceed to Top of the Rock, photograph city views. \n", - "3. Explore New York Public Library Schwarzman Building, see exhibitions. \n", - "7 1. Visit Empire State Building, explore exhibits. \n", - "2. Proceed to Top of the Rock, photograph city views. \n", - "3. Explore New York Public Library Schwarzman Building, see exhibitions. \n", - "8 1. Visit Empire State Building, explore exhibits. \n", - "2. Proceed to Top of the Rock, photograph city views. \n", - "3. Explore New York Public Library Schwarzman Building, see exhibitions. \n", - "9 1. Visit Empire State Building, explore exhibits. \n", - "2. Proceed to Top of the Rock, photograph city views. \n", - "3. Explore New York Public Library Schwarzman Building, see exhibitions. \n", - "\n", - " value \\\n", - "0 218 \n", - "1 False \n", - "2 CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text is concise and provides clear information about the activities planned for each day.\"\\n}) \n", - "3 4 \n", - "4 CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n}) \n", - "5 178 \n", - "6 True \n", - "7 CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 4,\\n \"conciseness_confidence\": 0.8,\\n \"conciseness_reasoning\": \"The text is concise and provides clear instructions for visiting three different attractions in New York City.\"\\n}) \n", - "8 1 \n", - "9 CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n}) \n", + " value \\\n", + "0 155 \n", + "1 False \n", + "2 CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text is concise and provides clear information about three different food-related experiences in New York City.\"\\n}) \n", + "3 0 \n", + "4 CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n}) \n", + "5 137 \n", + "6 True \n", + "7 CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text provides a clear and concise itinerary for three days in New York City, mentioning the main attractions to visit each day.\"\\n}) \n", + "8 0 \n", + "9 CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"neutral\",\\n \"confidence_probability\": 0.9\\n}) \n", "\n", " metric_id \\\n", - "0 24952ce05ce6dcbd370ccc3b39d410edeab8e1cf420130a83cf9388df6bcfdc3 \n", - "1 0c461362f44884023dda5537ce88263ba20d555562bac8abc05bcde0ce1aacf6 \n", + "0 5b29b6ba68aeeadc42b7333015f4b158f7514f68c05fef79a702e98cf9983085 \n", + "1 12b2b88421a53f87fa1502c48a3bfa8b84aa22af3528178f0ec8d699db041d8d \n", "2 300b32bb8a01befd5e729eaf73506bdba01f910c0db0c8f70136dd2e48e298a7 \n", "3 
855b84d49dadc258f82d949bf3d57a100c788e6e093e4615e8b4e03567f1ffc9 \n", "4 a351b0b7ab1639eb32695430b3e1bb65c96d11b528730c103d5879234a3bd8bb \n", - "5 24952ce05ce6dcbd370ccc3b39d410edeab8e1cf420130a83cf9388df6bcfdc3 \n", - "6 53e4c7163f49fdc7727286e638ff07bcb570faaa334456775c616c2f4ad3eb3f \n", + "5 5b29b6ba68aeeadc42b7333015f4b158f7514f68c05fef79a702e98cf9983085 \n", + "6 17bb1efe1fb306bce98240f3534f5d29c68564e4e7c1c0db17198247d19754e3 \n", "7 300b32bb8a01befd5e729eaf73506bdba01f910c0db0c8f70136dd2e48e298a7 \n", "8 855b84d49dadc258f82d949bf3d57a100c788e6e093e4615e8b4e03567f1ffc9 \n", "9 a351b0b7ab1639eb32695430b3e1bb65c96d11b528730c103d5879234a3bd8bb \n", @@ -1032,7 +1010,7 @@ "9 None None " ] }, - "execution_count": 37, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -1051,7 +1029,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -1102,54 +1080,54 @@ " \n", " \n", " different kinds of cuisines\n", - " Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \\n\\nDay 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \\n\\nDay 3: Evening of Spanish tapas with flamenco performances in NYC.\\n\n", - " 218\n", + " 1. Explore Chelsea Market's international food stalls.\\n2. Guided Manhattan Chinatown food tour.\\n3. Experience Italian heritage and cuisine in Little Italy.\n", + " 155\n", " CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n})\n", - " 4\n", + " 0\n", " False\n", - " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text is concise and provides clear information about the activities planned for each day.\"\\n})\n", + " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text is concise and provides clear information about three different food-related experiences in New York City.\"\\n})\n", " \n", " \n", " iconic midtown skyscrapers\n", - " 1. Visit Empire State Building, explore exhibits. \\n2. Proceed to Top of the Rock, photograph city views. \\n3. Explore New York Public Library Schwarzman Building, see exhibitions.\n", - " 178\n", - " CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n})\n", - " 1\n", + " Day 1: Empire State Building, Skyride.\\nDay 2: Rockefeller Center, Top of the Rock.\\nDay 3: One World Trade Center, 9/11 Memorial & Museum.\n", + " 137\n", + " CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"neutral\",\\n \"confidence_probability\": 0.9\\n})\n", + " 0\n", " True\n", - " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 4,\\n \"conciseness_confidence\": 0.8,\\n \"conciseness_reasoning\": \"The text is concise and provides clear instructions for visiting three different attractions in New York City.\"\\n})\n", + " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text provides a clear and concise itinerary for three days in New York City, mentioning the main attractions to visit each day.\"\\n})\n", " \n", " \n", "\n", "" ], "text/plain": [ - "metric_name brevity \\\n", - "input aiconfig_output \n", - "different kinds of cuisines Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. 
\\n\\nDay 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \\n\\nDay 3: Evening of Spanish tapas with flamenco performances in NYC.\\n 218 \n", - "iconic midtown skyscrapers 1. Visit Empire State Building, explore exhibits. \\n2. Proceed to Top of the Rock, photograph city views. \\n3. Explore New York Public Library Schwarzman Building, see exhibitions. 178 \n", + "metric_name brevity \\\n", + "input aiconfig_output \n", + "different kinds of cuisines 1. Explore Chelsea Market's international food stalls.\\n2. Guided Manhattan Chinatown food tour.\\n3. Experience Italian heritage and cuisine in Little Italy. 155 \n", + "iconic midtown skyscrapers Day 1: Empire State Building, Skyride.\\nDay 2: Rockefeller Center, Top of the Rock.\\nDay 3: One World Trade Center, 9/11 Memorial & Museum. 137 \n", "\n", - "metric_name emotional_valence \\\n", - "input aiconfig_output \n", - "different kinds of cuisines Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \\n\\nDay 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \\n\\nDay 3: Evening of Spanish tapas with flamenco performances in NYC.\\n CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n}) \n", - "iconic midtown skyscrapers 1. Visit Empire State Building, explore exhibits. \\n2. Proceed to Top of the Rock, photograph city views. \\n3. Explore New York Public Library Schwarzman Building, see exhibitions. CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n}) \n", + "metric_name emotional_valence \\\n", + "input aiconfig_output \n", + "different kinds of cuisines 1. Explore Chelsea Market's international food stalls.\\n2. Guided Manhattan Chinatown food tour.\\n3. Experience Italian heritage and cuisine in Little Italy. CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n}) \n", + "iconic midtown skyscrapers Day 1: Empire State Building, Skyride.\\nDay 2: Rockefeller Center, Top of the Rock.\\nDay 3: One World Trade Center, 9/11 Memorial & Museum. CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"neutral\",\\n \"confidence_probability\": 0.9\\n}) \n", "\n", - "metric_name letter_count \\\n", - "input aiconfig_output \n", - "different kinds of cuisines Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \\n\\nDay 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \\n\\nDay 3: Evening of Spanish tapas with flamenco performances in NYC.\\n 4 \n", - "iconic midtown skyscrapers 1. Visit Empire State Building, explore exhibits. \\n2. Proceed to Top of the Rock, photograph city views. \\n3. Explore New York Public Library Schwarzman Building, see exhibitions. 1 \n", + "metric_name letter_count \\\n", + "input aiconfig_output \n", + "different kinds of cuisines 1. Explore Chelsea Market's international food stalls.\\n2. Guided Manhattan Chinatown food tour.\\n3. Experience Italian heritage and cuisine in Little Italy. 0 \n", + "iconic midtown skyscrapers Day 1: Empire State Building, Skyride.\\nDay 2: Rockefeller Center, Top of the Rock.\\nDay 3: One World Trade Center, 9/11 Memorial & Museum. 0 \n", "\n", - "metric_name substring_match \\\n", - "input aiconfig_output \n", - "different kinds of cuisines Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \\n\\nDay 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. 
\\n\\nDay 3: Evening of Spanish tapas with flamenco performances in NYC.\\n False \n", - "iconic midtown skyscrapers 1. Visit Empire State Building, explore exhibits. \\n2. Proceed to Top of the Rock, photograph city views. \\n3. Explore New York Public Library Schwarzman Building, see exhibitions. True \n", + "metric_name substring_match \\\n", + "input aiconfig_output \n", + "different kinds of cuisines 1. Explore Chelsea Market's international food stalls.\\n2. Guided Manhattan Chinatown food tour.\\n3. Experience Italian heritage and cuisine in Little Italy. False \n", + "iconic midtown skyscrapers Day 1: Empire State Building, Skyride.\\nDay 2: Rockefeller Center, Top of the Rock.\\nDay 3: One World Trade Center, 9/11 Memorial & Museum. True \n", "\n", - "metric_name text_ratings \n", - "input aiconfig_output \n", - "different kinds of cuisines Day 1: Food tour in Manhattan's Chinatown, tasting regional Chinese dishes. \\n\\nDay 2: Brooklyn pizza-making class, uncovering NY-style pizza secrets. \\n\\nDay 3: Evening of Spanish tapas with flamenco performances in NYC.\\n CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text is concise and provides clear information about the activities planned for each day.\"\\n}) \n", - "iconic midtown skyscrapers 1. Visit Empire State Building, explore exhibits. \\n2. Proceed to Top of the Rock, photograph city views. \\n3. Explore New York Public Library Schwarzman Building, see exhibitions. CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 4,\\n \"conciseness_confidence\": 0.8,\\n \"conciseness_reasoning\": \"The text is concise and provides clear instructions for visiting three different attractions in New York City.\"\\n}) " + "metric_name text_ratings \n", + "input aiconfig_output \n", + "different kinds of cuisines 1. Explore Chelsea Market's international food stalls.\\n2. Guided Manhattan Chinatown food tour.\\n3. Experience Italian heritage and cuisine in Little Italy. CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text is concise and provides clear information about three different food-related experiences in New York City.\"\\n}) \n", + "iconic midtown skyscrapers Day 1: Empire State Building, Skyride.\\nDay 2: Rockefeller Center, Top of the Rock.\\nDay 3: One World Trade Center, 9/11 Memorial & Museum. CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text provides a clear and concise itinerary for three days in New York City, mentioning the main attractions to visit each day.\"\\n}) " ] }, - "execution_count": 38, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -1169,7 +1147,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -1188,7 +1166,7 @@ " run_test_suite_outputs_only,\n", ")\n", "\n", - "from aiconfig.eval.api import metrics\n", + "from aiconfig.eval.api import test_suite_metrics as metrics\n", "\n", "\n", "# This is similar to \"test_inputs_with_substrings\" above, but we have the AIConfig *outputs*\n", @@ -1221,7 +1199,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -1233,34 +1211,38 @@ "Test output:\n", " Begin at Chelsea Market for diverse food options. Continue to Queens for immersive food tours. 
Conclude at Smorgasburg for unique outdoor food market experience \n", "Function:\n", - " Metric(evaluation_fn=, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._construct..evaluation_fn at 0x2bc1e8940>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"brevity\",\n", " \"description\": \"Absolute text length\",\n", " \"best_value\": 1,\n", " \"worst_value\": 9223372036854775807,\n", - " \"extra_metadata\": {},\n", - " \"id\": \"24952ce05ce6dcbd370ccc3b39d410edeab8e1cf420130a83cf9388df6bcfdc3\"\n", + " \"extra_metadata\": {\n", + " \"args\": []\n", + " },\n", + " \"id\": \"5b29b6ba68aeeadc42b7333015f4b158f7514f68c05fef79a702e98cf9983085\"\n", "}))\n", "\n", "Test output:\n", " Begin at Chelsea Market for diverse food options. Continue to Queens for immersive food tours. Conclude at Smorgasburg for unique outdoor food market experience \n", "Function:\n", - " Metric(evaluation_fn=._fn at 0x1426c83a0>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._construct..evaluation_fn at 0x2bc388790>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"substring_match\",\n", " \"description\": \"True (pass) if contains given substring\",\n", " \"best_value\": true,\n", " \"worst_value\": false,\n", " \"extra_metadata\": {\n", - " \"substring\": \"Magnolia Bakery\",\n", + " \"args\": [\n", + " \"Magnolia Bakery\"\n", + " ],\n", " \"case_sensitive\": false\n", " },\n", - " \"id\": \"0c461362f44884023dda5537ce88263ba20d555562bac8abc05bcde0ce1aacf6\"\n", + " \"id\": \"12b2b88421a53f87fa1502c48a3bfa8b84aa22af3528178f0ec8d699db041d8d\"\n", "}))\n", "\n", "Test output:\n", " Begin at Chelsea Market for diverse food options. Continue to Queens for immersive food tours. Conclude at Smorgasburg for unique outdoor food market experience \n", "Function:\n", - " Metric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x132f3c430>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x2bc1e8d30>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"text_ratings\",\n", " \"description\": \"Text ratings\",\n", " \"best_value\": null,\n", @@ -1276,7 +1258,7 @@ "Test output:\n", " Begin at Chelsea Market for diverse food options. Continue to Queens for immersive food tours. Conclude at Smorgasburg for unique outdoor food market experience \n", "Function:\n", - " Metric(evaluation_fn=.letter_count_metric at 0x142815510>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=.letter_count_metric at 0x2bc1eb880>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"letter_count\",\n", " \"description\": \"Counts the number of times the given letter appears in the text\",\n", " \"best_value\": null,\n", @@ -1290,7 +1272,7 @@ "Test output:\n", " Begin at Chelsea Market for diverse food options. Continue to Queens for immersive food tours. Conclude at Smorgasburg for unique outdoor food market experience \n", "Function:\n", - " Metric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x142817760>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x2bc355120>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"emotional_valence\",\n", " \"description\": \"Emotional valence\",\n", " \"best_value\": null,\n", @@ -1306,34 +1288,38 @@ "Test output:\n", " 1. Empire State Building: Observation deck visit, explore exhibits and historical displays. 
2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. Chrysler Building: Admire exterior and iconic spire, photo opportunities. \n", "Function:\n", - " Metric(evaluation_fn=, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._construct..evaluation_fn at 0x2bc1e8940>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"brevity\",\n", " \"description\": \"Absolute text length\",\n", " \"best_value\": 1,\n", " \"worst_value\": 9223372036854775807,\n", - " \"extra_metadata\": {},\n", - " \"id\": \"24952ce05ce6dcbd370ccc3b39d410edeab8e1cf420130a83cf9388df6bcfdc3\"\n", + " \"extra_metadata\": {\n", + " \"args\": []\n", + " },\n", + " \"id\": \"5b29b6ba68aeeadc42b7333015f4b158f7514f68c05fef79a702e98cf9983085\"\n", "}))\n", "\n", "Test output:\n", " 1. Empire State Building: Observation deck visit, explore exhibits and historical displays. 2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. Chrysler Building: Admire exterior and iconic spire, photo opportunities. \n", "Function:\n", - " Metric(evaluation_fn=._fn at 0x1426c8940>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._construct..evaluation_fn at 0x2bc388550>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"substring_match\",\n", " \"description\": \"True (pass) if contains given substring\",\n", " \"best_value\": true,\n", " \"worst_value\": false,\n", " \"extra_metadata\": {\n", - " \"substring\": \"Empire State Building\",\n", + " \"args\": [\n", + " \"Empire State Building\"\n", + " ],\n", " \"case_sensitive\": false\n", " },\n", - " \"id\": \"53e4c7163f49fdc7727286e638ff07bcb570faaa334456775c616c2f4ad3eb3f\"\n", + " \"id\": \"17bb1efe1fb306bce98240f3534f5d29c68564e4e7c1c0db17198247d19754e3\"\n", "}))\n", "\n", "Test output:\n", " 1. Empire State Building: Observation deck visit, explore exhibits and historical displays. 2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. Chrysler Building: Admire exterior and iconic spire, photo opportunities. \n", "Function:\n", - " Metric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x132f3c430>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x2bc1e8d30>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"text_ratings\",\n", " \"description\": \"Text ratings\",\n", " \"best_value\": null,\n", @@ -1349,7 +1335,7 @@ "Test output:\n", " 1. Empire State Building: Observation deck visit, explore exhibits and historical displays. 2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. Chrysler Building: Admire exterior and iconic spire, photo opportunities. \n", "Function:\n", - " Metric(evaluation_fn=.letter_count_metric at 0x142815510>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=.letter_count_metric at 0x2bc1eb880>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"letter_count\",\n", " \"description\": \"Counts the number of times the given letter appears in the text\",\n", " \"best_value\": null,\n", @@ -1363,7 +1349,7 @@ "Test output:\n", " 1. Empire State Building: Observation deck visit, explore exhibits and historical displays. 2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. 
Chrysler Building: Admire exterior and iconic spire, photo opportunities. \n", "Function:\n", - " Metric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x142817760>, metric_metadata=EvaluationMetricMetadata({\n", + " TestSuiteMetric(evaluation_fn=._make_evaluation_fn.._evaluation_fn at 0x2bc355120>, metric_metadata=EvaluationMetricMetadata({\n", " \"name\": \"emotional_valence\",\n", " \"description\": \"Emotional valence\",\n", " \"best_value\": null,\n", @@ -1387,7 +1373,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -1435,7 +1421,7 @@ " Missing\n", " Begin at Chelsea Market for diverse food options. Continue to Queens for immersive food tours. Conclude at Smorgasburg for unique outdoor food market experience\n", " 160\n", - " 24952ce05ce6dcbd370ccc3b39d410edeab8e1cf420130a83cf9388df6bcfdc3\n", + " 5b29b6ba68aeeadc42b7333015f4b158f7514f68c05fef79a702e98cf9983085\n", " brevity\n", " Absolute text length\n", " 1\n", @@ -1446,7 +1432,7 @@ " Missing\n", " Begin at Chelsea Market for diverse food options. Continue to Queens for immersive food tours. Conclude at Smorgasburg for unique outdoor food market experience\n", " False\n", - " 0c461362f44884023dda5537ce88263ba20d555562bac8abc05bcde0ce1aacf6\n", + " 12b2b88421a53f87fa1502c48a3bfa8b84aa22af3528178f0ec8d699db041d8d\n", " substring_match\n", " True (pass) if contains given substring\n", " True\n", @@ -1456,7 +1442,7 @@ " 2\n", " Missing\n", " Begin at Chelsea Market for diverse food options. Continue to Queens for immersive food tours. Conclude at Smorgasburg for unique outdoor food market experience\n", - " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 4,\\n \"conciseness_confidence\": 0.8,\\n \"conciseness_reasoning\": \"The text provides a clear and concise description of the itinerary, mentioning the starting point at Chelsea Market, the visit to Queens for food tours, and the conclusion at Smorgasburg for an outdoor food market experience.\"\\n})\n", + " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 4,\\n \"conciseness_confidence\": 0.8,\\n \"conciseness_reasoning\": \"The text provides a clear and concise description of the itinerary, mentioning the starting point, the main activity in Queens, and the final destination.\"\\n})\n", " 300b32bb8a01befd5e729eaf73506bdba01f910c0db0c8f70136dd2e48e298a7\n", " text_ratings\n", " Text ratings\n", @@ -1490,7 +1476,7 @@ " Missing\n", " 1. Empire State Building: Observation deck visit, explore exhibits and historical displays. 2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. Chrysler Building: Admire exterior and iconic spire, photo opportunities.\n", " 267\n", - " 24952ce05ce6dcbd370ccc3b39d410edeab8e1cf420130a83cf9388df6bcfdc3\n", + " 5b29b6ba68aeeadc42b7333015f4b158f7514f68c05fef79a702e98cf9983085\n", " brevity\n", " Absolute text length\n", " 1\n", @@ -1501,7 +1487,7 @@ " Missing\n", " 1. Empire State Building: Observation deck visit, explore exhibits and historical displays. 2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. Chrysler Building: Admire exterior and iconic spire, photo opportunities.\n", " True\n", - " 53e4c7163f49fdc7727286e638ff07bcb570faaa334456775c616c2f4ad3eb3f\n", + " 17bb1efe1fb306bce98240f3534f5d29c68564e4e7c1c0db17198247d19754e3\n", " substring_match\n", " True (pass) if contains given substring\n", " True\n", @@ -1511,7 +1497,7 @@ " 7\n", " Missing\n", " 1. 
Empire State Building: Observation deck visit, explore exhibits and historical displays. 2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. Chrysler Building: Admire exterior and iconic spire, photo opportunities.\n", - " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text is concise and provides a clear description of each attraction.\"\\n})\n", + " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text provides a concise description of the attractions and activities at each location.\"\\n})\n", " 300b32bb8a01befd5e729eaf73506bdba01f910c0db0c8f70136dd2e48e298a7\n", " text_ratings\n", " Text ratings\n", @@ -1569,26 +1555,26 @@ "8 1. Empire State Building: Observation deck visit, explore exhibits and historical displays. 2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. Chrysler Building: Admire exterior and iconic spire, photo opportunities. \n", "9 1. Empire State Building: Observation deck visit, explore exhibits and historical displays. 2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. Chrysler Building: Admire exterior and iconic spire, photo opportunities. \n", "\n", - " value \\\n", - "0 160 \n", - "1 False \n", - "2 CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 4,\\n \"conciseness_confidence\": 0.8,\\n \"conciseness_reasoning\": \"The text provides a clear and concise description of the itinerary, mentioning the starting point at Chelsea Market, the visit to Queens for food tours, and the conclusion at Smorgasburg for an outdoor food market experience.\"\\n}) \n", - "3 0 \n", - "4 CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n}) \n", - "5 267 \n", - "6 True \n", - "7 CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text is concise and provides a clear description of each attraction.\"\\n}) \n", - "8 0 \n", - "9 CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n}) \n", + " value \\\n", + "0 160 \n", + "1 False \n", + "2 CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 4,\\n \"conciseness_confidence\": 0.8,\\n \"conciseness_reasoning\": \"The text provides a clear and concise description of the itinerary, mentioning the starting point, the main activity in Queens, and the final destination.\"\\n}) \n", + "3 0 \n", + "4 CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n}) \n", + "5 267 \n", + "6 True \n", + "7 CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text provides a concise description of the attractions and activities at each location.\"\\n}) \n", + "8 0 \n", + "9 CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n}) \n", "\n", " metric_id \\\n", - "0 24952ce05ce6dcbd370ccc3b39d410edeab8e1cf420130a83cf9388df6bcfdc3 \n", - "1 0c461362f44884023dda5537ce88263ba20d555562bac8abc05bcde0ce1aacf6 \n", + "0 5b29b6ba68aeeadc42b7333015f4b158f7514f68c05fef79a702e98cf9983085 \n", + "1 
12b2b88421a53f87fa1502c48a3bfa8b84aa22af3528178f0ec8d699db041d8d \n", "2 300b32bb8a01befd5e729eaf73506bdba01f910c0db0c8f70136dd2e48e298a7 \n", "3 855b84d49dadc258f82d949bf3d57a100c788e6e093e4615e8b4e03567f1ffc9 \n", "4 a351b0b7ab1639eb32695430b3e1bb65c96d11b528730c103d5879234a3bd8bb \n", - "5 24952ce05ce6dcbd370ccc3b39d410edeab8e1cf420130a83cf9388df6bcfdc3 \n", - "6 53e4c7163f49fdc7727286e638ff07bcb570faaa334456775c616c2f4ad3eb3f \n", + "5 5b29b6ba68aeeadc42b7333015f4b158f7514f68c05fef79a702e98cf9983085 \n", + "6 17bb1efe1fb306bce98240f3534f5d29c68564e4e7c1c0db17198247d19754e3 \n", "7 300b32bb8a01befd5e729eaf73506bdba01f910c0db0c8f70136dd2e48e298a7 \n", "8 855b84d49dadc258f82d949bf3d57a100c788e6e093e4615e8b4e03567f1ffc9 \n", "9 a351b0b7ab1639eb32695430b3e1bb65c96d11b528730c103d5879234a3bd8bb \n", @@ -1630,7 +1616,7 @@ "9 None None " ] }, - "execution_count": 42, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1646,7 +1632,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1699,7 +1685,7 @@ " CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n})\n", " 0\n", " True\n", - " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text is concise and provides a clear description of each attraction.\"\\n})\n", + " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text provides a concise description of the attractions and activities at each location.\"\\n})\n", " \n", " \n", " Begin at Chelsea Market for diverse food options. Continue to Queens for immersive food tours. Conclude at Smorgasburg for unique outdoor food market experience\n", @@ -1707,7 +1693,7 @@ " CustomMetricPydanticObject(data={\\n \"emotional_valence\": \"happy\",\\n \"confidence_probability\": 0.9\\n})\n", " 0\n", " False\n", - " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 4,\\n \"conciseness_confidence\": 0.8,\\n \"conciseness_reasoning\": \"The text provides a clear and concise description of the itinerary, mentioning the starting point at Chelsea Market, the visit to Queens for food tours, and the conclusion at Smorgasburg for an outdoor food market experience.\"\\n})\n", + " CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 4,\\n \"conciseness_confidence\": 0.8,\\n \"conciseness_reasoning\": \"The text provides a clear and concise description of the itinerary, mentioning the starting point, the main activity in Queens, and the final destination.\"\\n})\n", " \n", " \n", "\n", @@ -1734,13 +1720,13 @@ "1. Empire State Building: Observation deck visit, explore exhibits and historical displays. 2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. Chrysler Building: Admire exterior and iconic spire, photo opportunities. True \n", "Begin at Chelsea Market for diverse food options. Continue to Queens for immersive food tours. Conclude at Smorgasburg for unique outdoor food market experience False \n", "\n", - "metric_name text_ratings \n", - "aiconfig_output \n", - "1. Empire State Building: Observation deck visit, explore exhibits and historical displays. 2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. Chrysler Building: Admire exterior and iconic spire, photo opportunities. 
CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text is concise and provides a clear description of each attraction.\"\\n}) \n", - "Begin at Chelsea Market for diverse food options. Continue to Queens for immersive food tours. Conclude at Smorgasburg for unique outdoor food market experience CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 4,\\n \"conciseness_confidence\": 0.8,\\n \"conciseness_reasoning\": \"The text provides a clear and concise description of the itinerary, mentioning the starting point at Chelsea Market, the visit to Queens for food tours, and the conclusion at Smorgasburg for an outdoor food market experience.\"\\n}) " + "metric_name text_ratings \n", + "aiconfig_output \n", + "1. Empire State Building: Observation deck visit, explore exhibits and historical displays. 2. Rockefeller Center: Visit \"Top of the Rock\", ice-skating, NBC Studio tour, shopping and dining. 3. Chrysler Building: Admire exterior and iconic spire, photo opportunities. CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 5,\\n \"conciseness_confidence\": 0.9,\\n \"conciseness_reasoning\": \"The text provides a concise description of the attractions and activities at each location.\"\\n}) \n", + "Begin at Chelsea Market for diverse food options. Continue to Queens for immersive food tours. Conclude at Smorgasburg for unique outdoor food market experience CustomMetricPydanticObject(data={\\n \"conciseness_rating\": 4,\\n \"conciseness_confidence\": 0.8,\\n \"conciseness_reasoning\": \"The text provides a clear and concise description of the itinerary, mentioning the starting point, the main activity in Queens, and the final destination.\"\\n}) " ] }, - "execution_count": 43, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } diff --git a/python/src/aiconfig/eval/examples/travel/travel_parametrized.aiconfig.json b/python/src/aiconfig/eval/test_suite_examples/travel/travel_parametrized.aiconfig.json similarity index 100% rename from python/src/aiconfig/eval/examples/travel/travel_parametrized.aiconfig.json rename to python/src/aiconfig/eval/test_suite_examples/travel/travel_parametrized.aiconfig.json diff --git a/python/src/aiconfig/eval/examples/travel/travel_promptfoo_config.yaml b/python/src/aiconfig/eval/test_suite_examples/travel/travel_promptfoo_config.yaml similarity index 100% rename from python/src/aiconfig/eval/examples/travel/travel_promptfoo_config.yaml rename to python/src/aiconfig/eval/test_suite_examples/travel/travel_promptfoo_config.yaml diff --git a/python/src/aiconfig/eval/lib.py b/python/src/aiconfig/eval/test_suite_lib.py similarity index 93% rename from python/src/aiconfig/eval/lib.py rename to python/src/aiconfig/eval/test_suite_lib.py index fd66090f6..1c7eaf3cb 100644 --- a/python/src/aiconfig/eval/lib.py +++ b/python/src/aiconfig/eval/test_suite_lib.py @@ -6,10 +6,11 @@ from typing import Any, Generic, NewType, Sequence, Tuple, TypeVar import aiconfig.eval.common as common +import aiconfig.eval.test_suite_common as test_suite_common import lastmile_utils.lib.core.api as core_utils import pandas as pd from aiconfig.Config import AIConfigRuntime -from aiconfig.eval.metrics import Metric +from aiconfig.eval.test_suite_metrics import TestSuiteMetric from frozendict import frozendict from result import Err, Ok, Result @@ -18,13 +19,13 @@ # TODO: figure out a way to do heterogenous list without Any -# Each test is a (input_datum, Metric) pair +# Each 
test is a (input_datum, TestSuiteMetric) pair UserTestSuiteWithInputs = Sequence[ - Tuple[str | dict[str, str], Metric[str, Any]] + Tuple[str | dict[str, str], TestSuiteMetric[str, Any]] ] -# Each test is a (output_datum, Metric) pair -UserTestSuiteOutputsOnly = Sequence[Tuple[str, Metric[str, Any]]] +# Each test is a (output_datum, TestSuiteMetric) pair +UserTestSuiteOutputsOnly = Sequence[Tuple[str, TestSuiteMetric[str, Any]]] # NOTE: it's probably better to avoid NewType in the future, because it doesn't @@ -97,7 +98,7 @@ class SampleEvaluationResult( ): input_datum: common.T_InputDatum | None output_datum: common.T_OutputDatum - metric_value: common.SampleMetricValue[ + metric_value: test_suite_common.SampleMetricValue[ common.T_OutputDatum, common.T_MetricValue ] @@ -111,7 +112,7 @@ class SampleEvaluationParams( # input_sample is here for documentation/debugging. input_sample: common.T_InputDatum | None output_sample: common.T_OutputDatum - metric: Metric[common.T_OutputDatum, common.T_MetricValue] + metric: TestSuiteMetric[common.T_OutputDatum, common.T_MetricValue] def __str__(self) -> str: return f"\nSampleEvaluationParams:\n\t{self.output_sample=}\n\t{self.metric=}" @@ -124,7 +125,7 @@ def __str__(self) -> str: DatasetEvaluationParams = Sequence[ SampleEvaluationParams[common.T_InputDatum, common.T_OutputDatum, Any] ] -MetricList = list[Metric[common.T_OutputDatum, Any]] +MetricList = list[TestSuiteMetric[common.T_OutputDatum, Any]] async def _evaluate_for_sample( @@ -157,7 +158,7 @@ def _ok_with_log( result = SampleEvaluationResult( input_datum=eval_params.input_sample, output_datum=sample, - metric_value=common.SampleMetricValue( + metric_value=test_suite_common.SampleMetricValue( # value=_ok_with_log(res_), metric_metadata=metric.metric_metadata, @@ -258,15 +259,15 @@ def _user_test_input_to_internal_type( return TextBasedInputDatum(frozendict(input_datum)) test_suite_internal_types = [ - (_user_test_input_to_internal_type(input_datum), metric) - for input_datum, metric in test_suite + (_user_test_input_to_internal_type(input_datum), TestSuiteMetric) + for input_datum, TestSuiteMetric in test_suite ] out: DatasetEvaluationParams[TextBasedInputDatum, TextOutput] = [] # Group by input so that we only run each input through the AIConfig once. # This is sort of an optimization because the user can give the same input - # multiple times (with different metrics). + # multiple times (with different TestSuiteMetrics). input_to_metrics_mapping: dict[ TextBasedInputDatum, MetricList[TextOutput] ] = {} @@ -303,8 +304,8 @@ def _zip_inputs_outputs(outputs: list[TextOutput]): # of awaitables in aws. 
outputs_by_input = dict(zip(all_inputs, outputs)) - for input_datum, metrics in input_to_metrics_mapping.items(): - for metric in metrics: + for input_datum, TestSuiteMetrics in input_to_metrics_mapping.items(): + for metric in TestSuiteMetrics: out.append( SampleEvaluationParams( input_sample=input_datum, diff --git a/python/src/aiconfig/eval/metrics.py b/python/src/aiconfig/eval/test_suite_metrics.py similarity index 81% rename from python/src/aiconfig/eval/metrics.py rename to python/src/aiconfig/eval/test_suite_metrics.py index cd63e84a5..41407a02f 100644 --- a/python/src/aiconfig/eval/metrics.py +++ b/python/src/aiconfig/eval/test_suite_metrics.py @@ -17,7 +17,7 @@ import lastmile_utils.lib.core.api as core_utils import nltk import pandas as pd -from aiconfig.eval import common +from aiconfig.eval import common, test_suite_common from aiconfig.eval.openai import ( OpenAIChatCompletionCreate, default_openai_chat_completion_create, @@ -30,10 +30,10 @@ @dataclass(frozen=True) -class Metric(Generic[common.T_Evaluable, common.T_MetricValue]): +class TestSuiteMetric(Generic[common.T_Evaluable, common.T_MetricValue]): """See metrics.py for examples.""" - evaluation_fn: common.EvaluationFunction[ + evaluation_fn: test_suite_common.EvaluationFunction[ common.T_Evaluable, common.T_MetricValue ] metric_metadata: common.EvaluationMetricMetadata[ @@ -50,31 +50,31 @@ async def __call__( return await self.evaluation_fn(datum) -T_ParamSpec = ParamSpec("T_ParamSpec") +PS = ParamSpec("PS") @core_utils.parametrized def metric( parametrized_evaluation_fn: Callable[ - Concatenate[common.T_Evaluable, T_ParamSpec], common.T_MetricValue + Concatenate[common.T_Evaluable, PS], common.T_MetricValue ], name: str | None = None, description: str | None = None, best_value: common.T_MetricValue | None = None, worst_value: common.T_MetricValue | None = None, -) -> Callable[T_ParamSpec, Metric[common.T_Evaluable, common.T_MetricValue]]: +) -> Callable[PS, TestSuiteMetric[common.T_Evaluable, common.T_MetricValue]]: name_ = name or parametrized_evaluation_fn.__name__ description_ = description or name_ def _construct( - *args: T_ParamSpec.args, **kwargs: T_ParamSpec.kwargs - ) -> Metric[common.T_Evaluable, common.T_MetricValue]: + *args: PS.args, **kwargs: PS.kwargs + ) -> TestSuiteMetric[common.T_Evaluable, common.T_MetricValue]: async def evaluation_fn( datum: common.T_Evaluable, ) -> common.T_MetricValue: return parametrized_evaluation_fn(datum, *args, **kwargs) - return Metric( + return TestSuiteMetric( evaluation_fn=evaluation_fn, metric_metadata=common.EvaluationMetricMetadata( name=name_, @@ -91,26 +91,25 @@ async def evaluation_fn( @core_utils.parametrized def metric_async( parametrized_evaluation_fn: Callable[ - Concatenate[common.T_Evaluable, T_ParamSpec], - Awaitable[common.T_MetricValue], + Concatenate[common.T_Evaluable, PS], Awaitable[common.T_MetricValue] ], name: str | None = None, description: str | None = None, best_value: common.T_MetricValue | None = None, worst_value: common.T_MetricValue | None = None, -) -> Callable[T_ParamSpec, Metric[common.T_Evaluable, common.T_MetricValue]]: +) -> Callable[PS, TestSuiteMetric[common.T_Evaluable, common.T_MetricValue]]: name_ = name or parametrized_evaluation_fn.__name__ description_ = description or name_ def _construct( - *args: T_ParamSpec.args, **kwargs: T_ParamSpec.kwargs - ) -> Metric[common.T_Evaluable, common.T_MetricValue]: + *args: PS.args, **kwargs: PS.kwargs + ) -> TestSuiteMetric[common.T_Evaluable, common.T_MetricValue]: async def evaluation_fn( 
datum: common.T_Evaluable, ) -> common.T_MetricValue: return await parametrized_evaluation_fn(datum, *args, **kwargs) - return Metric( + return TestSuiteMetric( evaluation_fn=evaluation_fn, metric_metadata=common.EvaluationMetricMetadata( name=name_, @@ -178,7 +177,7 @@ def _get_sentiment_scores( def make_get_sentiment_scores( get_polarity_scores: GetPolarityScores, -) -> common.EvaluationFunction[str, TextSentimentScores]: +) -> test_suite_common.EvaluationFunction[str, TextSentimentScores]: async def _f(datum: str) -> TextSentimentScores: return _get_sentiment_scores(datum, get_polarity_scores) @@ -187,7 +186,7 @@ async def _f(datum: str) -> TextSentimentScores: def make_get_sentiment_class( get_polarity_scores: GetPolarityScores, -) -> common.EvaluationFunction[str, str]: +) -> test_suite_common.EvaluationFunction[str, str]: async def _f(datum: str) -> str: scores = _get_sentiment_scores(datum, get_polarity_scores) return scores.highest @@ -197,7 +196,7 @@ async def _f(datum: str) -> str: def make_get_overall_positive_sentiment( get_polarity_scores: GetPolarityScores, -) -> common.EvaluationFunction[str, TextOverallPositiveSentiment]: +) -> test_suite_common.EvaluationFunction[str, TextOverallPositiveSentiment]: async def _f(datum: str) -> TextOverallPositiveSentiment: scores = _get_sentiment_scores(datum, get_polarity_scores) return TextOverallPositiveSentiment(pos=scores.pos, neg=scores.neg) @@ -209,17 +208,17 @@ def make_sentiment_scores_metric( get_polarity_scores: GetPolarityScores, make_evaluation_fn: Callable[ [GetPolarityScores], - common.EvaluationFunction[str, common.T_MetricValue], + test_suite_common.EvaluationFunction[str, common.T_MetricValue], ], name: str, description: str, best_value: common.T_MetricValue | None = None, worst_value: common.T_MetricValue | None = None, -) -> Metric[str, common.T_MetricValue]: - evaluation_fn: common.EvaluationFunction[ +) -> TestSuiteMetric[str, common.T_MetricValue]: + evaluation_fn: test_suite_common.EvaluationFunction[ str, common.T_MetricValue ] = make_evaluation_fn(get_polarity_scores) - out: Metric[str, common.T_MetricValue] = Metric( + out: TestSuiteMetric[str, common.T_MetricValue] = TestSuiteMetric( evaluation_fn=evaluation_fn, metric_metadata=common.EvaluationMetricMetadata( # @@ -234,22 +233,32 @@ def make_sentiment_scores_metric( def make_structured_llm_metric( - chat_completion_create: common.CompletionTextToSerializedJSON, + chat_completion_create: test_suite_common.CompletionTextToSerializedJSON, eval_llm_name: str, - pydantic_basemodel_type: Type[common.T_BaseModel], + pydantic_basemodel_type: Type[test_suite_common.T_BaseModel], metric_name: str, metric_description: str, field_descriptions: dict[str, str] = {}, -) -> Metric[str, common.CustomMetricPydanticObject[common.T_BaseModel]]: +) -> TestSuiteMetric[ + str, + test_suite_common.CustomMetricPydanticObject[ + test_suite_common.T_BaseModel + ], +]: def _make_evaluation_fn( - basemodel_type: Type[common.T_BaseModel], - ) -> common.EvaluationFunction[ - str, common.CustomMetricPydanticObject[common.T_BaseModel] + basemodel_type: Type[test_suite_common.T_BaseModel], + ) -> test_suite_common.EvaluationFunction[ + str, + test_suite_common.CustomMetricPydanticObject[ + test_suite_common.T_BaseModel + ], ]: async def _evaluation_fn( datum: str, - ) -> common.CustomMetricPydanticObject[common.T_BaseModel]: - resp = common.get_llm_structured_response( + ) -> test_suite_common.CustomMetricPydanticObject[ + test_suite_common.T_BaseModel + ]: + resp = 
test_suite_common.get_llm_structured_response( input_text=datum, chat_completion_create=chat_completion_create, basemodel_type=basemodel_type, @@ -260,11 +269,13 @@ async def _evaluation_fn( case Err(e): raise ValueError(f"Error getting structured response: {e}") case Ok(data): - return common.CustomMetricPydanticObject(data=data) + return test_suite_common.CustomMetricPydanticObject( + data=data + ) return _evaluation_fn - return Metric( + return TestSuiteMetric( evaluation_fn=_make_evaluation_fn(pydantic_basemodel_type), metric_metadata=common.EvaluationMetricMetadata( name=metric_name, @@ -282,13 +293,19 @@ async def _evaluation_fn( def _make_openai_structured_llm_metric_helper( eval_llm_name: str, - pydantic_basemodel_type: Type[common.T_BaseModel], + pydantic_basemodel_type: Type[test_suite_common.T_BaseModel], metric_name: str, metric_description: str, field_descriptions: dict[str, str], openai_chat_completion_create: OpenAIChatCompletionCreate | None = None, ) -> Result[ - Metric[str, common.CustomMetricPydanticObject[common.T_BaseModel]], str + TestSuiteMetric[ + str, + test_suite_common.CustomMetricPydanticObject[ + test_suite_common.T_BaseModel + ], + ], + str, ]: schema = pydantic_basemodel_type.model_json_schema() properties = schema["properties"] @@ -313,7 +330,7 @@ def _with_description(key: str, value: dict[str, str]) -> dict[str, str]: required = required or list(properties.keys()) - openai_eval_llm_chat_completion_create: common.CompletionTextToSerializedJSON = make_fn_completion_text_to_serialized_json( + openai_eval_llm_chat_completion_create: test_suite_common.CompletionTextToSerializedJSON = make_fn_completion_text_to_serialized_json( eval_llm_name=eval_llm_name, properties=properties, required=required, @@ -343,12 +360,17 @@ def _with_description(key: str, value: dict[str, str]) -> dict[str, str]: def make_openai_structured_llm_metric( eval_llm_name: str, - pydantic_basemodel_type: Type[common.T_BaseModel], + pydantic_basemodel_type: Type[test_suite_common.T_BaseModel], metric_name: str, metric_description: str, field_descriptions: dict[str, str] = {}, openai_chat_completion_create: OpenAIChatCompletionCreate | None = None, -) -> Metric[str, common.CustomMetricPydanticObject[common.T_BaseModel]]: +) -> TestSuiteMetric[ + str, + test_suite_common.CustomMetricPydanticObject[ + test_suite_common.T_BaseModel + ], +]: res_metric = _make_openai_structured_llm_metric_helper( eval_llm_name=eval_llm_name, pydantic_basemodel_type=pydantic_basemodel_type, @@ -403,7 +425,7 @@ def make_brevity(datum: str): gpt3_5_text_ratings = make_openai_structured_llm_metric( eval_llm_name="gpt-3.5-turbo-0613", - pydantic_basemodel_type=common.TextRatingsData, + pydantic_basemodel_type=test_suite_common.TextRatingsData, metric_name="text_ratings", metric_description="Text ratings", field_descriptions=dict( diff --git a/python/tests/test_eval.py b/python/tests/test_test_suite_eval.py similarity index 98% rename from python/tests/test_eval.py rename to python/tests/test_test_suite_eval.py index 929062fac..faae7fcdf 100644 --- a/python/tests/test_eval.py +++ b/python/tests/test_test_suite_eval.py @@ -11,11 +11,11 @@ import pytest from aiconfig.eval.api import ( TestSuiteWithInputsSettings, - metrics, + test_suite_metrics as metrics, run_test_suite_outputs_only, run_test_suite_with_inputs, ) -from aiconfig.eval.lib import ( +from aiconfig.eval.test_suite_lib import ( MetricList, TestSuiteGeneralSettings, TestSuiteWithInputsSpec, @@ -99,7 +99,7 @@ async def test_run_with_inputs_sanity_check(): path 
= os.path.join( current_dir(), - "../src/aiconfig/eval/examples/travel/travel_parametrized.aiconfig.json", + "../src/aiconfig/eval/test_suite_examples/travel/travel_parametrized.aiconfig.json" ) out = await run_test_suite_with_inputs( [], @@ -384,7 +384,6 @@ async def test_exception_metric(caplog: pytest.LogCaptureFixture): ) with caplog.at_level(logging.ERROR): df = await run_test_suite_outputs_only(user_test_suite_outputs_only) - print(df[["metric_name"]]) mapping: dict[str, Any] = df.query("metric_name=='brevity'").set_index("aiconfig_output").value.to_dict() # type: ignore assert mapping["Hundred Acre Wood"] == 17.0 assert pd.isnull(mapping[""]) # type: ignore diff --git a/python/tests/test_eval_model_graded_openai.py b/python/tests/test_test_suite_eval_model_graded_openai.py similarity index 85% rename from python/tests/test_eval_model_graded_openai.py rename to python/tests/test_test_suite_eval_model_graded_openai.py index 719f2cacc..e39a877c4 100644 --- a/python/tests/test_eval_model_graded_openai.py +++ b/python/tests/test_test_suite_eval_model_graded_openai.py @@ -5,13 +5,16 @@ import openai.types.chat.chat_completion as openai_chat_completion_types import openai.types.chat.chat_completion_message_tool_call as openai_tool_call_types import pytest -from aiconfig.eval import common -from aiconfig.eval.api import metrics, run_test_suite_outputs_only +from aiconfig.eval import test_suite_common +from aiconfig.eval.api import ( + test_suite_metrics as metrics, + run_test_suite_outputs_only, +) from result import Ok, Result def _mock_response( - function_args: common.SerializedJSON, + function_args: test_suite_common.SerializedJSON, ) -> openai_chat_types.ChatCompletion: return openai_chat_types.ChatCompletion( id="123", @@ -42,7 +45,7 @@ def _mock_response( def _make_mock_openai_chat_completion_create( - function_arguments_return: common.SerializedJSON, + function_arguments_return: test_suite_common.SerializedJSON, ) -> lib_openai.OpenAIChatCompletionCreate: def _mock_openai_chat_completion_create( completion_params: lib_openai.OpenAIChatCompletionParams, @@ -59,13 +62,13 @@ def _mock_openai_chat_completion_create( @pytest.mark.asyncio async def test_openai_structured_eval(): _mock_create = _make_mock_openai_chat_completion_create( - common.SerializedJSON( + test_suite_common.SerializedJSON( '{"conciseness_rating": 5, "conciseness_confidence": 0.9, "conciseness_reasoning": "I think it\'s pretty concise."}' ) ) mock_metric = metrics.make_openai_structured_llm_metric( eval_llm_name="gpt-3.5-turbo-0613", - pydantic_basemodel_type=common.TextRatingsData, + pydantic_basemodel_type=test_suite_common.TextRatingsData, metric_name="text_ratings", metric_description="Text ratings", field_descriptions=dict( @@ -81,10 +84,12 @@ async def test_openai_structured_eval(): ] df = await run_test_suite_outputs_only(user_test_suite_outputs_only) metric_data = cast( - common.CustomMetricPydanticObject[common.TextRatingsData], + test_suite_common.CustomMetricPydanticObject[ + test_suite_common.TextRatingsData + ], df.loc[0, "value"], ).data - assert isinstance(metric_data, common.TextRatingsData) + assert isinstance(metric_data, test_suite_common.TextRatingsData) metric_json = metric_data.to_dict() assert metric_json == { "conciseness_rating": 5, @@ -96,7 +101,7 @@ async def test_openai_structured_eval(): @pytest.mark.asyncio async def test_bad_structured_eval_metric(): _mock_create = _make_mock_openai_chat_completion_create( - common.SerializedJSON( + test_suite_common.SerializedJSON( 
'{"conciseness_rating": 5, "conciseness_confidence": 0.9, "conciseness_reasoning": "I think it\'s pretty concise."}' ) ) @@ -104,7 +109,7 @@ async def test_bad_structured_eval_metric(): with pytest.raises(ValueError) as exc: _ = metrics.make_openai_structured_llm_metric( eval_llm_name="gpt-3.5-turbo-0613", - pydantic_basemodel_type=common.TextRatingsData, + pydantic_basemodel_type=test_suite_common.TextRatingsData, metric_name="text_ratings", metric_description="Text ratings", field_descriptions=dict(