From 31c046dcdf8bf9d09fbe0c56a567de07e6e3b525 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 10 Oct 2024 11:35:26 -0700 Subject: [PATCH 01/27] evals new rebase --- llama_stack/apis/dataset/dataset.py | 16 +- llama_stack/apis/evals/client.py | 85 ++++++++ llama_stack/apis/evals/evals.py | 123 +++++------ llama_stack/distribution/registry/__init__.py | 5 + .../registry/datasets/__init__.py | 23 +++ .../distribution/registry/datasets/dataset.py | 62 ++++++ .../registry/datasets/dataset_registry.py | 32 +++ .../distribution/registry/tasks/__init__.py | 13 ++ .../distribution/registry/tasks/task.py | 49 +++++ .../registry/tasks/task_registry.py | 32 +++ llama_stack/distribution/resolver.py | 2 + llama_stack/providers/datatypes.py | 1 + .../impls/meta_reference/evals/__init__.py | 19 ++ .../impls/meta_reference/evals/config.py | 10 + .../impls/meta_reference/evals/evals.py | 71 +++++++ .../meta_reference/evals/tasks/__init__.py | 5 + .../meta_reference/evals/tasks/mmlu_task.py | 150 ++++++++++++++ .../impls/third_party/evals/__init__.py | 5 + .../third_party/evals/eleuther/__init__.py | 19 ++ .../third_party/evals/eleuther/config.py | 10 + .../third_party/evals/eleuther/eleuther.py | 168 +++++++++++++++ .../eleuther/tasks/meta_ifeval/ifeval.yaml | 32 +++ .../evals/eleuther/tasks/meta_ifeval/utils.py | 191 ++++++++++++++++++ .../mmlu_pro_5shot_cot_instruct.yaml | 29 +++ .../eleuther/tasks/meta_mmlu_pro/utils.py | 35 ++++ llama_stack/providers/registry/evals.py | 42 ++++ .../providers/utils/telemetry/tracing.py | 2 +- tests/examples/local-run.yaml | 5 + 28 files changed, 1145 insertions(+), 91 deletions(-) create mode 100644 llama_stack/apis/evals/client.py create mode 100644 llama_stack/distribution/registry/__init__.py create mode 100644 llama_stack/distribution/registry/datasets/__init__.py create mode 100644 llama_stack/distribution/registry/datasets/dataset.py create mode 100644 llama_stack/distribution/registry/datasets/dataset_registry.py create mode 100644 llama_stack/distribution/registry/tasks/__init__.py create mode 100644 llama_stack/distribution/registry/tasks/task.py create mode 100644 llama_stack/distribution/registry/tasks/task_registry.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/__init__.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/config.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/evals.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py create mode 100644 llama_stack/providers/impls/third_party/evals/__init__.py create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/__init__.py create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/config.py create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py create mode 100644 llama_stack/providers/registry/evals.py diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py index 2fa8bb4e5e..ba2cb88110 100644 --- 
a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/dataset/dataset.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from enum import Enum +# from enum import Enum from typing import Any, Dict, Optional, Protocol from llama_models.llama3.api.datatypes import URL @@ -14,22 +14,12 @@ from pydantic import BaseModel -@json_schema_type -class TrainEvalDatasetColumnType(Enum): - dialog = "dialog" - text = "text" - media = "media" - number = "number" - json = "json" - - @json_schema_type class TrainEvalDataset(BaseModel): """Dataset to be used for training or evaluating language models.""" - # TODO(ashwin): figure out if we need to add an enum for a "dataset type" - - columns: Dict[str, TrainEvalDatasetColumnType] + # unique identifier associated with the dataset + dataset_id: str content_url: URL metadata: Optional[Dict[str, Any]] = None diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py new file mode 100644 index 0000000000..ad4a471455 --- /dev/null +++ b/llama_stack/apis/evals/client.py @@ -0,0 +1,85 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import asyncio +import json + +import fire +import httpx +from termcolor import cprint + +from .evals import * # noqa: F403 + + +class EvaluationClient(Evals): + def __init__(self, base_url: str): + self.base_url = base_url + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def run_evals( + self, + model: str, + task: str, + dataset: Optional[str] = None, + eval_task_config: Optional[EvaluateTaskConfig] = None, + ) -> EvaluateResponse: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/evals/run", + json={ + "model": model, + "task": task, + "dataset": dataset, + "eval_task_config": ( + json.loads(eval_task_config.json()) + if eval_task_config + else None + ), + }, + headers={"Content-Type": "application/json"}, + timeout=3600, + ) + response.raise_for_status() + return EvaluateResponse(**response.json()) + + +async def run_main(host: str, port: int): + client = EvaluationClient(f"http://{host}:{port}") + + # CustomDataset + response = await client.run_evals( + model="Llama3.1-8B-Instruct", + dataset="mmlu-simple-eval-en", + task="mmlu", + eval_task_config=EvaluateTaskConfig( + n_samples=2, + ), + ) + cprint(f"evaluate response={response}", "green") + + # Eleuther Eval Task + # response = await client.run_evals( + # model="Llama3.1-8B-Instruct", + # task="meta_mmlu_pro_instruct", + # # task="meta_ifeval", + # eval_task_config=EvaluateTaskConfig( + # n_samples=2, + # ) + # ) + # cprint(response.metrics["metrics_table"], "red") + + +def main(host: str, port: int): + asyncio.run(run_main(host, port)) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 0be2243ab1..dbb1348a53 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -4,8 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from enum import Enum -from typing import List, Protocol +from typing import Protocol from llama_models.schema_utils import webmethod @@ -13,23 +12,6 @@ from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.dataset import * # noqa: F403 -from llama_stack.apis.common.training_types import * # noqa: F403 - - -class TextGenerationMetric(Enum): - perplexity = "perplexity" - rouge = "rouge" - bleu = "bleu" - - -class QuestionAnsweringMetric(Enum): - em = "em" - f1 = "f1" - - -class SummarizationMetric(Enum): - rouge = "rouge" - bleu = "bleu" class EvaluationJob(BaseModel): @@ -40,37 +22,21 @@ class EvaluationJobLogStream(BaseModel): job_uuid: str -class EvaluateTaskRequestCommon(BaseModel): - job_uuid: str - dataset: TrainEvalDataset - - checkpoint: Checkpoint - - # generation params +class EvaluateTaskConfig(BaseModel): + # num examples to evaluate, evaluate all if None + n_samples: Optional[int] = None + # model evaluation params sampling_params: SamplingParams = SamplingParams() @json_schema_type -class EvaluateTextGenerationRequest(EvaluateTaskRequestCommon): - """Request to evaluate text generation.""" - - metrics: List[TextGenerationMetric] - - -@json_schema_type -class EvaluateQuestionAnsweringRequest(EvaluateTaskRequestCommon): - """Request to evaluate question answering.""" +class EvaluateResponse(BaseModel): + """Scores for evaluation.""" - metrics: List[QuestionAnsweringMetric] + metrics: Dict[str, str] @json_schema_type -class EvaluateSummarizationRequest(EvaluateTaskRequestCommon): - """Request to evaluate summarization.""" - - metrics: List[SummarizationMetric] - - class EvaluationJobStatusResponse(BaseModel): job_uuid: str @@ -82,41 +48,44 @@ class EvaluationJobArtifactsResponse(BaseModel): job_uuid: str -class Evaluations(Protocol): - @webmethod(route="/evaluate/text_generation/") - def evaluate_text_generation( - self, - metrics: List[TextGenerationMetric], - ) -> EvaluationJob: ... - - @webmethod(route="/evaluate/question_answering/") - def evaluate_question_answering( - self, - metrics: List[QuestionAnsweringMetric], - ) -> EvaluationJob: ... - - @webmethod(route="/evaluate/summarization/") - def evaluate_summarization( - self, - metrics: List[SummarizationMetric], - ) -> EvaluationJob: ... - - @webmethod(route="/evaluate/jobs") - def get_evaluation_jobs(self) -> List[EvaluationJob]: ... - - @webmethod(route="/evaluate/job/status") - def get_evaluation_job_status( - self, job_uuid: str - ) -> EvaluationJobStatusResponse: ... +@json_schema_type +class EvaluationJobCreateResponse(BaseModel): + """Response to create a evaluation job.""" - # sends SSE stream of logs - @webmethod(route="/evaluate/job/logs") - def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ... + job_uuid: str - @webmethod(route="/evaluate/job/cancel") - def cancel_evaluation_job(self, job_uuid: str) -> None: ... - @webmethod(route="/evaluate/job/artifacts") - def get_evaluation_job_artifacts( - self, job_uuid: str - ) -> EvaluationJobArtifactsResponse: ... +class Evals(Protocol): + @webmethod(route="/evals/run") + async def run_evals( + self, + model: str, + task: str, + dataset: Optional[str] = None, + eval_task_config: Optional[EvaluateTaskConfig] = None, + ) -> EvaluateResponse: ... + + # @webmethod(route="/evals/jobs") + # def get_evaluation_jobs(self) -> List[EvaluationJob]: ... + + # @webmethod(route="/evals/job/create") + # async def create_evaluation_job( + # self, model: str, dataset: str, task: str + # ) -> EvaluationJob: ... 
+ + # @webmethod(route="/evals/job/status") + # def get_evaluation_job_status( + # self, job_uuid: str + # ) -> EvaluationJobStatusResponse: ... + + # # sends SSE stream of logs + # @webmethod(route="/evals/job/logs") + # def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ... + + # @webmethod(route="/evals/job/cancel") + # def cancel_evaluation_job(self, job_uuid: str) -> None: ... + + # @webmethod(route="/evals/job/artifacts") + # def get_evaluation_job_artifacts( + # self, job_uuid: str + # ) -> EvaluationJobArtifactsResponse: ... diff --git a/llama_stack/distribution/registry/__init__.py b/llama_stack/distribution/registry/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/llama_stack/distribution/registry/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py new file mode 100644 index 0000000000..0b7a843953 --- /dev/null +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +# TODO: make these import config based +from .dataset import CustomDataset, HFDataset +from .dataset_registry import DatasetRegistry + +DATASETS_REGISTRY = { + "mmlu-simple-eval-en": CustomDataset( + name="mmlu_eval", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ), + "hellaswag": HFDataset( + name="hellaswag", + url="hf://hellaswag?split=validation&trust_remote_code=True", + ), +} + +for k, v in DATASETS_REGISTRY.items(): + DatasetRegistry.register(k, v) diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py new file mode 100644 index 0000000000..1a16a5c51b --- /dev/null +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -0,0 +1,62 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
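For orientation, a minimal sketch of how an extra dataset could be registered against the registry initialized above, assuming the CustomDataset and DatasetRegistry classes added in this patch; the dataset name and URL below are invented placeholders.

# Illustrative only (not part of this patch): registering an additional CSV-backed dataset.
from llama_stack.distribution.registry.datasets.dataset import CustomDataset
from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry

my_eval_set = CustomDataset(
    name="my-eval-set",                      # unique dataset identifier (placeholder)
    url="https://example.com/my_eval.csv",   # placeholder URL; any .csv or .xlsx works
)
DatasetRegistry.register("my-eval-set", my_eval_set)

# A provider can later look the dataset up by name and load it lazily:
ds = DatasetRegistry.get_dataset("my-eval-set")
ds.load()
print(len(ds.dataset))  # underlying HuggingFace Dataset built from the CSV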
+ +from abc import ABC, abstractmethod +from urllib.parse import parse_qs, urlparse + +import pandas +from datasets import Dataset, load_dataset + + +class BaseDataset(ABC): + def __init__(self, name: str): + self.dataset = None + self.dataset_id = name + self.type = self.__class__.__name__ + + def __iter__(self): + return iter(self.dataset) + + @abstractmethod + def load(self): + pass + + +class CustomDataset(BaseDataset): + def __init__(self, name, url): + super().__init__(name) + self.url = url + + def load(self): + if self.dataset: + return + # TODO: better support w/ data url + if self.url.endswith(".csv"): + df = pandas.read_csv(self.url) + elif self.url.endswith(".xlsx"): + df = pandas.read_excel(self.url) + + self.dataset = Dataset.from_pandas(df) + + +class HFDataset(BaseDataset): + def __init__(self, name, url): + super().__init__(name) + self.url = url + + def load(self): + if self.dataset: + return + + parsed = urlparse(self.url) + + if parsed.scheme != "hf": + raise ValueError(f"Unknown HF dataset: {self.url}") + + query = parse_qs(parsed.query) + query = {k: v[0] for k, v in query.items()} + path = parsed.netloc + self.dataset = load_dataset(path, **query) diff --git a/llama_stack/distribution/registry/datasets/dataset_registry.py b/llama_stack/distribution/registry/datasets/dataset_registry.py new file mode 100644 index 0000000000..9ddaa8bb7a --- /dev/null +++ b/llama_stack/distribution/registry/datasets/dataset_registry.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import AbstractSet, Dict + +from .dataset import BaseDataset + + +class DatasetRegistry: + _REGISTRY: Dict[str, BaseDataset] = {} + + @staticmethod + def names() -> AbstractSet[str]: + return DatasetRegistry._REGISTRY.keys() + + @staticmethod + def register(name: str, task: BaseDataset) -> None: + if name in DatasetRegistry._REGISTRY: + raise ValueError(f"Dataset {name} already exists.") + DatasetRegistry._REGISTRY[name] = task + + @staticmethod + def get_dataset(name: str) -> BaseDataset: + if name not in DatasetRegistry._REGISTRY: + raise ValueError(f"Dataset {name} not found.") + return DatasetRegistry._REGISTRY[name] + + @staticmethod + def reset() -> None: + DatasetRegistry._REGISTRY = {} diff --git a/llama_stack/distribution/registry/tasks/__init__.py b/llama_stack/distribution/registry/tasks/__init__.py new file mode 100644 index 0000000000..01ccb18aee --- /dev/null +++ b/llama_stack/distribution/registry/tasks/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +# TODO: make these import config based +from llama_stack.providers.impls.meta_reference.evals.tasks.mmlu_task import MMLUTask +from .task_registry import TaskRegistry + +TaskRegistry.register( + "mmlu", + MMLUTask, +) diff --git a/llama_stack/distribution/registry/tasks/task.py b/llama_stack/distribution/registry/tasks/task.py new file mode 100644 index 0000000000..a92e6241b6 --- /dev/null +++ b/llama_stack/distribution/registry/tasks/task.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
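As a side note on the hf:// convention consumed by HFDataset.load() above, here is a small standard-library-only sketch of how such a URL decomposes into load_dataset arguments, reusing the hellaswag entry from the registry.

# Illustration of the hf:// URL parsing done by HFDataset.load(); stdlib only.
from urllib.parse import parse_qs, urlparse

url = "hf://hellaswag?split=validation&trust_remote_code=True"
parsed = urlparse(url)
kwargs = {k: v[0] for k, v in parse_qs(parsed.query).items()}

print(parsed.netloc)  # "hellaswag" -> passed to datasets.load_dataset as the path
print(kwargs)         # {"split": "validation", "trust_remote_code": "True"}
                      # note: query values arrive as strings, not booleans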
+from abc import ABC, abstractmethod + + +class BaseTask(ABC): + """ + A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. + Base class for all evaluation tasks. Each task needs to implement the following methods: + - F1: preprocess_sample(self) + - F2: postprocess_sample(self) + - F3: score_sample(self) + """ + + def __init__(self, dataset, *args, **kwargs): + super().__init__(*args, **kwargs) + self._name = self.__class__.__name__ + self.dataset = dataset + + @abstractmethod + def preprocess_sample(self, sample): + raise NotImplementedError() + + @abstractmethod + def postprocess_sample(self, sample): + raise NotImplementedError() + + @abstractmethod + def score_sample(self, sample, ground_truth): + raise NotImplementedError() + + @abstractmethod + def aggregate_results(self, eval_results): + raise NotImplementedError() + + def preprocess(self): + return [self.preprocess_sample(sample) for sample in self.dataset] + + def postprocess(self, generation): + return [self.postprocess_sample(sample) for sample in generation] + + def score(self, postprocessed): + return [ + self.score_sample(sample, ground_truth) + for sample, ground_truth in zip(postprocessed, self.dataset) + ] diff --git a/llama_stack/distribution/registry/tasks/task_registry.py b/llama_stack/distribution/registry/tasks/task_registry.py new file mode 100644 index 0000000000..063894e482 --- /dev/null +++ b/llama_stack/distribution/registry/tasks/task_registry.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import AbstractSet, Dict + +from .task import BaseTask + + +class TaskRegistry: + _REGISTRY: Dict[str, BaseTask] = {} + + @staticmethod + def names() -> AbstractSet[str]: + return TaskRegistry._REGISTRY.keys() + + @staticmethod + def register(name: str, task: BaseTask) -> None: + if name in TaskRegistry._REGISTRY: + raise ValueError(f"Task {name} already exists.") + TaskRegistry._REGISTRY[name] = task + + @staticmethod + def get_task(name: str) -> BaseTask: + if name not in TaskRegistry._REGISTRY: + raise ValueError(f"Task {name} not found.") + return TaskRegistry._REGISTRY[name] + + @staticmethod + def reset() -> None: + TaskRegistry._REGISTRY = {} diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index a05e08cd7c..672a4ea60f 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -12,6 +12,7 @@ from llama_stack.distribution.datatypes import * # noqa: F403 from llama_stack.apis.agents import Agents +from llama_stack.apis.evals import Evals from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.memory import Memory @@ -38,6 +39,7 @@ def api_protocol_map() -> Dict[Api, Any]: Api.safety: Safety, Api.shields: Shields, Api.telemetry: Telemetry, + Api.evals: Evals, } diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index 777cd855b7..50ab0691b9 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -28,6 +28,7 @@ class Api(Enum): models = "models" shields = "shields" memory_banks = "memory_banks" + evals = "evals" # built-in API inspect = "inspect" diff --git a/llama_stack/providers/impls/meta_reference/evals/__init__.py 
b/llama_stack/providers/impls/meta_reference/evals/__init__.py new file mode 100644 index 0000000000..f4dd4b79d6 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .config import MetaReferenceEvalsImplConfig # noqa +from llama_stack.apis.inference import * # noqa: F403 +from llama_stack.distribution.datatypes import Api, ProviderSpec + + +async def get_provider_impl( + config: MetaReferenceEvalsImplConfig, deps: Dict[Api, ProviderSpec] +): + from .evals import MetaReferenceEvalsImpl + + impl = MetaReferenceEvalsImpl(config, deps[Api.inference]) + await impl.initialize() + return impl diff --git a/llama_stack/providers/impls/meta_reference/evals/config.py b/llama_stack/providers/impls/meta_reference/evals/config.py new file mode 100644 index 0000000000..05dee366ed --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/config.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pydantic import BaseModel + + +class MetaReferenceEvalsImplConfig(BaseModel): ... diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py new file mode 100644 index 0000000000..5f475c5395 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
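For context on how the task registry above is meant to be used, a hedged sketch of registering a custom task next to the built-in "mmlu" entry; the ExactMatchTask class, its row keys, and the sample row are invented for illustration.

# Hypothetical example (not part of this patch): defining and registering a trivial task.
from llama_stack.apis.evals import *  # noqa: F403
from llama_stack.distribution.registry.tasks.task import BaseTask
from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry


class ExactMatchTask(BaseTask):
    def preprocess_sample(self, sample):
        # assumes each dataset row carries "question" / "answer" keys
        return {"role": "user", "content": sample["question"]}

    def postprocess_sample(self, sample):
        return sample.strip().lower()

    def score_sample(self, sample, ground_truth):
        return 1.0 if sample == ground_truth["answer"].strip().lower() else 0.0

    def aggregate_results(self, eval_results):
        return EvaluateResponse(
            metrics={"accuracy": str(sum(eval_results) / len(eval_results))}
        )


TaskRegistry.register("exact_match", ExactMatchTask)

# The registry stores the class; providers instantiate it with a loaded dataset:
task = TaskRegistry.get_task("exact_match")(
    dataset=[{"question": "1+1?", "answer": "2"}]
)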
+
+from llama_stack.apis.inference import *  # noqa: F403
+from llama_stack.apis.evals import *  # noqa: F403
+from termcolor import cprint
+
+from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry
+
+from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry
+
+from .config import MetaReferenceEvalsImplConfig
+
+
+class MetaReferenceEvalsImpl(Evals):
+    def __init__(self, config: MetaReferenceEvalsImplConfig, inference_api: Inference):
+        self.inference_api = inference_api
+
+    async def initialize(self) -> None:
+        pass
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def run_evals(
+        self,
+        model: str,
+        task: str,
+        dataset: Optional[str] = None,
+        eval_task_config: Optional[EvaluateTaskConfig] = None,
+    ) -> EvaluateResponse:
+        cprint(
+            f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}",
+            "red",
+        )
+        if not dataset:
+            raise ValueError("dataset must be specified for meta-reference evals")
+
+        dataset = DatasetRegistry.get_dataset(dataset)
+        dataset.load()
+
+        task_impl = TaskRegistry.get_task(task)(dataset)
+        x1 = task_impl.preprocess()
+
+        # TODO: replace w/ batch inference & async return eval job
+        generation_outputs = []
+        if eval_task_config is None:
+            eval_task_config = EvaluateTaskConfig(n_samples=len(x1))
+        if eval_task_config.n_samples is None or eval_task_config.n_samples > len(x1):
+            eval_task_config.n_samples = len(x1)
+
+        print(
+            f"Eval generation start, generate on {eval_task_config.n_samples} samples"
+        )
+
+        for msg in x1[: eval_task_config.n_samples]:
+            print("generation for msg: ", msg)
+            response = await self.inference_api.chat_completion(
+                model=model,
+                messages=[msg],
+                stream=False,
+            )
+            generation_outputs.append(response.completion_message.content)
+
+        x2 = task_impl.postprocess(generation_outputs)
+        eval_results = task_impl.score(x2)
+        eval_response = task_impl.aggregate_results(eval_results)
+        return eval_response
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py b/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py
new file mode 100644
index 0000000000..756f351d88
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py
new file mode 100644
index 0000000000..673a953791
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py
@@ -0,0 +1,150 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import re
+
+from llama_stack.apis.evals import *  # noqa: F403
+from llama_stack.distribution.registry.tasks.task import BaseTask
+
+QUERY_TEMPLATE_MULTICHOICE = """
+Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD.
+ +{Question} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + +MULTILINGUAL_ANSWER_REGEXES = [ + r"Answer\s*:", + r"Answer\s*:​​​​​​", # Korean invisible character + r"উত্তর\s*:", + r"उत्तर\s*:", + r"উত্তরঃ", + r"উত্তর\s*:", + r"Antwort\s*:", + r"답변\s*:", + r"정답\s*:", + r"답\s*:", + r"答案\s*:", + r"答案\s*:", + r"答\s*:", + r"答\s*:", + r"答复\s*:", + r"答曰\s*:", + r"الإجابة:", + r"الجواب:", + r"إجابة:", + r"الإجابة النهائية:", + r"الإجابة الصحيحة:", + r"الإجابة الصحيحة هي:", + r"الإجابة هي:", + r"Respuesta\s*:", + r"Risposta\s*:", + r"答え\s*:", + r"答え\s*:", + r"回答\s*:", + r"回答\s*:", + r"解答\s*:", + r"Jawaban\s*:", + r"Réponse\s*:", + r"Resposta\s*:", + r"Jibu\s*:", + r"Idahun\s*:", + r"Ìdáhùn\s*:", + r"Idáhùn\s*:", + r"Àmọ̀nà\s*:", + r"Àdáhùn\s*:", + r"Ànúgọ\s*:", + r"Àṣàyàn\s*:", +] + +MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = ( + r"(?i){}\s*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[A]|[B]|[C]|[D])" +) + + +def normalize_response(response: str) -> str: + """ + Normalize the response by removing markdown and LaTeX formatting that may prevent a match. + """ + + return ( + response.replace("**", "") + .replace("$\\boxed{", "") + .replace("}$", "") + .replace("\\$", "") + .replace("$\\text{", "") + .replace("$", "") + .replace("\\mathrm{", "") + .replace("\\{", "") + .replace("\\text", "") + .replace("\\(", "") + .replace("\\mathbf{", "") + .replace("{", "") + .replace("\\boxed", "") + ) + + +def normalize_extracted_answer(extracted_answer: str) -> str: + return ( + # In arabic these are the letters used for A-D in multiple choice questions + extracted_answer.replace("أ", " A") + .replace("ب", " B") + .replace("ج", " C") + .replace("د", " D") + # In Bengali these are the letters used for A-D in multiple choice questions + .replace("অ", " A") + .replace("ব", " B") + .replace("ড", " C") + .replace("ঢ", " D") + # In Japanese these are the letters sometimes used for A-D in multiple choice questions + .replace("A", " A") + .replace("B", " B") + .replace("C", " C") + .replace("D", " D") + .strip() + ) + + +class MMLUTask(BaseTask): + """ + MMLU Task. + """ + + def __init__(self, dataset, *args, **kwargs): + super().__init__(dataset, *args, **kwargs) + + def preprocess_sample(self, sample): + content = QUERY_TEMPLATE_MULTICHOICE.format(**sample) + return { + "role": "user", + "content": content, + } + + def postprocess_sample(self, sample): + normalized = normalize_response(sample) + return normalized + + def score_sample(self, sample, expected): + extracted_answer = None + for answer_regex in MULTILINGUAL_ANSWER_REGEXES: + regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) + match = re.search(regex, sample) + if match: + extracted_answer = normalize_extracted_answer(match.group(1)) + break + score = ( + 1.0 if extracted_answer and extracted_answer == expected["Answer"] else 0.0 + ) + # TODO: generalize this into SingleEvalResult + return score + + def aggregate_results(self, eval_results): + return EvaluateResponse( + metrics={"score": str(sum(eval_results) / len(eval_results))} + ) diff --git a/llama_stack/providers/impls/third_party/evals/__init__.py b/llama_stack/providers/impls/third_party/evals/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
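To make the MMLU scoring path above concrete, a short walkthrough of one sample through preprocess, postprocess, and score; the question row and the model output are fabricated purely for illustration.

# Hypothetical walkthrough (not part of this patch) of MMLUTask on a single made-up row.
from llama_stack.providers.impls.meta_reference.evals.tasks.mmlu_task import MMLUTask

row = {
    "Question": "What is the capital of France?",
    "A": "Berlin",
    "B": "Paris",
    "C": "Rome",
    "D": "Madrid",
    "Answer": "B",
}
task = MMLUTask(dataset=[row])

message = task.preprocess_sample(row)        # chat message that would go to the inference API
raw_output = "Thinking about capitals...\nAnswer: B"   # pretend model generation
cleaned = task.postprocess_sample(raw_output)           # strips markdown/LaTeX wrappers
print(task.score_sample(cleaned, row))                  # 1.0, extracted letter matches row["Answer"]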
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py b/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py new file mode 100644 index 0000000000..9886ed6d6c --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .config import EleutherEvalsImplConfig # noqa +from llama_stack.apis.inference import * # noqa: F403 +from llama_stack.distribution.datatypes import Api, ProviderSpec + + +async def get_provider_impl( + config: EleutherEvalsImplConfig, deps: Dict[Api, ProviderSpec] +): + from .eleuther import EleutherEvalsAdapter + + impl = EleutherEvalsAdapter(config, deps[Api.inference]) + await impl.initialize() + return impl diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/config.py b/llama_stack/providers/impls/third_party/evals/eleuther/config.py new file mode 100644 index 0000000000..a9ab297b42 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/config.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pydantic import BaseModel + + +class EleutherEvalsImplConfig(BaseModel): ... diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py new file mode 100644 index 0000000000..b9f9505e93 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py @@ -0,0 +1,168 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import asyncio +from llama_stack.apis.inference import * # noqa: F403 +from llama_stack.apis.evals import * # noqa: F403 +import os +import random +import threading +from pathlib import Path + +import lm_eval +import tqdm +from lm_eval.api.model import LM +from lm_eval.evaluator import evaluate, get_task_list +from lm_eval.tasks import get_task_dict, TaskManager +from termcolor import cprint + +from .config import EleutherEvalsImplConfig + + +# https://stackoverflow.com/questions/74703727/how-to-call-async-function-from-sync-funcion-and-get-result-while-a-loop-is-alr +# We will use another thread wih its own event loop to run the async api within sync function +_loop = asyncio.new_event_loop() +_thr = threading.Thread(target=_loop.run_forever, name="Async Runner", daemon=True) + + +class EleutherEvalsWrapper(LM): + def __init__( + self, + inference_api: Inference, + model: str, + **kwargs, + ): + super().__init__(**kwargs) + self.inference_api = inference_api + self.model = model + self.tokenizer = None + self.tokenized_requests = False + self.kwargs = kwargs + + @property + def eot_token_id(self): + raise NotImplementedError("Not implemented") + + @property + def max_length(self) -> int: + return NotImplementedError("Not implemented") + + @property + def max_gen_toks(self) -> int: + return NotImplementedError("Not implemented") + + @property + def batch_size(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError("No support for logits.") + + @property + def device(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError("No support for logits.") + + @property + def world_size(self): + return 1 + + def tok_encode(self, string: str) -> List[int]: + return NotImplementedError("Not implemented") + + def tok_decode(self, tokens: List[int]) -> str: + return NotImplementedError("Not implemented") + + def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("No support for logits.") + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override generate_until + raise NotImplementedError() + + def loglikelihood(self, requests, disable_tqdm: bool = False): + # TODO: implement inference completion with loglikelihood + res = [] + for req in requests: + res.append((-random.random(), False)) + + return res + + def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("No support for logits.") + + def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: + res = [] + if not _thr.is_alive(): + _thr.start() + for req in tqdm.tqdm(requests): + chat_completion_coro_fn = self.inference_api.chat_completion( + model=self.model, + messages=[ + { + "role": "user", + "content": req.args[0], + } + ], + stream=False, + ) + future = asyncio.run_coroutine_threadsafe(chat_completion_coro_fn, _loop) + response = future.result() + res.append(response.completion_message.content) + + return res + + +class EleutherEvalsAdapter(Evals): + def __init__(self, config: EleutherEvalsImplConfig, inference_api: Inference): + self.inference_api = inference_api + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def run_evals( + self, + model: str, + task: str, + dataset: Optional[str] = None, + eval_task_config: Optional[EvaluateTaskConfig] = None, + ) -> 
EvaluateResponse: + cprint(f"Eleuther Evals: {model} {dataset} {task}", "red") + + eluther_wrapper = EleutherEvalsWrapper(self.inference_api, model) + current_dir = Path(os.path.dirname(os.path.abspath(__file__))) + + # custom registry of harness tasks + task_manager = TaskManager( + include_path=str(current_dir / "tasks"), + ) + + task_dict = get_task_dict(task, task_manager) + cprint(task_dict, "blue") + + task_types = set([t.task.OUTPUT_TYPE for t in get_task_list(task_dict)]) + cprint(task_types, "cyan") + + output = evaluate( + eluther_wrapper, + task_dict, + limit=eval_task_config.n_samples, + ) + + formatted_output = lm_eval.utils.make_table(output) + + cprint(formatted_output, "green") + + return EvaluateResponse( + metrics={ + "metrics_table": formatted_output, + }, + ) diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml new file mode 100644 index 0000000000..e10277a314 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml @@ -0,0 +1,32 @@ +task: meta_ifeval +dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals +dataset_name: Llama-3.1-8B-Instruct-evals__ifeval__strict__details +output_type: generate_until +test_split: latest +process_docs: !function utils.process_docs +num_fewshot: 0 +doc_to_text: prompt +doc_to_target: 0 +generation_kwargs: + until: [] + do_sample: false + temperature: 0.0 + max_gen_toks: 1280 +process_results: !function utils.process_results +metric_list: + - metric: prompt_level_strict_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_strict_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true + - metric: prompt_level_loose_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_loose_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true +metadata: + version: 2.0 +fewshot_config: + sampler: first_n diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py new file mode 100644 index 0000000000..aa171343fd --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py @@ -0,0 +1,191 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
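For readers who want to exercise the Eleuther-backed provider above end to end, a hedged sketch that mirrors the commented-out example in the client added earlier in this patch; the endpoint, port, and model name are placeholders and assume a running llama-stack server with this provider configured.

# Mirrors the commented-out Eleuther example in llama_stack/apis/evals/client.py.
import asyncio

from llama_stack.apis.evals.client import EvaluationClient
from llama_stack.apis.evals.evals import EvaluateTaskConfig


async def main() -> None:
    client = EvaluationClient("http://localhost:5000")  # placeholder endpoint
    response = await client.run_evals(
        model="Llama3.1-8B-Instruct",
        task="meta_mmlu_pro_instruct",   # or "meta_ifeval"; both ship as YAML tasks here
        eval_task_config=EvaluateTaskConfig(n_samples=2),
    )
    # The Eleuther adapter returns lm-eval's formatted results table as a metric.
    print(response.metrics["metrics_table"])


asyncio.run(main())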
+ +import dataclasses +from typing import Dict, Optional, Union + +import datasets + +from lm_eval.tasks.ifeval import instructions_registry + + +@dataclasses.dataclass +class InputExample: + key: int + instruction_id_list: list[str] + prompt: str + kwargs: list[Dict[str, Optional[Union[str, int]]]] + + +@dataclasses.dataclass +class OutputExample: + instruction_id_list: list[str] + prompt: str + response: str + follow_all_instructions: bool + follow_instruction_list: list[bool] + + +def test_instruction_following_strict( + inp, + response, +): + """Tests response to see if instructions are followed.""" + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. + kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + if response.strip() and instruction.check_following(response): + is_following_list.append(True) + else: + is_following_list.append(False) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def test_instruction_following_loose( + inp, + response, +): + """Tests response for an upper bound for following instructions.""" + r = response.split("\n") + response_remove_first = "\n".join(r[1:]).strip() + response_remove_last = "\n".join(r[:-1]).strip() + response_remove_both = "\n".join(r[1:-1]).strip() + revised_response = response.replace("*", "") + revised_response_remove_first = response_remove_first.replace("*", "") + revised_response_remove_last = response_remove_last.replace("*", "") + revised_response_remove_both = response_remove_both.replace("*", "") + all_responses = [ + response, + revised_response, + response_remove_first, + response_remove_last, + response_remove_both, + revised_response_remove_first, + revised_response_remove_last, + revised_response_remove_both, + ] + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. 
+ kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + is_following = False + for r in all_responses: + if r.strip() and instruction.check_following(r): + is_following = True + break + + is_following_list.append(is_following) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def process_results(doc, results): + new_kwargs = [] + for item in doc["kwargs"]: + if item["nth_paragraph"]: + item["nth_paragraph"] = int(item["nth_paragraph"]) + new_kwargs.append(item) + inp = InputExample( + key=doc["key"], + instruction_id_list=doc["instruction_id_list"], + prompt=doc["prompt"], + kwargs=new_kwargs, + ) + response = results[0] + + out_strict = test_instruction_following_strict(inp, response) + out_loose = test_instruction_following_loose(inp, response) + + return { + "prompt_level_strict_acc": out_strict.follow_all_instructions, + "inst_level_strict_acc": out_strict.follow_instruction_list, + "prompt_level_loose_acc": out_loose.follow_all_instructions, + "inst_level_loose_acc": out_loose.follow_instruction_list, + } + + +def agg_inst_level_acc(items): + flat_items = [item for sublist in items for item in sublist] + inst_level_acc = sum(flat_items) / len(flat_items) + return inst_level_acc + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _get_question(example: dict) -> dict: + # get the question from the ifeval dataset + example["input_question"] = ( + eval( + example["input_question"] + .replace("null", "None") + .replace("true", "True") + .replace("false", "False") + )["dialog"][0]["body"] + .replace("Is it True that the first song", "Is it true that the first song") + .replace("Is the following True", "Is the following true") + ) + example["input_final_prompts"] = example["input_final_prompts"][0] + return example + + original_dataset_name = "wis-k/instruction-following-eval" + ifeval_data = datasets.load_dataset(original_dataset_name, split="train") + ifeval_df = ifeval_data.to_pandas() + ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"}) + + meta_dataset = dataset.map(_get_question) + meta_df = meta_dataset.to_pandas() + + # join the two datasets on the input_question column + joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question") + joined = joined.rename(columns={"input_final_prompts": "prompt"}) + joined = joined.rename(columns={"is_correct": "previous_is_correct"}) + joined = datasets.Dataset.from_pandas(joined) + joined = joined.select_columns( + [ + "input_question", + "prompt", + "previous_is_correct", + "instruction_id_list", + "kwargs", + "output_prediction_text", + "key", + ] + ) + joined.rename_column("output_prediction_text", "previous_output_prediction_text") + return joined diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml new file mode 100644 index 0000000000..1ec3c107d8 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml @@ -0,0 +1,29 @@ +task: meta_mmlu_pro_instruct +dataset_path: 
meta-llama/Llama-3.1-8B-Instruct-evals +dataset_name: Llama-3.1-8B-Instruct-evals__mmlu_pro__details +test_split: latest +output_type: generate_until +process_docs: !function utils.process_docs +doc_to_text: !function utils.doc_to_text +doc_to_target: gold +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: 'best answer is ([A-Z])' + - function: "take_first" +generation_kwargs: + until: [] + do_sample: false + temperature: 0 + max_gen_toks: 1024 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py new file mode 100644 index 0000000000..6b8bc3e5b2 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import datasets + + +def doc_to_text(doc: dict) -> str: + return doc["input_final_prompts"][0] + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "gold": doc["input_correct_responses"][0], + } + return out_doc + + dataset = dataset.select_columns( + [ + "input_question", + "input_correct_responses", + "input_final_prompts", + "is_correct", + "input_question_hash", + "input_choice_list", + "output_prediction_text", + ], + ) + dataset = dataset.rename_column("is_correct", "previously_is_correct") + dataset = dataset.map(_process_doc) + return dataset diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py new file mode 100644 index 0000000000..8693ec603a --- /dev/null +++ b/llama_stack/providers/registry/evals.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import List + +from llama_stack.distribution.datatypes import * # noqa: F403 + + +def available_providers() -> List[ProviderSpec]: + return [ + InlineProviderSpec( + api=Api.evals, + provider_type="meta-reference", + pip_packages=[ + "matplotlib", + "pillow", + "pandas", + "scikit-learn", + "datasets", + ], + module="llama_stack.providers.impls.meta_reference.evals", + config_class="llama_stack.providers.impls.meta_reference.evals.MetaReferenceEvalsImplConfig", + api_dependencies=[ + Api.inference, + ], + ), + InlineProviderSpec( + api=Api.evals, + provider_type="eleuther", + pip_packages=[ + "lm-eval", + ], + module="llama_stack.providers.impls.third_party.evals.eleuther", + config_class="llama_stack.providers.impls.third_party.evals.eleuther.EleutherEvalsImplConfig", + api_dependencies=[ + Api.inference, + ], + ), + ] diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index 9fffc0f99a..2070649043 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -152,7 +152,7 @@ def severity(levelname: str) -> LogSeverity: elif levelname == "INFO": return LogSeverity.INFO elif levelname == "WARNING": - return LogSeverity.WARNING + return LogSeverity.WARN elif levelname == "ERROR": return LogSeverity.ERROR elif levelname == "CRITICAL": diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml index e12f6e8528..1422d6ee20 100644 --- a/tests/examples/local-run.yaml +++ b/tests/examples/local-run.yaml @@ -11,7 +11,12 @@ apis: - memory_banks - inference - safety +- evals providers: + evals: + - provider_id: meta-reference + provider_type: meta-reference + config: {} inference: - provider_id: meta-reference provider_type: meta-reference From c8de439d9fc7303e704fa88002457a08dfb0674d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 10 Oct 2024 11:38:37 -0700 Subject: [PATCH 02/27] clean --- llama_stack/apis/dataset/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py index ba2cb88110..8ab135b6a5 100644 --- a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/dataset/dataset.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# from enum import Enum from typing import Any, Dict, Optional, Protocol from llama_models.llama3.api.datatypes import URL From 99ed1425fc4db16973fc6224e22caeeb9f2b19dc Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 10 Oct 2024 17:19:18 -0700 Subject: [PATCH 03/27] add dataset datatypes --- llama_stack/apis/dataset/dataset.py | 95 +++++++++++++++---- llama_stack/apis/evals/evals.py | 1 + .../registry/datasets/__init__.py | 28 +++--- .../distribution/registry/datasets/dataset.py | 94 +++++++++++------- .../registry/datasets/dataset_registry.py | 2 +- 5 files changed, 154 insertions(+), 66 deletions(-) diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py index 8ab135b6a5..164e16be44 100644 --- a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/dataset/dataset.py @@ -4,46 +4,105 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Any, Dict, Optional, Protocol - -from llama_models.llama3.api.datatypes import URL +from abc import ABC, abstractmethod +from enum import Enum +from typing import Any, Dict, Generic, Iterator, Literal, Protocol, TypeVar, Union from llama_models.schema_utils import json_schema_type, webmethod -from pydantic import BaseModel +from pydantic import BaseModel, Field +from typing_extensions import Annotated + +TDatasetRow = TypeVar("TDatasetRow") @json_schema_type -class TrainEvalDataset(BaseModel): - """Dataset to be used for training or evaluating language models.""" +class DatasetRow(BaseModel): ... + - # unique identifier associated with the dataset - dataset_id: str - content_url: URL - metadata: Optional[Dict[str, Any]] = None +@json_schema_type +class DictSample(DatasetRow): + data: Dict[str, Any] @json_schema_type -class CreateDatasetRequest(BaseModel): - """Request to create a dataset.""" +class Generation(BaseModel): ... + - uuid: str - dataset: TrainEvalDataset +@json_schema_type +class DatasetType(Enum): + custom = "custom" + huggingface = "huggingface" + + +@json_schema_type +class HuggingfaceDatasetDef(BaseModel): + type: Literal[DatasetType.huggingface.value] = DatasetType.huggingface.value + identifier: str = Field( + description="A unique name for the dataset", + ) + dataset_name: str = Field( + description="The name of the dataset into HF (e.g. hellawag)", + ) + kwargs: Dict[str, Any] = Field( + description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)", + default_factory=dict, + ) + + +@json_schema_type +class CustomDatasetDef(BaseModel): + type: Literal[DatasetType.custom.value] = DatasetType.custom.value + identifier: str = Field( + description="A unique name for the dataset", + ) + url: str = Field( + description="The URL to the dataset", + ) + + +DatasetDef = Annotated[ + Union[ + HuggingfaceDatasetDef, + CustomDatasetDef, + ], + Field(discriminator="type"), +] + + +class BaseDataset(ABC, Generic[TDatasetRow]): + def __init__(self) -> None: + self.type: str = self.__class__.__name__ + + @abstractmethod + def __iter__(self) -> Iterator[TDatasetRow]: + raise NotImplementedError() + + @abstractmethod + def load(self) -> None: + raise NotImplementedError() + + @abstractmethod + def __str__(self) -> str: + raise NotImplementedError() + + @abstractmethod + def __len__(self) -> int: + raise NotImplementedError() class Datasets(Protocol): @webmethod(route="/datasets/create") def create_dataset( self, - uuid: str, - dataset: TrainEvalDataset, + dataset: DatasetDef, ) -> None: ... @webmethod(route="/datasets/get") def get_dataset( self, - dataset_uuid: str, - ) -> TrainEvalDataset: ... + dataset_identifier: str, + ) -> DatasetDef: ... @webmethod(route="/datasets/delete") def delete_dataset( diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index dbb1348a53..629e68d32b 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -33,6 +33,7 @@ class EvaluateTaskConfig(BaseModel): class EvaluateResponse(BaseModel): """Scores for evaluation.""" + preprocess_output: GenerationOutput metrics: Dict[str, str] diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py index 0b7a843953..3a60d6a5e7 100644 --- a/llama_stack/distribution/registry/datasets/__init__.py +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -5,19 +5,19 @@ # the root directory of this source tree. 
# TODO: make these import config based -from .dataset import CustomDataset, HFDataset -from .dataset_registry import DatasetRegistry +# from .dataset import CustomDataset, HFDataset +# from .dataset_registry import DatasetRegistry -DATASETS_REGISTRY = { - "mmlu-simple-eval-en": CustomDataset( - name="mmlu_eval", - url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", - ), - "hellaswag": HFDataset( - name="hellaswag", - url="hf://hellaswag?split=validation&trust_remote_code=True", - ), -} +# DATASETS_REGISTRY = { +# "mmlu-simple-eval-en": CustomDataset( +# name="mmlu_eval", +# url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", +# ), +# "hellaswag": HFDataset( +# name="hellaswag", +# url="hf://hellaswag?split=validation&trust_remote_code=True", +# ), +# } -for k, v in DATASETS_REGISTRY.items(): - DatasetRegistry.register(k, v) +# for k, v in DATASETS_REGISTRY.items(): +# DatasetRegistry.register(k, v) diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py index 1a16a5c51b..e3a2de3994 100644 --- a/llama_stack/distribution/registry/datasets/dataset.py +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -3,60 +3,88 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. - -from abc import ABC, abstractmethod -from urllib.parse import parse_qs, urlparse - import pandas from datasets import Dataset, load_dataset +from llama_stack.apis.dataset import * # noqa: F403 + -class BaseDataset(ABC): - def __init__(self, name: str): +class CustomDataset(BaseDataset[DictSample]): + def __init__(self, config: CustomDatasetDef) -> None: + super().__init__() + self.config = config self.dataset = None - self.dataset_id = name - self.type = self.__class__.__name__ + self.index = 0 - def __iter__(self): - return iter(self.dataset) + def __iter__(self) -> Iterator[DictSample]: + return self - @abstractmethod - def load(self): - pass + def __next__(self) -> DictSample: + if not self.dataset: + self.load() + if self.index >= len(self.dataset): + raise StopIteration + sample = DictSample(data=self.dataset[self.index]) + self.index += 1 + return sample + def __str__(self): + return f"CustomDataset({self.config})" -class CustomDataset(BaseDataset): - def __init__(self, name, url): - super().__init__(name) - self.url = url + def __len__(self): + if not self.dataset: + self.load() + return len(self.dataset) def load(self): if self.dataset: return # TODO: better support w/ data url - if self.url.endswith(".csv"): - df = pandas.read_csv(self.url) - elif self.url.endswith(".xlsx"): - df = pandas.read_excel(self.url) + if self.config.url.endswith(".csv"): + df = pandas.read_csv(self.config.url) + elif self.config.url.endswith(".xlsx"): + df = pandas.read_excel(self.config.url) self.dataset = Dataset.from_pandas(df) -class HFDataset(BaseDataset): - def __init__(self, name, url): - super().__init__(name) - self.url = url +class HuggingfaceDataset(BaseDataset[DictSample]): + def __init__(self, config: HuggingfaceDatasetDef): + super().__init__() + self.config = config + self.dataset = None + self.index = 0 + + def __iter__(self) -> Iterator[DictSample]: + return self + + def __next__(self) -> DictSample: + if not self.dataset: + self.load() + if self.index >= len(self.dataset): + raise StopIteration + sample = DictSample(data=self.dataset[self.index]) + self.index += 1 + return sample + + def __str__(self): + return 
f"HuggingfaceDataset({self.config})" + + def __len__(self): + if not self.dataset: + self.load() + return len(self.dataset) def load(self): if self.dataset: return + self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs) + # parsed = urlparse(self.url) - parsed = urlparse(self.url) - - if parsed.scheme != "hf": - raise ValueError(f"Unknown HF dataset: {self.url}") + # if parsed.scheme != "hf": + # raise ValueError(f"Unknown HF dataset: {self.url}") - query = parse_qs(parsed.query) - query = {k: v[0] for k, v in query.items()} - path = parsed.netloc - self.dataset = load_dataset(path, **query) + # query = parse_qs(parsed.query) + # query = {k: v[0] for k, v in query.items()} + # path = parsed.netloc + # self.dataset = load_dataset(path, **query) diff --git a/llama_stack/distribution/registry/datasets/dataset_registry.py b/llama_stack/distribution/registry/datasets/dataset_registry.py index 9ddaa8bb7a..8e9b22266a 100644 --- a/llama_stack/distribution/registry/datasets/dataset_registry.py +++ b/llama_stack/distribution/registry/datasets/dataset_registry.py @@ -5,7 +5,7 @@ # the root directory of this source tree. from typing import AbstractSet, Dict -from .dataset import BaseDataset +from llama_stack.apis.dataset import BaseDataset class DatasetRegistry: From 9816c9aae69803e880377ee97db517c1c0dfea0c Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 10 Oct 2024 19:56:19 -0700 Subject: [PATCH 04/27] wip add datatypes --- llama_stack/apis/dataset/dataset.py | 58 ++++++++++-- llama_stack/apis/evals/evals.py | 94 +++++++++++++++++-- .../registry/datasets/__init__.py | 34 ++++--- .../distribution/registry/datasets/dataset.py | 36 ++----- .../meta_reference/evals/tasks/mmlu_task.py | 10 +- 5 files changed, 175 insertions(+), 57 deletions(-) diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py index 164e16be44..9a4f442e52 100644 --- a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/dataset/dataset.py @@ -13,20 +13,59 @@ from pydantic import BaseModel, Field from typing_extensions import Annotated -TDatasetRow = TypeVar("TDatasetRow") +# A sample (row) from raw dataset +TDatasetSample = TypeVar("TDatasetSample") @json_schema_type -class DatasetRow(BaseModel): ... +class DatasetSample(BaseModel): ... @json_schema_type -class DictSample(DatasetRow): +class DictSample(DatasetSample): data: Dict[str, Any] @json_schema_type -class Generation(BaseModel): ... +class ProcessedDictSample(DatasetSample): + data: Dict[str, Any] + preprocessed: Dict[str, Any] + prediction: Dict[str, Any] + postprocessed: Dict[str, Any] + + +# # A sample (row) after preprocessing the raw dataset +# TPreprocessedSample = TypeVar("TPreprocessedSample") + +# @json_schema_type +# class PreprocessedSample(BaseModel): ... + +# @json_schema_type +# class InferencePreprocessedSample(PreprocessedSample): +# # TODO: either keep it generic or specific to inference API +# # messages: List[Message] +# data: Dict[str, Any] + +# # A sample (row) from model prediction output +# TPredictionSample = TypeVar("TPredictionSample") + +# @json_schema_type +# class PredictionSample(BaseModel): ... + +# @json_schema_type +# class InferencePredictionSample(PredictionSample): +# data: Dict[str, Any] + + +# # A sample (row) from post-processed output +# TPostprocessedSample = TypeVar("TPostprocessedSample") + +# @json_schema_type +# class PostprocessedSample(BaseModel): ... 
+ +# @json_schema_type +# class InferencePostprocessedSample(PredictionSample): +# data: Dict[str, Any] @json_schema_type @@ -70,16 +109,17 @@ class CustomDatasetDef(BaseModel): ] -class BaseDataset(ABC, Generic[TDatasetRow]): +class BaseDataset(ABC, Generic[TDatasetSample]): def __init__(self) -> None: self.type: str = self.__class__.__name__ + @property @abstractmethod - def __iter__(self) -> Iterator[TDatasetRow]: + def dataset_id(self) -> str: raise NotImplementedError() @abstractmethod - def load(self) -> None: + def __iter__(self) -> Iterator[TDatasetSample]: raise NotImplementedError() @abstractmethod @@ -90,6 +130,10 @@ def __str__(self) -> str: def __len__(self) -> int: raise NotImplementedError() + @abstractmethod + def load(self) -> None: + raise NotImplementedError() + class Datasets(Protocol): @webmethod(route="/datasets/create") diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 629e68d32b..53a2ff6df1 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -4,10 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Protocol +from abc import ABC, abstractmethod +from typing import Dict, Generic, List, Protocol from llama_models.schema_utils import webmethod - from pydantic import BaseModel from llama_models.llama3.api.datatypes import * # noqa: F403 @@ -22,19 +22,26 @@ class EvaluationJobLogStream(BaseModel): job_uuid: str -class EvaluateTaskConfig(BaseModel): - # num examples to evaluate, evaluate all if None - n_samples: Optional[int] = None - # model evaluation params - sampling_params: SamplingParams = SamplingParams() +@json_schema_type +class EvalResult(BaseModel): + """Evaluation result.""" + + metrics: Dict[str, str] + + +@json_schema_type +class SingleEvalResult(BaseModel): + """Single evaluation result.""" + + score_data: Dict[str, float] @json_schema_type class EvaluateResponse(BaseModel): """Scores for evaluation.""" - preprocess_output: GenerationOutput - metrics: Dict[str, str] + eval_result: EvalResult + formatted_report: Optional[str] = None @json_schema_type @@ -56,6 +63,75 @@ class EvaluationJobCreateResponse(BaseModel): job_uuid: str +@json_schema_type +class EvaluateTaskConfig(BaseModel): + # num examples to evaluate, evaluate all if None + n_samples: Optional[int] = None + # model evaluation params + sampling_params: SamplingParams = SamplingParams() + + +class BaseTask( + ABC, + Generic[ + TDatasetSample, + TPreprocessedSample, + TPredictionSample, + TPostprocessedSample, + TSingleEvalResult, + ], +): + """ + A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. + Base class for all evaluation tasks. 
Each task needs to implement the following methods: + - F1: preprocess_sample(self) + - F2: postprocess_sample(self) + - F3: score_sample(self) + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self._name = self.__class__.__name__ + + @abstractmethod + def preprocess_sample(self, sample: TDatasetSample) -> TPreprocessedSample: + raise NotImplementedError() + + @abstractmethod + def postprocess_sample(self, sample: TPredictionSample) -> TPostprocessedSample: + raise NotImplementedError() + + @abstractmethod + def score_sample( + self, sample: TPostprocessedSample, ground_truth: TPreprocessedSample + ): + raise NotImplementedError() + + @abstractmethod + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + raise NotImplementedError() + + def preprocess( + self, dataset: BaseDataset[TDatasetSample] + ) -> List[TPreprocessedSample]: + return [self.preprocess_sample(sample) for sample in self.dataset] + + def postprocess( + self, generation: List[TPredictionSample] + ) -> List[TPostprocessedSample]: + return [self.postprocess_sample(sample) for sample in generation] + + def score( + self, + postprocessed: List[TPostprocessedSample], + preprocessed_dataset: List[TPreprocessedSample], + ) -> List[TSingleEvalResult]: + return [ + self.score_sample(sample, ground_truth) + for sample, ground_truth in zip(postprocessed, self.preprocessed_dataset) + ] + + class Evals(Protocol): @webmethod(route="/evals/run") async def run_evals( diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py index 3a60d6a5e7..f0636212ae 100644 --- a/llama_stack/distribution/registry/datasets/__init__.py +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -5,19 +5,25 @@ # the root directory of this source tree. 
# TODO: make these import config based -# from .dataset import CustomDataset, HFDataset -# from .dataset_registry import DatasetRegistry +from llama_stack.apis.dataset import * # noqa: F403 +from .dataset import CustomDataset, HuggingfaceDataset +from .dataset_registry import DatasetRegistry -# DATASETS_REGISTRY = { -# "mmlu-simple-eval-en": CustomDataset( -# name="mmlu_eval", -# url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", -# ), -# "hellaswag": HFDataset( -# name="hellaswag", -# url="hf://hellaswag?split=validation&trust_remote_code=True", -# ), -# } +DATASETS_REGISTRY = [ + CustomDataset( + config=CustomDatasetDef( + identifier="mmlu-simple-eval-en", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ) + ), + HuggingfaceDataset( + config=HuggingfaceDatasetDef( + identifier="hellaswag", + dataset_name="hellaswag", + kwargs={"split": "validation", "trust_remote_code": True}, + ) + ), +] -# for k, v in DATASETS_REGISTRY.items(): -# DatasetRegistry.register(k, v) +for d in DATASETS_REGISTRY: + DatasetRegistry.register(d.dataset_id, d) diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py index e3a2de3994..87a01d311b 100644 --- a/llama_stack/distribution/registry/datasets/dataset.py +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -16,17 +16,14 @@ def __init__(self, config: CustomDatasetDef) -> None: self.dataset = None self.index = 0 - def __iter__(self) -> Iterator[DictSample]: - return self + @property + def dataset_id(self) -> str: + return self.config.identifier - def __next__(self) -> DictSample: + def __iter__(self) -> Iterator[DictSample]: if not self.dataset: self.load() - if self.index >= len(self.dataset): - raise StopIteration - sample = DictSample(data=self.dataset[self.index]) - self.index += 1 - return sample + return (DictSample(data=x) for x in self.dataset) def __str__(self): return f"CustomDataset({self.config})" @@ -53,19 +50,15 @@ def __init__(self, config: HuggingfaceDatasetDef): super().__init__() self.config = config self.dataset = None - self.index = 0 - def __iter__(self) -> Iterator[DictSample]: - return self + @property + def dataset_id(self) -> str: + return self.config.identifier - def __next__(self) -> DictSample: + def __iter__(self) -> Iterator[DictSample]: if not self.dataset: self.load() - if self.index >= len(self.dataset): - raise StopIteration - sample = DictSample(data=self.dataset[self.index]) - self.index += 1 - return sample + return (DictSample(data=x) for x in self.dataset) def __str__(self): return f"HuggingfaceDataset({self.config})" @@ -79,12 +72,3 @@ def load(self): if self.dataset: return self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs) - # parsed = urlparse(self.url) - - # if parsed.scheme != "hf": - # raise ValueError(f"Unknown HF dataset: {self.url}") - - # query = parse_qs(parsed.query) - # query = {k: v[0] for k, v in query.items()} - # path = parsed.netloc - # self.dataset = load_dataset(path, **query) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py index 673a953791..c5c9d97563 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py @@ -111,7 +111,14 @@ def normalize_extracted_answer(extracted_answer: str) -> str: ) -class MMLUTask(BaseTask): +class MMLUTask( + BaseTask[ + 
DictSample, + InferencePreprocessedSample, + InferencePredictionSample, + InferencePostprocessedSample, + ] +): """ MMLU Task. """ @@ -120,6 +127,7 @@ def __init__(self, dataset, *args, **kwargs): super().__init__(dataset, *args, **kwargs) def preprocess_sample(self, sample): + print(sample) content = QUERY_TEMPLATE_MULTICHOICE.format(**sample) return { "role": "user", From ad18dc94acd7ae1713a26432962fc12f8b168d6d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 10 Oct 2024 21:33:13 -0700 Subject: [PATCH 05/27] add data structure to tasks --- llama_stack/apis/dataset/dataset.py | 49 +++--------- llama_stack/apis/evals/evals.py | 40 +++------- .../distribution/registry/tasks/task.py | 49 ------------ .../registry/tasks/task_registry.py | 2 +- .../impls/meta_reference/evals/evals.py | 35 +++++---- .../meta_reference/evals/tasks/mmlu_task.py | 75 +++++++++++-------- tests/examples/local-run.yaml | 18 +++-- 7 files changed, 100 insertions(+), 168 deletions(-) delete mode 100644 llama_stack/distribution/registry/tasks/task.py diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py index 9a4f442e52..ed21c429fd 100644 --- a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/dataset/dataset.py @@ -9,11 +9,12 @@ from typing import Any, Dict, Generic, Iterator, Literal, Protocol, TypeVar, Union from llama_models.schema_utils import json_schema_type, webmethod +from llama_models.llama3.api.datatypes import * # noqa: F403 from pydantic import BaseModel, Field from typing_extensions import Annotated -# A sample (row) from raw dataset +# A sample (row) from dataset TDatasetSample = TypeVar("TDatasetSample") @@ -26,46 +27,20 @@ class DictSample(DatasetSample): data: Dict[str, Any] -@json_schema_type -class ProcessedDictSample(DatasetSample): - data: Dict[str, Any] - preprocessed: Dict[str, Any] - prediction: Dict[str, Any] - postprocessed: Dict[str, Any] - - -# # A sample (row) after preprocessing the raw dataset -# TPreprocessedSample = TypeVar("TPreprocessedSample") - -# @json_schema_type -# class PreprocessedSample(BaseModel): ... - -# @json_schema_type -# class InferencePreprocessedSample(PreprocessedSample): -# # TODO: either keep it generic or specific to inference API -# # messages: List[Message] -# data: Dict[str, Any] +# A sample (row) from evals intermediate dataset +TProcessedSample = TypeVar("TProcessedSample") -# # A sample (row) from model prediction output -# TPredictionSample = TypeVar("TPredictionSample") -# @json_schema_type -# class PredictionSample(BaseModel): ... - -# @json_schema_type -# class InferencePredictionSample(PredictionSample): -# data: Dict[str, Any] - - -# # A sample (row) from post-processed output -# TPostprocessedSample = TypeVar("TPostprocessedSample") +@json_schema_type +class PredictionSample(BaseModel): + completion_message: str -# @json_schema_type -# class PostprocessedSample(BaseModel): ... 
-# @json_schema_type -# class InferencePostprocessedSample(PredictionSample): -# data: Dict[str, Any] +@json_schema_type +class ProcessedDictSample(DictSample): + preprocessed: Optional[Dict[str, Any]] = None + prediction: Optional[PredictionSample] = None + postprocessed: Optional[Dict[str, Any]] = None @json_schema_type diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 53a2ff6df1..6fe85408e4 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -71,16 +71,7 @@ class EvaluateTaskConfig(BaseModel): sampling_params: SamplingParams = SamplingParams() -class BaseTask( - ABC, - Generic[ - TDatasetSample, - TPreprocessedSample, - TPredictionSample, - TPostprocessedSample, - TSingleEvalResult, - ], -): +class BaseTask(ABC, Generic[TDatasetSample, TProcessedSample]): """ A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. Base class for all evaluation tasks. Each task needs to implement the following methods: @@ -94,17 +85,15 @@ def __init__(self, *args, **kwargs) -> None: self._name = self.__class__.__name__ @abstractmethod - def preprocess_sample(self, sample: TDatasetSample) -> TPreprocessedSample: + def preprocess_sample(self, sample: TDatasetSample) -> TProcessedSample: raise NotImplementedError() @abstractmethod - def postprocess_sample(self, sample: TPredictionSample) -> TPostprocessedSample: + def postprocess_sample(self, sample: TProcessedSample) -> TProcessedSample: raise NotImplementedError() @abstractmethod - def score_sample( - self, sample: TPostprocessedSample, ground_truth: TPreprocessedSample - ): + def score_sample(self, sample: TProcessedSample) -> SingleEvalResult: raise NotImplementedError() @abstractmethod @@ -112,24 +101,15 @@ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: raise NotImplementedError() def preprocess( - self, dataset: BaseDataset[TDatasetSample] - ) -> List[TPreprocessedSample]: - return [self.preprocess_sample(sample) for sample in self.dataset] + self, dataset: BaseDataset[TProcessedSample] + ) -> List[TProcessedSample]: + return [self.preprocess_sample(sample) for sample in dataset] - def postprocess( - self, generation: List[TPredictionSample] - ) -> List[TPostprocessedSample]: + def postprocess(self, generation: List[TProcessedSample]) -> List[TProcessedSample]: return [self.postprocess_sample(sample) for sample in generation] - def score( - self, - postprocessed: List[TPostprocessedSample], - preprocessed_dataset: List[TPreprocessedSample], - ) -> List[TSingleEvalResult]: - return [ - self.score_sample(sample, ground_truth) - for sample, ground_truth in zip(postprocessed, self.preprocessed_dataset) - ] + def score(self, postprocessed: List[TProcessedSample]) -> List[SingleEvalResult]: + return [self.score_sample(sample) for sample in postprocessed] class Evals(Protocol): diff --git a/llama_stack/distribution/registry/tasks/task.py b/llama_stack/distribution/registry/tasks/task.py deleted file mode 100644 index a92e6241b6..0000000000 --- a/llama_stack/distribution/registry/tasks/task.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
-from abc import ABC, abstractmethod - - -class BaseTask(ABC): - """ - A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. - Base class for all evaluation tasks. Each task needs to implement the following methods: - - F1: preprocess_sample(self) - - F2: postprocess_sample(self) - - F3: score_sample(self) - """ - - def __init__(self, dataset, *args, **kwargs): - super().__init__(*args, **kwargs) - self._name = self.__class__.__name__ - self.dataset = dataset - - @abstractmethod - def preprocess_sample(self, sample): - raise NotImplementedError() - - @abstractmethod - def postprocess_sample(self, sample): - raise NotImplementedError() - - @abstractmethod - def score_sample(self, sample, ground_truth): - raise NotImplementedError() - - @abstractmethod - def aggregate_results(self, eval_results): - raise NotImplementedError() - - def preprocess(self): - return [self.preprocess_sample(sample) for sample in self.dataset] - - def postprocess(self, generation): - return [self.postprocess_sample(sample) for sample in generation] - - def score(self, postprocessed): - return [ - self.score_sample(sample, ground_truth) - for sample, ground_truth in zip(postprocessed, self.dataset) - ] diff --git a/llama_stack/distribution/registry/tasks/task_registry.py b/llama_stack/distribution/registry/tasks/task_registry.py index 063894e482..df25686ba6 100644 --- a/llama_stack/distribution/registry/tasks/task_registry.py +++ b/llama_stack/distribution/registry/tasks/task_registry.py @@ -5,7 +5,7 @@ # the root directory of this source tree. from typing import AbstractSet, Dict -from .task import BaseTask +from llama_stack.apis.evals import BaseTask class TaskRegistry: diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 5f475c5395..d7214663ef 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -6,6 +6,8 @@ from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.dataset import * # noqa: F403 + from termcolor import cprint from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry @@ -42,30 +44,37 @@ async def run_evals( dataset = DatasetRegistry.get_dataset(dataset) dataset.load() - task_impl = TaskRegistry.get_task(task)(dataset) - x1 = task_impl.preprocess() + task_impl = TaskRegistry.get_task(task)() + preprocessed = task_impl.preprocess(dataset) # TODO: replace w/ batch inference & async return eval job generation_outputs = [] if eval_task_config is None: - eval_task_config = EvaluateTaskConfig(n_samples=len(x1)) - if eval_task_config.n_samples is None or eval_task_config.n_samples > len(x1): - eval_task_config.n_samples = len(x1) + eval_task_config = EvaluateTaskConfig(n_samples=len(preprocessed)) + if eval_task_config.n_samples is None or eval_task_config.n_samples > len( + preprocessed + ): + eval_task_config.n_samples = len(preprocessed) print( f"Eval generation start, generate on {eval_task_config.n_samples} samples" ) - for msg in x1[: eval_task_config.n_samples]: - print("generation for msg: ", msg) + for sample in preprocessed[: eval_task_config.n_samples]: + print("generation: ", sample) response = await self.inference_api.chat_completion( model=model, - messages=[msg], + messages=sample.preprocessed["messages"], stream=False, ) - 
generation_outputs.append(response.completion_message.content) + sample.prediction = PredictionSample( + completion_message=response.completion_message.content + ) + generation_outputs.append(sample) - x2 = task_impl.postprocess(generation_outputs) - eval_results = task_impl.score(x2) - eval_response = task_impl.aggregate_results(eval_results) - return eval_response + postprocessed = task_impl.postprocess(generation_outputs) + eval_results = task_impl.score(postprocessed) + aggr_result = task_impl.aggregate_results(eval_results) + return EvaluateResponse( + eval_result=aggr_result, + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py index c5c9d97563..e3d9e4ef3c 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py @@ -6,7 +6,8 @@ import re from llama_stack.apis.evals import * # noqa: F403 -from llama_stack.distribution.registry.tasks.task import BaseTask + +# from llama_stack.distribution.registry.tasks.task import BaseTask QUERY_TEMPLATE_MULTICHOICE = """ Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. @@ -111,48 +112,60 @@ def normalize_extracted_answer(extracted_answer: str) -> str: ) -class MMLUTask( - BaseTask[ - DictSample, - InferencePreprocessedSample, - InferencePredictionSample, - InferencePostprocessedSample, - ] -): +class MMLUTask(BaseTask[DictSample, ProcessedDictSample]): """ MMLU Task. """ - def __init__(self, dataset, *args, **kwargs): - super().__init__(dataset, *args, **kwargs) - - def preprocess_sample(self, sample): - print(sample) - content = QUERY_TEMPLATE_MULTICHOICE.format(**sample) - return { - "role": "user", - "content": content, + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def preprocess_sample(self, sample: ProcessedDictSample) -> ProcessedDictSample: + content = QUERY_TEMPLATE_MULTICHOICE.format(**sample.data) + preprocessed = { + "messages": [ + { + "role": "user", + "content": content, + } + ], } + processed_sample = ProcessedDictSample( + data=sample.data, + preprocessed=preprocessed, + ) + return processed_sample + + def postprocess_sample(self, sample: ProcessedDictSample) -> ProcessedDictSample: + if not sample.postprocessed: + sample.postprocessed = {} + sample.postprocessed["postprocessed"] = normalize_response( + sample.prediction.completion_message + ) + return sample - def postprocess_sample(self, sample): - normalized = normalize_response(sample) - return normalized + def score_sample(self, sample: ProcessedDictSample) -> SingleEvalResult: + postprocessed_output = sample.postprocessed["postprocessed"] + expected_answer = sample.data["Answer"] - def score_sample(self, sample, expected): extracted_answer = None for answer_regex in MULTILINGUAL_ANSWER_REGEXES: regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) - match = re.search(regex, sample) + match = re.search(regex, postprocessed_output) if match: extracted_answer = normalize_extracted_answer(match.group(1)) break - score = ( - 1.0 if extracted_answer and extracted_answer == expected["Answer"] else 0.0 - ) - # TODO: generalize this into SingleEvalResult - return score - def aggregate_results(self, eval_results): - return EvaluateResponse( - metrics={"score": str(sum(eval_results) / 
len(eval_results))} + score = 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 + + return SingleEvalResult( + score_data={ + "score": score, + }, ) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + print("aggregate_results", eval_results) + sum_score = sum([result.score_data["score"] for result in eval_results]) + + return EvalResult(metrics={"score": str(sum_score / len(eval_results))}) diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml index 1422d6ee20..3c9f73e0b1 100644 --- a/tests/examples/local-run.yaml +++ b/tests/examples/local-run.yaml @@ -18,14 +18,18 @@ providers: provider_type: meta-reference config: {} inference: - - provider_id: meta-reference - provider_type: meta-reference + - provider_id: remote::tgi + provider_type: remote::tgi config: - model: Llama3.1-8B-Instruct - quantization: null - torch_seed: null - max_seq_len: 4096 - max_batch_size: 1 + url: http://127.0.0.1:5009 + # - provider_id: meta-reference + # provider_type: meta-reference + # config: + # model: Llama3.1-8B-Instruct + # quantization: null + # torch_seed: null + # max_seq_len: 4096 + # max_batch_size: 1 safety: - provider_id: meta-reference provider_type: meta-reference From fb565dfb066addeacad8b31386c81d0c24787b2c Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Fri, 11 Oct 2024 09:30:10 -0700 Subject: [PATCH 06/27] eleuther eval fix --- llama_stack/apis/evals/client.py | 32 ++++++++++--------- .../third_party/evals/eleuther/eleuther.py | 8 +++-- tests/examples/local-run.yaml | 4 +-- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index ad4a471455..bde78adc94 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -54,27 +54,29 @@ async def run_evals( async def run_main(host: str, port: int): client = EvaluationClient(f"http://{host}:{port}") - # CustomDataset + # Custom Eval Task + # response = await client.run_evals( + # model="Llama3.1-8B-Instruct", + # dataset="mmlu-simple-eval-en", + # task="mmlu", + # eval_task_config=EvaluateTaskConfig( + # n_samples=2, + # ), + # ) + + # Eleuther Eval Task response = await client.run_evals( model="Llama3.1-8B-Instruct", - dataset="mmlu-simple-eval-en", - task="mmlu", + # task="meta_mmlu_pro_instruct", + task="meta_ifeval", eval_task_config=EvaluateTaskConfig( n_samples=2, ), ) - cprint(f"evaluate response={response}", "green") - - # Eleuther Eval Task - # response = await client.run_evals( - # model="Llama3.1-8B-Instruct", - # task="meta_mmlu_pro_instruct", - # # task="meta_ifeval", - # eval_task_config=EvaluateTaskConfig( - # n_samples=2, - # ) - # ) - # cprint(response.metrics["metrics_table"], "red") + if response.formatted_report: + cprint(response.formatted_report, "green") + else: + cprint(f"evaluate response={response}", "green") def main(host: str, port: int): diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py index b9f9505e93..e4b32a45e0 100644 --- a/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py +++ b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py @@ -157,12 +157,14 @@ async def run_evals( limit=eval_task_config.n_samples, ) + eval_result = EvalResult( + metrics={}, + ) formatted_output = lm_eval.utils.make_table(output) cprint(formatted_output, "green") return EvaluateResponse( - metrics={ - "metrics_table": formatted_output, - 
}, + eval_result=eval_result, + formatted_report=formatted_output, ) diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml index 3c9f73e0b1..430ce61020 100644 --- a/tests/examples/local-run.yaml +++ b/tests/examples/local-run.yaml @@ -14,8 +14,8 @@ apis: - evals providers: evals: - - provider_id: meta-reference - provider_type: meta-reference + - provider_id: eleuther + provider_type: eleuther config: {} inference: - provider_id: remote::tgi From a25aff290ef5103c40566f3fb925f6efbe20bccf Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Sun, 13 Oct 2024 23:27:02 -0700 Subject: [PATCH 07/27] generator + scorer Api for MMLU --- llama_stack/apis/dataset/dataset.py | 47 +++- llama_stack/apis/evals/client.py | 25 +- llama_stack/apis/evals/evals.py | 226 +++++++++++++++--- .../distribution/registry/datasets/dataset.py | 10 +- .../distribution/registry/scorers/__init__.py | 6 + .../registry/scorers/scorer_registry.py | 32 +++ .../distribution/registry/tasks/__init__.py | 8 - .../impls/meta_reference/evals/evals.py | 166 ++++++++++--- .../evals/processor/__init__.py | 5 + .../mmlu_processor.py} | 94 +++++--- .../meta_reference/evals/scorer/__init__.py | 5 + .../evals/scorer/basic_scorers.py | 78 ++++++ .../evals/tasks/run_eval_task.py | 39 +++ tests/examples/local-run.yaml | 4 +- 14 files changed, 616 insertions(+), 129 deletions(-) create mode 100644 llama_stack/distribution/registry/scorers/__init__.py create mode 100644 llama_stack/distribution/registry/scorers/scorer_registry.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/processor/__init__.py rename llama_stack/providers/impls/meta_reference/evals/{tasks/mmlu_task.py => processor/mmlu_processor.py} (60%) create mode 100644 llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py index ed21c429fd..798f3aba99 100644 --- a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/dataset/dataset.py @@ -14,6 +14,25 @@ from pydantic import BaseModel, Field from typing_extensions import Annotated + +@json_schema_type +class GenerationInput(BaseModel): + messages: List[Message] + + +@json_schema_type +class GenerationOutput(BaseModel): + completion_message: str + logprobs: Optional[List[TokenLogProbs]] = None + + +@json_schema_type +class PostprocessedGeneration(BaseModel): + completion_message: str + # structured transformed output from raw_completion_message to compute scorer metrics + transformed_generation: Optional[Any] = None + + # A sample (row) from dataset TDatasetSample = TypeVar("TDatasetSample") @@ -27,20 +46,32 @@ class DictSample(DatasetSample): data: Dict[str, Any] -# A sample (row) from evals intermediate dataset -TProcessedSample = TypeVar("TProcessedSample") +# A sample (row) from evals intermediate dataset after preprocessing +TPreprocessedSample = TypeVar("TPreprocessedSample") @json_schema_type -class PredictionSample(BaseModel): - completion_message: str +class PreprocessedSample(DatasetSample): + generation_input: GenerationInput + + +# A sample (row) from evals intermediate dataset after inference +TGenerationResponseSample = TypeVar("TGenerationResponseSample") + + +@json_schema_type +class GenerationResponseSample(DatasetSample): + generation_output: GenerationOutput + + +# A sample (row) for prepared evals 
dataset ready for scoring +TScorerInputSample = TypeVar("TScorerInputSample") @json_schema_type -class ProcessedDictSample(DictSample): - preprocessed: Optional[Dict[str, Any]] = None - prediction: Optional[PredictionSample] = None - postprocessed: Optional[Dict[str, Any]] = None +class ScorerInputSample(DatasetSample): + generation_output: PostprocessedGeneration + expected_output: Union[str, List[str]] @json_schema_type diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index bde78adc94..b4d1c39fe7 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -33,7 +33,7 @@ async def run_evals( ) -> EvaluateResponse: async with httpx.AsyncClient() as client: response = await client.post( - f"{self.base_url}/evals/run", + f"{self.base_url}/evals/run_eval_task", json={ "model": model, "task": task, @@ -55,28 +55,25 @@ async def run_main(host: str, port: int): client = EvaluationClient(f"http://{host}:{port}") # Custom Eval Task + response = await client.run_evals( + model="Llama3.1-8B-Instruct", + dataset="mmlu-simple-eval-en", + task="mmlu", + ) + + # Eleuther Eval Task # response = await client.run_evals( # model="Llama3.1-8B-Instruct", - # dataset="mmlu-simple-eval-en", - # task="mmlu", + # # task="meta_mmlu_pro_instruct", + # task="meta_ifeval", # eval_task_config=EvaluateTaskConfig( # n_samples=2, # ), # ) - - # Eleuther Eval Task - response = await client.run_evals( - model="Llama3.1-8B-Instruct", - # task="meta_mmlu_pro_instruct", - task="meta_ifeval", - eval_task_config=EvaluateTaskConfig( - n_samples=2, - ), - ) if response.formatted_report: cprint(response.formatted_report, "green") else: - cprint(f"evaluate response={response}", "green") + cprint(f"Response: {response}", "green") def main(host: str, port: int): diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 6fe85408e4..92657f6b5c 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -5,7 +5,7 @@ # the root directory of this source tree. from abc import ABC, abstractmethod -from typing import Dict, Generic, List, Protocol +from typing import Dict, Generic, List, Optional, Protocol from llama_models.schema_utils import webmethod from pydantic import BaseModel @@ -24,14 +24,14 @@ class EvaluationJobLogStream(BaseModel): @json_schema_type class EvalResult(BaseModel): - """Evaluation result.""" + """Aggregated final evaluation result.""" - metrics: Dict[str, str] + metrics: Dict[str, float] @json_schema_type class SingleEvalResult(BaseModel): - """Single evaluation result.""" + """Single evaluation result. 
Contains a scorer name, and corresponding metrics from scorer.""" score_data: Dict[str, float] @@ -64,57 +64,222 @@ class EvaluationJobCreateResponse(BaseModel): @json_schema_type -class EvaluateTaskConfig(BaseModel): - # num examples to evaluate, evaluate all if None - n_samples: Optional[int] = None - # model evaluation params +class EvaluateDatasetConfig(BaseModel): + # identifier to previously registered dataset via DatasetDef + dataset_name: str + # limit number of rows to evaluate + row_limit: Optional[int] = None + kwargs: Optional[Dict[str, Any]] = None + + +@json_schema_type +class EvaluatePreprocessConfig(BaseModel): + kwargs: Optional[Dict[str, Any]] = None + + +@json_schema_type +class EvaluateModelGenerationConfig(BaseModel): + model: str sampling_params: SamplingParams = SamplingParams() + kwargs: Optional[Dict[str, Any]] = None + + +@json_schema_type +class EvaluatePostprocessConfig(BaseModel): + kwargs: Optional[Dict[str, Any]] = None + + +@json_schema_type +class EvaluateJudgeScoringConfig(BaseModel): ... + + +@json_schema_type +class LLMJudgeConfig(BaseModel): + judge_preprocess_config: EvaluatePreprocessConfig + judge_model_generation_config: EvaluateModelGenerationConfig + judge_postprocess_config: EvaluatePostprocessConfig + judge_scoring_config: EvaluateJudgeScoringConfig + + +@json_schema_type +class EvaluateSingleScorerConfig(BaseModel): + scorer_name: str + llm_judge_config: Optional[LLMJudgeConfig] = None + +@json_schema_type +class EvaluateScoringConfig(BaseModel): + # list of scorer (metrics) names to use + scorer_config_list: List[EvaluateSingleScorerConfig] -class BaseTask(ABC, Generic[TDatasetSample, TProcessedSample]): + +@json_schema_type +class EvaluateTaskConfig(BaseModel): + dataset_config: EvaluateDatasetConfig + preprocess_config: Optional[EvaluatePreprocessConfig] = None + generation_config: EvaluateModelGenerationConfig + postprocess_config: Optional[EvaluatePostprocessConfig] = None + scoring_config: EvaluateScoringConfig + + +class BaseGeneratorProcessor( + ABC, + Generic[ + TDatasetSample, + TPreprocessedSample, + TGenerationResponseSample, + TScorerInputSample, + ], +): """ - A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. - Base class for all evaluation tasks. Each task needs to implement the following methods: - - F1: preprocess_sample(self) + Base class for all generator processors. 
Each processor needs to implement the following methods: + - F1: preprocess_sample(self, dataset) - F2: postprocess_sample(self) - - F3: score_sample(self) """ def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self._name = self.__class__.__name__ + + def __str__(self) -> str: + return self.__class__.__name__ + + def preprocess( + self, dataset: BaseDataset[TDatasetSample] + ) -> List[TPreprocessedSample]: + return [self.preprocess_sample(sample) for sample in dataset] + + def postprocess( + self, + generation: List[TGenerationResponseSample], + dataset: BaseDataset[TDatasetSample], + ) -> List[TScorerInputSample]: + return [ + self.postprocess_sample(generation_sample, dataset_sample) + for generation_sample, dataset_sample in zip(generation, dataset) + ] @abstractmethod - def preprocess_sample(self, sample: TDatasetSample) -> TProcessedSample: + def preprocess_sample(self, sample: TDatasetSample) -> TPreprocessedSample: raise NotImplementedError() @abstractmethod - def postprocess_sample(self, sample: TProcessedSample) -> TProcessedSample: + def postprocess_sample( + self, + generation_sample: TGenerationResponseSample, + dataset_sample: TDatasetSample, + ) -> TScorerInputSample: raise NotImplementedError() + +class BaseGenerator(ABC, Generic[TGenerationResponseSample]): + """ + Base class for all generators. Each generator needs to implement the following methods: + - generate(self, preprocessed_dataset) + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + return self.__class__.__name__ + + @abstractmethod + def generate( + self, preprocessed_dataset: List[TPreprocessedSample] + ) -> List[TGenerationResponseSample]: + raise NotImplementedError() + + +class BaseScorer(ABC, Generic[TScorerInputSample]): + """ + Base class for all scorers. 
Each scorer needs to implement the following methods: + - score_sample(self, scorer_input_sample) + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + return self.__class__.__name__ + @abstractmethod - def score_sample(self, sample: TProcessedSample) -> SingleEvalResult: + def score_sample(self, scorer_input_sample: TScorerInputSample) -> SingleEvalResult: raise NotImplementedError() @abstractmethod def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: raise NotImplementedError() - def preprocess( - self, dataset: BaseDataset[TProcessedSample] - ) -> List[TProcessedSample]: - return [self.preprocess_sample(sample) for sample in dataset] + def score( + self, prepared_eval_dataset: List[TScorerInputSample] + ) -> List[SingleEvalResult]: + return [self.score_sample(sample) for sample in prepared_eval_dataset] - def postprocess(self, generation: List[TProcessedSample]) -> List[TProcessedSample]: - return [self.postprocess_sample(sample) for sample in generation] - def score(self, postprocessed: List[TProcessedSample]) -> List[SingleEvalResult]: - return [self.score_sample(sample) for sample in postprocessed] +class BaseTask(ABC): + def __init__( + self, + generator_processor: Optional[BaseGeneratorProcessor] = None, + generator: Optional[BaseGenerator] = None, + scorer: Optional[BaseScorer] = None, + *args, + **kwargs + ) -> None: + super().__init__(*args, **kwargs) + self.generator_processor = generator_processor + self.generator = generator + self.scorer = scorer + + @abstractmethod + def run(self, *args, **kwargs) -> EvalResult: + raise NotImplementedError() + + +# class BaseTask(ABC, Generic[TDatasetSample, TProcessedSample]): +# """ +# A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. +# Base class for all evaluation tasks. Each task needs to implement the following methods: +# - F1: preprocess_sample(self) +# - F2: postprocess_sample(self) +# - F3: score_sample(self) +# """ + +# def __init__(self, *args, **kwargs) -> None: +# super().__init__(*args, **kwargs) +# self._name = self.__class__.__name__ + +# @abstractmethod +# def preprocess_sample(self, sample: TDatasetSample) -> TProcessedSample: +# raise NotImplementedError() + +# @abstractmethod +# def postprocess_sample(self, sample: TProcessedSample) -> TProcessedSample: +# raise NotImplementedError() + +# @abstractmethod +# def score_sample(self, sample: TProcessedSample) -> SingleEvalResult: +# raise NotImplementedError() + +# @abstractmethod +# def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: +# raise NotImplementedError() + +# def preprocess( +# self, dataset: BaseDataset[TProcessedSample] +# ) -> List[TProcessedSample]: +# return [self.preprocess_sample(sample) for sample in dataset] + +# def postprocess(self, generation: List[TProcessedSample]) -> List[TProcessedSample]: +# return [self.postprocess_sample(sample) for sample in generation] + +# def score(self, postprocessed: List[TProcessedSample]) -> List[SingleEvalResult]: +# return [self.score_sample(sample) for sample in postprocessed] class Evals(Protocol): - @webmethod(route="/evals/run") - async def run_evals( + + @webmethod(route="/evals/run_eval_task") + async def run_eval_task( self, model: str, task: str, @@ -122,6 +287,13 @@ async def run_evals( eval_task_config: Optional[EvaluateTaskConfig] = None, ) -> EvaluateResponse: ... 
+ @webmethod(route="/evals/run_scorer") + async def run_scorer( + self, + dataset_config: EvaluateDatasetConfig, + eval_scoring_config: EvaluateScoringConfig, + ) -> EvaluateResponse: ... + # @webmethod(route="/evals/jobs") # def get_evaluation_jobs(self) -> List[EvaluationJob]: ... diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py index 87a01d311b..0bd86b8d49 100644 --- a/llama_stack/distribution/registry/datasets/dataset.py +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -25,23 +25,27 @@ def __iter__(self) -> Iterator[DictSample]: self.load() return (DictSample(data=x) for x in self.dataset) - def __str__(self): + def __str__(self) -> str: return f"CustomDataset({self.config})" - def __len__(self): + def __len__(self) -> int: if not self.dataset: self.load() return len(self.dataset) - def load(self): + def load(self, n_samples: Optional[int] = None) -> None: if self.dataset: return + # TODO: better support w/ data url if self.config.url.endswith(".csv"): df = pandas.read_csv(self.config.url) elif self.config.url.endswith(".xlsx"): df = pandas.read_excel(self.config.url) + if n_samples is not None: + df = df.sample(n=n_samples) + self.dataset = Dataset.from_pandas(df) diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py new file mode 100644 index 0000000000..76edd2ebd3 --- /dev/null +++ b/llama_stack/distribution/registry/scorers/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +# TODO: make these import config based diff --git a/llama_stack/distribution/registry/scorers/scorer_registry.py b/llama_stack/distribution/registry/scorers/scorer_registry.py new file mode 100644 index 0000000000..b6a382c531 --- /dev/null +++ b/llama_stack/distribution/registry/scorers/scorer_registry.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import AbstractSet, Dict + +from llama_stack.apis.evals import BaseScorer + + +class ScorerRegistry: + _REGISTRY: Dict[str, BaseScorer] = {} + + @staticmethod + def names() -> AbstractSet[str]: + return ScorerRegistry._REGISTRY.keys() + + @staticmethod + def register(name: str, scorer: BaseScorer) -> None: + if name in ScorerRegistry._REGISTRY: + raise ValueError(f"Task {name} already exists.") + ScorerRegistry._REGISTRY[name] = task + + @staticmethod + def get_scorer(name: str) -> BaseScorer: + if name not in ScorerRegistry._REGISTRY: + raise ValueError(f"Task {name} not found.") + return ScorerRegistry._REGISTRY[name] + + @staticmethod + def reset() -> None: + ScorerRegistry._REGISTRY = {} diff --git a/llama_stack/distribution/registry/tasks/__init__.py b/llama_stack/distribution/registry/tasks/__init__.py index 01ccb18aee..756f351d88 100644 --- a/llama_stack/distribution/registry/tasks/__init__.py +++ b/llama_stack/distribution/registry/tasks/__init__.py @@ -3,11 +3,3 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-# TODO: make these import config based -from llama_stack.providers.impls.meta_reference.evals.tasks.mmlu_task import MMLUTask -from .task_registry import TaskRegistry - -TaskRegistry.register( - "mmlu", - MMLUTask, -) diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index d7214663ef..0fbce823e8 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -3,16 +3,27 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import json + +from termcolor import cprint + +from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import ( + AggregateScorer, +) from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.evals import * # noqa: F403 from llama_stack.apis.dataset import * # noqa: F403 -from termcolor import cprint - from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry +from llama_stack.providers.impls.meta_reference.evals.processor.mmlu_processor import ( + MMLUProcessor, +) + +# from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry +# from .tasks.run_eval_task import RunEvalTask +from .scorer.basic_scorers import * # noqa: F403 -from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry from .config import MetaReferenceEvalsImplConfig @@ -27,7 +38,7 @@ async def initialize(self) -> None: async def shutdown(self) -> None: pass - async def run_evals( + async def run_eval_task( self, model: str, task: str, @@ -38,43 +49,142 @@ async def run_evals( f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}", "red", ) + if not dataset: raise ValueError("dataset must be specified for mete-reference evals") - dataset = DatasetRegistry.get_dataset(dataset) - dataset.load() + if not eval_task_config: + # construct eval task config from inputs + eval_task_config = EvaluateTaskConfig( + dataset_config=EvaluateDatasetConfig( + dataset_name=dataset, + row_limit=2, + ), + generation_config=EvaluateModelGenerationConfig( + model=model, + ), + scoring_config=EvaluateScoringConfig( + scorer_config_list=[ + EvaluateSingleScorerConfig(scorer_name="accuracy"), + ] + ), + ) - task_impl = TaskRegistry.get_task(task)() - preprocessed = task_impl.preprocess(dataset) + # TODO: wrap inside task + # run_task = RunEvalTask( + # eval_task_config=eval_task_config, + # ) + # eval_result = run_task.run() - # TODO: replace w/ batch inference & async return eval job - generation_outputs = [] - if eval_task_config is None: - eval_task_config = EvaluateTaskConfig(n_samples=len(preprocessed)) - if eval_task_config.n_samples is None or eval_task_config.n_samples > len( - preprocessed - ): - eval_task_config.n_samples = len(preprocessed) - - print( - f"Eval generation start, generate on {eval_task_config.n_samples} samples" + dataset = DatasetRegistry.get_dataset( + eval_task_config.dataset_config.dataset_name ) + dataset.load(n_samples=eval_task_config.dataset_config.row_limit) + print(f"Running on {len(dataset)} samples") + + # F1 + processor = MMLUProcessor() + preprocessed = processor.preprocess(dataset) - for sample in preprocessed[: eval_task_config.n_samples]: + # Generation + # TODO: wrap inside BaseGenerator + generation_outputs = [] + for sample in preprocessed: print("generation: ", sample) response = await self.inference_api.chat_completion( 
model=model, - messages=sample.preprocessed["messages"], + messages=sample.generation_input.messages, stream=False, ) - sample.prediction = PredictionSample( - completion_message=response.completion_message.content + cprint(f"response: {response}", "cyan") + + generation_outputs.append( + GenerationResponseSample( + generation_output=GenerationOutput( + completion_message=response.completion_message.content + ) + ) ) - generation_outputs.append(sample) + cprint(generation_outputs, "green") + + # F2 + postprocessed = processor.postprocess(generation_outputs, dataset) + cprint(postprocessed, "blue") + + # F3 - scorer + scorer = AggregateScorer( + scorers=[ + AccuracyScorer(), + RandomScorer(), + ] + ) + + scorer_results = scorer.score(postprocessed) + cprint(scorer_results, "magenta") + eval_result = scorer.aggregate_results(scorer_results) - postprocessed = task_impl.postprocess(generation_outputs) - eval_results = task_impl.score(postprocessed) - aggr_result = task_impl.aggregate_results(eval_results) return EvaluateResponse( - eval_result=aggr_result, + eval_result=eval_result, + formatted_report=json.dumps(eval_result.json(), indent=4), ) + + async def run_scorer( + self, + dataset_config: EvaluateDatasetConfig, + eval_scoring_config: EvaluateScoringConfig, + ) -> EvaluateResponse: + return EvaluateResponse( + eval_result={}, + ) + + # async def run_evals( + # self, + # model: str, + # task: str, + # dataset: Optional[str] = None, + # eval_task_config: Optional[EvaluateTaskConfig] = None, + # ) -> EvaluateResponse: + # cprint( + # f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}", + # "red", + # ) + # if not dataset: + # raise ValueError("dataset must be specified for mete-reference evals") + + # dataset = DatasetRegistry.get_dataset(dataset) + # dataset.load() + + # task_impl = TaskRegistry.get_task(task)() + # preprocessed = task_impl.preprocess(dataset) + + # # TODO: replace w/ batch inference & async return eval job + # generation_outputs = [] + # if eval_task_config is None: + # eval_task_config = EvaluateTaskConfig(n_samples=len(preprocessed)) + # if eval_task_config.n_samples is None or eval_task_config.n_samples > len( + # preprocessed + # ): + # eval_task_config.n_samples = len(preprocessed) + + # print( + # f"Eval generation start, generate on {eval_task_config.n_samples} samples" + # ) + + # for sample in preprocessed[: eval_task_config.n_samples]: + # print("generation: ", sample) + # response = await self.inference_api.chat_completion( + # model=model, + # messages=sample.preprocessed["messages"], + # stream=False, + # ) + # sample.prediction = PredictionSample( + # completion_message=response.completion_message.content + # ) + # generation_outputs.append(sample) + + # postprocessed = task_impl.postprocess(generation_outputs) + # eval_results = task_impl.score(postprocessed) + # aggr_result = task_impl.aggregate_results(eval_results) + # return EvaluateResponse( + # eval_result=aggr_result, + # ) diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
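As a rough usage sketch (not part of the patch itself): the new EvaluateTaskConfig introduced in llama_stack/apis/evals/evals.py can also be built explicitly on the client side, instead of relying on the defaults that run_eval_task() constructs when only model/dataset/task are supplied. Class and field names below come from the diffs above; the dataset and model identifiers are just the examples already used in this series.

from llama_stack.apis.evals.evals import (
    EvaluateDatasetConfig,
    EvaluateModelGenerationConfig,
    EvaluateScoringConfig,
    EvaluateSingleScorerConfig,
    EvaluateTaskConfig,
)

# Mirrors the server-side defaults in run_eval_task(): a registered dataset,
# a row limit, a generation model, and a list of scorer names.
eval_task_config = EvaluateTaskConfig(
    dataset_config=EvaluateDatasetConfig(
        dataset_name="mmlu-simple-eval-en",  # registered in DATASETS_REGISTRY
        row_limit=2,
    ),
    generation_config=EvaluateModelGenerationConfig(
        model="Llama3.1-8B-Instruct",
    ),
    scoring_config=EvaluateScoringConfig(
        scorer_config_list=[
            EvaluateSingleScorerConfig(scorer_name="accuracy"),
        ]
    ),
)

# Passed through EvaluationClient.run_evals(), this object is serialized into
# the "eval_task_config" field of the POST body sent to /evals/run_eval_task.
# response = await client.run_evals(
#     model="Llama3.1-8B-Instruct",
#     task="mmlu",
#     dataset="mmlu-simple-eval-en",
#     eval_task_config=eval_task_config,
# )
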
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py similarity index 60% rename from llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py rename to llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py index e3d9e4ef3c..83460bb0c5 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py @@ -7,8 +7,6 @@ from llama_stack.apis.evals import * # noqa: F403 -# from llama_stack.distribution.registry.tasks.task import BaseTask - QUERY_TEMPLATE_MULTICHOICE = """ Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. @@ -112,60 +110,78 @@ def normalize_extracted_answer(extracted_answer: str) -> str: ) -class MMLUTask(BaseTask[DictSample, ProcessedDictSample]): +class MMLUProcessor( + BaseGeneratorProcessor[ + DictSample, PreprocessedSample, GenerationResponseSample, ScorerInputSample + ] +): """ - MMLU Task. + Generator processor for MMLU """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def preprocess_sample(self, sample: ProcessedDictSample) -> ProcessedDictSample: + def preprocess_sample(self, sample: DictSample) -> PreprocessedSample: content = QUERY_TEMPLATE_MULTICHOICE.format(**sample.data) - preprocessed = { - "messages": [ - { - "role": "user", - "content": content, - } - ], - } - processed_sample = ProcessedDictSample( - data=sample.data, - preprocessed=preprocessed, + preprocessed_msgs = [ + { + "role": "user", + "content": content, + } + ] + processed_sample = PreprocessedSample( + generation_input=GenerationInput( + messages=preprocessed_msgs, + ) ) return processed_sample - def postprocess_sample(self, sample: ProcessedDictSample) -> ProcessedDictSample: - if not sample.postprocessed: - sample.postprocessed = {} - sample.postprocessed["postprocessed"] = normalize_response( - sample.prediction.completion_message - ) - return sample - - def score_sample(self, sample: ProcessedDictSample) -> SingleEvalResult: - postprocessed_output = sample.postprocessed["postprocessed"] - expected_answer = sample.data["Answer"] + def postprocess_sample( + self, generation_sample: GenerationResponseSample, dataset_sample: DictSample + ) -> ScorerInputSample: + response_text = generation_sample.generation_output.completion_message + normalized_response = normalize_response(response_text) - extracted_answer = None + # extract answer + extracted_answer = "" for answer_regex in MULTILINGUAL_ANSWER_REGEXES: regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) - match = re.search(regex, postprocessed_output) + match = re.search(regex, normalized_response) if match: extracted_answer = normalize_extracted_answer(match.group(1)) break - score = 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 - - return SingleEvalResult( - score_data={ - "score": score, - }, + return ScorerInputSample( + generation_output=PostprocessedGeneration( + completion_message=response_text, + transformed_generation=extracted_answer, + ), + expected_output=dataset_sample.data["Answer"], ) - def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: - print("aggregate_results", eval_results) - sum_score = sum([result.score_data["score"] for result in 
eval_results]) + # def score_sample(self, sample: ProcessedDictSample) -> SingleEvalResult: + # postprocessed_output = sample.postprocessed["postprocessed"] + # expected_answer = sample.data["Answer"] + + # extracted_answer = None + # for answer_regex in MULTILINGUAL_ANSWER_REGEXES: + # regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) + # match = re.search(regex, postprocessed_output) + # if match: + # extracted_answer = normalize_extracted_answer(match.group(1)) + # break + + # score = 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 + + # return SingleEvalResult( + # score_data={ + # "score": score, + # }, + # ) + + # def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + # print("aggregate_results", eval_results) + # sum_score = sum([result.score_data["score"] for result in eval_results]) - return EvalResult(metrics={"score": str(sum_score / len(eval_results))}) + # return EvalResult(metrics={"score": str(sum_score / len(eval_results))}) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py new file mode 100644 index 0000000000..ff9639ecd7 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py @@ -0,0 +1,78 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+import random + +from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult +from llama_stack.apis.dataset.dataset import * # noqa: F401 F403 + + +class AggregateScorer(BaseScorer[ScorerInputSample]): + def __init__(self, scorers: List[BaseScorer[ScorerInputSample]]): + self.scorers = scorers + + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + all_score_data = {} + for scorer in self.scorers: + score_data = scorer.score_sample(scorer_input_sample).score_data + for k, v in score_data.items(): + all_score_data[k] = v + + return SingleEvalResult( + score_data=all_score_data, + ) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + all_metrics = {} + + for scorer in self.scorers: + metrics = scorer.aggregate_results(eval_results).metrics + for k, v in metrics.items(): + all_metrics[f"{scorer.__class__.__name__}:{k}"] = v + + return EvalResult( + metrics=all_metrics, + ) + + +class RandomScorer(BaseScorer[ScorerInputSample]): + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + return SingleEvalResult(score_data={"random": random.random()}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + avg_random = sum( + [result.score_data["random"] for result in eval_results] + ) / len(eval_results) + max_random = max([result.score_data["random"] for result in eval_results]) + return EvalResult( + metrics={ + "avg_random": avg_random, + "max_random": max_random, + } + ) + + +class AccuracyScorer(BaseScorer[ScorerInputSample]): + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + extracted_answer = scorer_input_sample.generation_output.transformed_generation + expected_answer = scorer_input_sample.expected_output + + accuracy = ( + 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 + ) + + return SingleEvalResult(score_data={"accuracy": accuracy}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + num_correct = sum([result.score_data["accuracy"] for result in eval_results]) + num_total = len(eval_results) + + return EvalResult( + metrics={ + "avg_accuracy": num_correct / num_total, + "num_correct": num_correct, + "num_total": num_total, + } + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py new file mode 100644 index 0000000000..df164b4315 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -0,0 +1,39 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry + +from llama_stack.apis.evals import * # noqa: F403 + + +class RunEvalTask(BaseTask): + """ + RunEvalTask for LlamaStack + """ + + def __init__( + self, + eval_task_config, + generator_processor: Optional[BaseGeneratorProcessor] = None, + generator: Optional[BaseGenerator] = None, + scorer: Optional[BaseScorer] = None, + *args, + **kwargs, + ) -> None: + super().__init__( + generator_processor=generator_processor, + generator=generator, + scorer=scorer, + *args, + **kwargs, + ) + self.eval_task_config = eval_task_config + self.dataset = DatasetRegistry.get_dataset( + eval_task_config.dataset_config.dataset_name + ) + + def run(self, *args, **kwargs) -> EvalResult: + print(f"Running eval task on {self.dataset}") + return EvalResult() diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml index 430ce61020..3c9f73e0b1 100644 --- a/tests/examples/local-run.yaml +++ b/tests/examples/local-run.yaml @@ -14,8 +14,8 @@ apis: - evals providers: evals: - - provider_id: eleuther - provider_type: eleuther + - provider_id: meta-reference + provider_type: meta-reference config: {} inference: - provider_id: remote::tgi From 8890de732204f6906588ae250354a2469d749f54 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Sun, 13 Oct 2024 23:30:21 -0700 Subject: [PATCH 08/27] cleanup original BaseTask --- llama_stack/apis/evals/evals.py | 41 --------------- .../impls/meta_reference/evals/evals.py | 52 ------------------- 2 files changed, 93 deletions(-) diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 92657f6b5c..098fa5cc45 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -235,47 +235,6 @@ def run(self, *args, **kwargs) -> EvalResult: raise NotImplementedError() -# class BaseTask(ABC, Generic[TDatasetSample, TProcessedSample]): -# """ -# A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. -# Base class for all evaluation tasks. 
Each task needs to implement the following methods: -# - F1: preprocess_sample(self) -# - F2: postprocess_sample(self) -# - F3: score_sample(self) -# """ - -# def __init__(self, *args, **kwargs) -> None: -# super().__init__(*args, **kwargs) -# self._name = self.__class__.__name__ - -# @abstractmethod -# def preprocess_sample(self, sample: TDatasetSample) -> TProcessedSample: -# raise NotImplementedError() - -# @abstractmethod -# def postprocess_sample(self, sample: TProcessedSample) -> TProcessedSample: -# raise NotImplementedError() - -# @abstractmethod -# def score_sample(self, sample: TProcessedSample) -> SingleEvalResult: -# raise NotImplementedError() - -# @abstractmethod -# def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: -# raise NotImplementedError() - -# def preprocess( -# self, dataset: BaseDataset[TProcessedSample] -# ) -> List[TProcessedSample]: -# return [self.preprocess_sample(sample) for sample in dataset] - -# def postprocess(self, generation: List[TProcessedSample]) -> List[TProcessedSample]: -# return [self.postprocess_sample(sample) for sample in generation] - -# def score(self, postprocessed: List[TProcessedSample]) -> List[SingleEvalResult]: -# return [self.score_sample(sample) for sample in postprocessed] - - class Evals(Protocol): @webmethod(route="/evals/run_eval_task") diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 0fbce823e8..411aa0bc2d 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -136,55 +136,3 @@ async def run_scorer( return EvaluateResponse( eval_result={}, ) - - # async def run_evals( - # self, - # model: str, - # task: str, - # dataset: Optional[str] = None, - # eval_task_config: Optional[EvaluateTaskConfig] = None, - # ) -> EvaluateResponse: - # cprint( - # f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}", - # "red", - # ) - # if not dataset: - # raise ValueError("dataset must be specified for mete-reference evals") - - # dataset = DatasetRegistry.get_dataset(dataset) - # dataset.load() - - # task_impl = TaskRegistry.get_task(task)() - # preprocessed = task_impl.preprocess(dataset) - - # # TODO: replace w/ batch inference & async return eval job - # generation_outputs = [] - # if eval_task_config is None: - # eval_task_config = EvaluateTaskConfig(n_samples=len(preprocessed)) - # if eval_task_config.n_samples is None or eval_task_config.n_samples > len( - # preprocessed - # ): - # eval_task_config.n_samples = len(preprocessed) - - # print( - # f"Eval generation start, generate on {eval_task_config.n_samples} samples" - # ) - - # for sample in preprocessed[: eval_task_config.n_samples]: - # print("generation: ", sample) - # response = await self.inference_api.chat_completion( - # model=model, - # messages=sample.preprocessed["messages"], - # stream=False, - # ) - # sample.prediction = PredictionSample( - # completion_message=response.completion_message.content - # ) - # generation_outputs.append(sample) - - # postprocessed = task_impl.postprocess(generation_outputs) - # eval_results = task_impl.score(postprocessed) - # aggr_result = task_impl.aggregate_results(eval_results) - # return EvaluateResponse( - # eval_result=aggr_result, - # ) From 78cb88c3c4b2593c4d03b33981cbbd40c07bab8a Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Sun, 13 Oct 2024 23:48:15 -0700 Subject: [PATCH 09/27] RunEvalTask / InferenceGenerator 
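Split the inline generation loop in MetaReferenceEvalsImpl into reusable pieces: an
InferenceGenerator that wraps the Inference API, and a RunEvalTask that composes the
preprocess -> generate -> postprocess -> score pipeline. Roughly, the flow (a simplified
sketch of run_eval_task.py in this patch; error handling omitted) is:

    dataset = DatasetRegistry.get_dataset(eval_task_config.dataset_config.dataset_name)
    dataset.load(n_samples=eval_task_config.dataset_config.row_limit)

    processor = MMLUProcessor()
    preprocessed = processor.preprocess(dataset)

    generator = InferenceGenerator(
        model=eval_task_config.generation_config.model,
        inference_api=inference_api,
    )
    generations = await generator.generate(preprocessed)

    postprocessed = processor.postprocess(generations, dataset)
    scorer = AggregateScorer(scorers=[AccuracyScorer(), RandomScorer()])
    eval_result = scorer.aggregate_results(scorer.score(postprocessed))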
--- llama_stack/apis/evals/evals.py | 6 +- .../impls/meta_reference/evals/evals.py | 69 ++----------------- .../evals/generator/__init__.py | 5 ++ .../evals/generator/inference_generator.py | 48 +++++++++++++ .../evals/tasks/run_eval_task.py | 67 +++++++++++++----- 5 files changed, 111 insertions(+), 84 deletions(-) create mode 100644 llama_stack/providers/impls/meta_reference/evals/generator/__init__.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 098fa5cc45..a62fa4418a 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -171,7 +171,7 @@ def postprocess_sample( raise NotImplementedError() -class BaseGenerator(ABC, Generic[TGenerationResponseSample]): +class BaseGenerator(ABC, Generic[TPreprocessedSample, TGenerationResponseSample]): """ Base class for all generators. Each generator needs to implement the following methods: - generate(self, preprocessed_dataset) @@ -184,7 +184,7 @@ def __str__(self) -> str: return self.__class__.__name__ @abstractmethod - def generate( + async def generate( self, preprocessed_dataset: List[TPreprocessedSample] ) -> List[TGenerationResponseSample]: raise NotImplementedError() @@ -231,7 +231,7 @@ def __init__( self.scorer = scorer @abstractmethod - def run(self, *args, **kwargs) -> EvalResult: + async def run(self, *args, **kwargs) -> EvalResult: raise NotImplementedError() diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 411aa0bc2d..f717fc9d8e 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -7,25 +7,14 @@ from termcolor import cprint -from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import ( - AggregateScorer, -) - from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.evals import * # noqa: F403 from llama_stack.apis.dataset import * # noqa: F403 -from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry -from llama_stack.providers.impls.meta_reference.evals.processor.mmlu_processor import ( - MMLUProcessor, -) +from .config import MetaReferenceEvalsImplConfig # from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry -# from .tasks.run_eval_task import RunEvalTask -from .scorer.basic_scorers import * # noqa: F403 - - -from .config import MetaReferenceEvalsImplConfig +from .tasks.run_eval_task import RunEvalTask class MetaReferenceEvalsImpl(Evals): @@ -70,58 +59,8 @@ async def run_eval_task( ), ) - # TODO: wrap inside task - # run_task = RunEvalTask( - # eval_task_config=eval_task_config, - # ) - # eval_result = run_task.run() - - dataset = DatasetRegistry.get_dataset( - eval_task_config.dataset_config.dataset_name - ) - dataset.load(n_samples=eval_task_config.dataset_config.row_limit) - print(f"Running on {len(dataset)} samples") - - # F1 - processor = MMLUProcessor() - preprocessed = processor.preprocess(dataset) - - # Generation - # TODO: wrap inside BaseGenerator - generation_outputs = [] - for sample in preprocessed: - print("generation: ", sample) - response = await self.inference_api.chat_completion( - model=model, - messages=sample.generation_input.messages, - stream=False, - ) - cprint(f"response: {response}", "cyan") - - generation_outputs.append( - GenerationResponseSample( - 
generation_output=GenerationOutput( - completion_message=response.completion_message.content - ) - ) - ) - cprint(generation_outputs, "green") - - # F2 - postprocessed = processor.postprocess(generation_outputs, dataset) - cprint(postprocessed, "blue") - - # F3 - scorer - scorer = AggregateScorer( - scorers=[ - AccuracyScorer(), - RandomScorer(), - ] - ) - - scorer_results = scorer.score(postprocessed) - cprint(scorer_results, "magenta") - eval_result = scorer.aggregate_results(scorer_results) + run_task = RunEvalTask() + eval_result = await run_task.run(eval_task_config, self.inference_api) return EvaluateResponse( eval_result=eval_result, diff --git a/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py b/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py new file mode 100644 index 0000000000..adc181e237 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from termcolor import cprint + +from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.inference import * # noqa: F403 + + +class InferenceGenerator(BaseGenerator[PreprocessedSample, GenerationResponseSample]): + """ + InferenceGenerator for LlamaStack + """ + + def __init__( + self, + model, + inference_api, + *args, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + self.model = model + self.inference_api = inference_api + + async def generate( + self, preprocessed_dataset: List[PreprocessedSample] + ) -> List[GenerationResponseSample]: + generation_outputs = [] + for sample in preprocessed_dataset: + print("generation: ", sample) + response = await self.inference_api.chat_completion( + model=self.model, + messages=sample.generation_input.messages, + stream=False, + ) + cprint(f"response: {response}", "cyan") + + generation_outputs.append( + GenerationResponseSample( + generation_output=GenerationOutput( + completion_message=response.completion_message.content + ) + ) + ) + return generation_outputs diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index df164b4315..f3a66e18b0 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -4,8 +4,17 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry +from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import ( + InferenceGenerator, +) +from llama_stack.providers.impls.meta_reference.evals.processor.mmlu_processor import ( + MMLUProcessor, +) from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.inference import * # noqa: F403 +from termcolor import cprint class RunEvalTask(BaseTask): @@ -15,25 +24,51 @@ class RunEvalTask(BaseTask): def __init__( self, - eval_task_config, - generator_processor: Optional[BaseGeneratorProcessor] = None, - generator: Optional[BaseGenerator] = None, - scorer: Optional[BaseScorer] = None, *args, **kwargs, ) -> None: - super().__init__( - generator_processor=generator_processor, - generator=generator, - scorer=scorer, - *args, - **kwargs, - ) - self.eval_task_config = eval_task_config - self.dataset = DatasetRegistry.get_dataset( + super().__init__(*args, **kwargs) + + async def run( + self, + eval_task_config: EvaluateTaskConfig, + inference_api: Inference, + *args, + **kwargs, + ) -> EvalResult: + print(f"Running eval task w/ {eval_task_config}") + + dataset = DatasetRegistry.get_dataset( eval_task_config.dataset_config.dataset_name ) + dataset.load(n_samples=eval_task_config.dataset_config.row_limit) + print(f"Running on {len(dataset)} samples") + + # F1 + processor = MMLUProcessor() + preprocessed = processor.preprocess(dataset) + + # Generation + generator = InferenceGenerator( + model=eval_task_config.generation_config.model, + inference_api=inference_api, + ) + generation_outputs = await generator.generate(preprocessed) + + # F2 + postprocessed = processor.postprocess(generation_outputs, dataset) + cprint(postprocessed, "blue") + + # F3 - scorer + scorer = AggregateScorer( + scorers=[ + AccuracyScorer(), + RandomScorer(), + ] + ) + + scorer_results = scorer.score(postprocessed) + cprint(scorer_results, "magenta") + eval_result = scorer.aggregate_results(scorer_results) - def run(self, *args, **kwargs) -> EvalResult: - print(f"Running eval task on {self.dataset}") - return EvalResult() + return eval_result From 18fe966e96297d797b4c86b343e037855c8af613 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 00:12:46 -0700 Subject: [PATCH 10/27] registry refactor --- .../registry/datasets/__init__.py | 7 +++- .../registry/datasets/dataset_registry.py | 32 ------------------- .../registry/generator_processors/__init__.py | 12 +++++++ llama_stack/distribution/registry/registry.py | 32 +++++++++++++++++++ .../distribution/registry/scorers/__init__.py | 7 ++++ .../registry/scorers/scorer_registry.py | 32 ------------------- .../distribution/registry/tasks/__init__.py | 5 --- .../registry/tasks/task_registry.py | 32 ------------------- .../evals/tasks/run_eval_task.py | 7 ++-- 9 files changed, 60 insertions(+), 106 deletions(-) delete mode 100644 llama_stack/distribution/registry/datasets/dataset_registry.py create mode 100644 llama_stack/distribution/registry/generator_processors/__init__.py create mode 100644 llama_stack/distribution/registry/registry.py delete mode 100644 llama_stack/distribution/registry/scorers/scorer_registry.py delete mode 100644 llama_stack/distribution/registry/tasks/__init__.py delete mode 100644 llama_stack/distribution/registry/tasks/task_registry.py diff --git a/llama_stack/distribution/registry/datasets/__init__.py 
b/llama_stack/distribution/registry/datasets/__init__.py index f0636212ae..68de3fa879 100644 --- a/llama_stack/distribution/registry/datasets/__init__.py +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -6,8 +6,13 @@ # TODO: make these import config based from llama_stack.apis.dataset import * # noqa: F403 +from ..registry import Registry from .dataset import CustomDataset, HuggingfaceDataset -from .dataset_registry import DatasetRegistry + + +class DatasetRegistry(Registry[BaseDataset]): + _REGISTRY: Dict[str, BaseDataset] = {} + DATASETS_REGISTRY = [ CustomDataset( diff --git a/llama_stack/distribution/registry/datasets/dataset_registry.py b/llama_stack/distribution/registry/datasets/dataset_registry.py deleted file mode 100644 index 8e9b22266a..0000000000 --- a/llama_stack/distribution/registry/datasets/dataset_registry.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. -from typing import AbstractSet, Dict - -from llama_stack.apis.dataset import BaseDataset - - -class DatasetRegistry: - _REGISTRY: Dict[str, BaseDataset] = {} - - @staticmethod - def names() -> AbstractSet[str]: - return DatasetRegistry._REGISTRY.keys() - - @staticmethod - def register(name: str, task: BaseDataset) -> None: - if name in DatasetRegistry._REGISTRY: - raise ValueError(f"Dataset {name} already exists.") - DatasetRegistry._REGISTRY[name] = task - - @staticmethod - def get_dataset(name: str) -> BaseDataset: - if name not in DatasetRegistry._REGISTRY: - raise ValueError(f"Dataset {name} not found.") - return DatasetRegistry._REGISTRY[name] - - @staticmethod - def reset() -> None: - DatasetRegistry._REGISTRY = {} diff --git a/llama_stack/distribution/registry/generator_processors/__init__.py b/llama_stack/distribution/registry/generator_processors/__init__.py new file mode 100644 index 0000000000..bb9d5c1824 --- /dev/null +++ b/llama_stack/distribution/registry/generator_processors/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from llama_stack.apis.evals import * # noqa: F403 + +from ..registry import Registry + + +class GeneratorProcessorRegistry(Registry[BaseGeneratorProcessor]): + _REGISTRY: Dict[str, BaseGeneratorProcessor] = {} diff --git a/llama_stack/distribution/registry/registry.py b/llama_stack/distribution/registry/registry.py new file mode 100644 index 0000000000..b4a5b626d9 --- /dev/null +++ b/llama_stack/distribution/registry/registry.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
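+# A minimal process-wide registry mapping a string name to a registered object
+# (subclassed by the dataset/scorer/generator-processor registries); register()
+# rejects duplicate names and get() raises ValueError for unknown names.
+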
+from typing import AbstractSet, Dict, Generic, TypeVar + +TRegistry = TypeVar("TRegistry") + + +class Registry(Generic[TRegistry]): + _REGISTRY: Dict[str, TRegistry] = {} + + @staticmethod + def names() -> AbstractSet[str]: + return Registry._REGISTRY.keys() + + @staticmethod + def register(name: str, task: TRegistry) -> None: + if name in Registry._REGISTRY: + raise ValueError(f"Dataset {name} already exists.") + Registry._REGISTRY[name] = task + + @staticmethod + def get(name: str) -> TRegistry: + if name not in Registry._REGISTRY: + raise ValueError(f"Dataset {name} not found.") + return Registry._REGISTRY[name] + + @staticmethod + def reset() -> None: + Registry._REGISTRY = {} diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py index 76edd2ebd3..3332b70527 100644 --- a/llama_stack/distribution/registry/scorers/__init__.py +++ b/llama_stack/distribution/registry/scorers/__init__.py @@ -4,3 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. # TODO: make these import config based +from llama_stack.apis.evals import * # noqa: F403 + +from ..registry import Registry + + +class ScorerRegistry(Registry[BaseScorer]): + _REGISTRY: Dict[str, BaseScorer] = {} diff --git a/llama_stack/distribution/registry/scorers/scorer_registry.py b/llama_stack/distribution/registry/scorers/scorer_registry.py deleted file mode 100644 index b6a382c531..0000000000 --- a/llama_stack/distribution/registry/scorers/scorer_registry.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. -from typing import AbstractSet, Dict - -from llama_stack.apis.evals import BaseScorer - - -class ScorerRegistry: - _REGISTRY: Dict[str, BaseScorer] = {} - - @staticmethod - def names() -> AbstractSet[str]: - return ScorerRegistry._REGISTRY.keys() - - @staticmethod - def register(name: str, scorer: BaseScorer) -> None: - if name in ScorerRegistry._REGISTRY: - raise ValueError(f"Task {name} already exists.") - ScorerRegistry._REGISTRY[name] = task - - @staticmethod - def get_scorer(name: str) -> BaseScorer: - if name not in ScorerRegistry._REGISTRY: - raise ValueError(f"Task {name} not found.") - return ScorerRegistry._REGISTRY[name] - - @staticmethod - def reset() -> None: - ScorerRegistry._REGISTRY = {} diff --git a/llama_stack/distribution/registry/tasks/__init__.py b/llama_stack/distribution/registry/tasks/__init__.py deleted file mode 100644 index 756f351d88..0000000000 --- a/llama_stack/distribution/registry/tasks/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. diff --git a/llama_stack/distribution/registry/tasks/task_registry.py b/llama_stack/distribution/registry/tasks/task_registry.py deleted file mode 100644 index df25686ba6..0000000000 --- a/llama_stack/distribution/registry/tasks/task_registry.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
-from typing import AbstractSet, Dict - -from llama_stack.apis.evals import BaseTask - - -class TaskRegistry: - _REGISTRY: Dict[str, BaseTask] = {} - - @staticmethod - def names() -> AbstractSet[str]: - return TaskRegistry._REGISTRY.keys() - - @staticmethod - def register(name: str, task: BaseTask) -> None: - if name in TaskRegistry._REGISTRY: - raise ValueError(f"Task {name} already exists.") - TaskRegistry._REGISTRY[name] = task - - @staticmethod - def get_task(name: str) -> BaseTask: - if name not in TaskRegistry._REGISTRY: - raise ValueError(f"Task {name} not found.") - return TaskRegistry._REGISTRY[name] - - @staticmethod - def reset() -> None: - TaskRegistry._REGISTRY = {} diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index f3a66e18b0..fde2efdb08 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -3,7 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry +from llama_stack.distribution.registry.datasets import DatasetRegistry from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import ( InferenceGenerator, @@ -38,9 +38,8 @@ async def run( ) -> EvalResult: print(f"Running eval task w/ {eval_task_config}") - dataset = DatasetRegistry.get_dataset( - eval_task_config.dataset_config.dataset_name - ) + print(DatasetRegistry.names()) + dataset = DatasetRegistry.get(eval_task_config.dataset_config.dataset_name) dataset.load(n_samples=eval_task_config.dataset_config.row_limit) print(f"Running on {len(dataset)} samples") From f046899a1cf4b35c1f1f4092196b98437cd3e2b2 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 13:16:39 -0700 Subject: [PATCH 11/27] datasets api --- .../apis/{dataset => datasets}/__init__.py | 2 +- llama_stack/apis/datasets/client.py | 92 +++++++++++++++++++ .../dataset.py => datasets/datasets.py} | 10 +- llama_stack/apis/evals/evals.py | 2 +- llama_stack/distribution/datatypes.py | 10 ++ llama_stack/distribution/distribution.py | 20 +++- llama_stack/distribution/registry/__init__.py | 17 ++++ .../registry/datasets/__init__.py | 4 +- .../distribution/registry/datasets/dataset.py | 90 ++++++------------ .../registry/datasets/dataset_wrappers.py | 78 ++++++++++++++++ llama_stack/distribution/resolver.py | 23 +++++ llama_stack/providers/datatypes.py | 4 +- .../impls/meta_reference/evals/evals.py | 6 +- .../evals/scorer/basic_scorers.py | 2 +- tests/examples/local-run.yaml | 1 + 15 files changed, 281 insertions(+), 80 deletions(-) rename llama_stack/apis/{dataset => datasets}/__init__.py (82%) create mode 100644 llama_stack/apis/datasets/client.py rename llama_stack/apis/{dataset/dataset.py => datasets/datasets.py} (96%) create mode 100644 llama_stack/distribution/registry/datasets/dataset_wrappers.py diff --git a/llama_stack/apis/dataset/__init__.py b/llama_stack/apis/datasets/__init__.py similarity index 82% rename from llama_stack/apis/dataset/__init__.py rename to llama_stack/apis/datasets/__init__.py index 33557a0ab1..102b9927f3 100644 --- a/llama_stack/apis/dataset/__init__.py +++ b/llama_stack/apis/datasets/__init__.py @@ -4,4 +4,4 @@ # This source 
code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from .dataset import * # noqa: F401 F403 +from .datasets import * # noqa: F401 F403 diff --git a/llama_stack/apis/datasets/client.py b/llama_stack/apis/datasets/client.py new file mode 100644 index 0000000000..241db65689 --- /dev/null +++ b/llama_stack/apis/datasets/client.py @@ -0,0 +1,92 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import asyncio +import json + +import fire +import httpx + +from .datasets import * # noqa: F403 + + +class DatasetClient(Datasets): + def __init__(self, base_url: str): + self.base_url = base_url + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def create_dataset( + self, + dataset_def: DatasetDef, + ) -> None: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/datasets/create", + json={ + "dataset_def": json.loads(dataset_def.json()), + }, + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + return None + + async def get_dataset( + self, + dataset_identifier: str, + ) -> DatasetDef: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/datasets/create", + json={ + "dataset_identifier": dataset_identifier, + }, + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + return DatasetDef(**response.json()) + + async def delete_dataset( + self, + dataset_identifier: str, + ) -> DatasetDef: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/datasets/delete", + json={ + "dataset_identifier": dataset_identifier, + }, + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + return None + + +async def run_main(host: str, port: int): + client = DatasetClient(f"http://{host}:{port}") + + # Custom Eval Task + response = await client.create_dataset( + dataset_def=CustomDatasetDef( + identifier="test-dataset", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ), + ) + + +def main(host: str, port: int): + asyncio.run(run_main(host, port)) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/datasets/datasets.py similarity index 96% rename from llama_stack/apis/dataset/dataset.py rename to llama_stack/apis/datasets/datasets.py index 798f3aba99..c79301557c 100644 --- a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/datasets/datasets.py @@ -143,19 +143,19 @@ def load(self) -> None: class Datasets(Protocol): @webmethod(route="/datasets/create") - def create_dataset( + async def create_dataset( self, - dataset: DatasetDef, + dataset_def: DatasetDef, ) -> None: ... @webmethod(route="/datasets/get") - def get_dataset( + async def get_dataset( self, dataset_identifier: str, ) -> DatasetDef: ... @webmethod(route="/datasets/delete") - def delete_dataset( + async def delete_dataset( self, - dataset_uuid: str, + dataset_identifier: str, ) -> None: ... 
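The Datasets protocol above accepts either DatasetDef variant through the same
create_dataset call; a minimal sketch using the DatasetClient from this patch (the
helper name, host and port are illustrative placeholders, and the HuggingfaceDatasetDef
fields mirror their use elsewhere in this series):

    async def register_examples(host: str, port: int):
        client = DatasetClient(f"http://{host}:{port}")
        # CSV-backed custom dataset, fetched from a URL
        await client.create_dataset(
            dataset_def=CustomDatasetDef(
                identifier="mmlu-simple-eval-en",
                url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
            ),
        )
        # Huggingface-hosted dataset, loaded via datasets.load_dataset kwargs
        await client.create_dataset(
            dataset_def=HuggingfaceDatasetDef(
                identifier="hellaswag",
                dataset_name="hellaswag",
                kwargs={"split": "validation", "trust_remote_code": True},
            ),
        )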
diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index a62fa4418a..af0b291e87 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -11,7 +11,7 @@ from pydantic import BaseModel from llama_models.llama3.api.datatypes import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 class EvaluationJob(BaseModel): diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 0044de09ee..ce7f5a8e50 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -73,6 +73,16 @@ class RoutingTableProviderSpec(ProviderSpec): pip_packages: List[str] = Field(default_factory=list) +# Example: /datasets +class RegistryProviderSpec(ProviderSpec): + provider_type: str = "registry" + config_class: str = "" + docker_image: Optional[str] = None + + module: str + pip_packages: List[str] = Field(default_factory=list) + + class DistributionSpec(BaseModel): description: Optional[str] = Field( default="", diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index 999646cc06..d96db23b46 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -21,6 +21,19 @@ class AutoRoutedApiInfo(BaseModel): router_api: Api +class RegistryApiInfo(BaseModel): + registry_api: Api + # registry: Registry + + +def builtin_registry_apis() -> List[RegistryApiInfo]: + return [ + RegistryApiInfo( + registry_api=Api.datasets, + ) + ] + + def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: return [ AutoRoutedApiInfo( @@ -42,7 +55,12 @@ def providable_apis() -> List[Api]: routing_table_apis = set( x.routing_table_api for x in builtin_automatically_routed_apis() ) - return [api for api in Api if api not in routing_table_apis and api != Api.inspect] + registry_apis = set( + x.registry_api for x in builtin_registry_apis() if x.registry_api + ) + non_providable_apis = routing_table_apis | registry_apis | {Api.inspect} + + return [api for api in Api if api not in non_providable_apis] def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]: diff --git a/llama_stack/distribution/registry/__init__.py b/llama_stack/distribution/registry/__init__.py index 756f351d88..6e68333280 100644 --- a/llama_stack/distribution/registry/__init__.py +++ b/llama_stack/distribution/registry/__init__.py @@ -3,3 +3,20 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any + +from llama_stack.providers.datatypes import Api +from .datasets.dataset import DatasetRegistryImpl + + +async def get_registry_impl(api: Api, _deps) -> Any: + api_to_registry = { + "datasets": DatasetRegistryImpl, + } + + if api.value not in api_to_registry: + raise ValueError(f"API {api.value} not found in registry map") + + impl = api_to_registry[api.value]() + await impl.initialize() + return impl diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py index 68de3fa879..384028b9e3 100644 --- a/llama_stack/distribution/registry/datasets/__init__.py +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -5,9 +5,9 @@ # the root directory of this source tree. 
# TODO: make these import config based -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 from ..registry import Registry -from .dataset import CustomDataset, HuggingfaceDataset +from .dataset_wrappers import CustomDataset, HuggingfaceDataset class DatasetRegistry(Registry[BaseDataset]): diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py index 0bd86b8d49..936fd0713b 100644 --- a/llama_stack/distribution/registry/datasets/dataset.py +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -3,76 +3,38 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import pandas -from datasets import Dataset, load_dataset -from llama_stack.apis.dataset import * # noqa: F403 +# from llama_stack.apis.datasets import * +# from llama_stack.distribution.registry.datasets import DatasetRegistry # noqa: F403 +# from ..registry import Registry +# from .dataset_wrappers import CustomDataset, HuggingfaceDataset -class CustomDataset(BaseDataset[DictSample]): - def __init__(self, config: CustomDatasetDef) -> None: - super().__init__() - self.config = config - self.dataset = None - self.index = 0 +class DatasetRegistryImpl(Datasets): + """API Impl to interact with underlying dataset registry""" - @property - def dataset_id(self) -> str: - return self.config.identifier + def __init__( + self, + ) -> None: + pass - def __iter__(self) -> Iterator[DictSample]: - if not self.dataset: - self.load() - return (DictSample(data=x) for x in self.dataset) + async def initialize(self) -> None: + pass - def __str__(self) -> str: - return f"CustomDataset({self.config})" + async def shutdown(self) -> None: + pass - def __len__(self) -> int: - if not self.dataset: - self.load() - return len(self.dataset) + async def create_dataset( + self, + dataset_def: DatasetDef, + ) -> None: + print(f"Creating dataset {dataset.identifier}") - def load(self, n_samples: Optional[int] = None) -> None: - if self.dataset: - return + async def get_dataset( + self, + dataset_identifier: str, + ) -> DatasetDef: + pass - # TODO: better support w/ data url - if self.config.url.endswith(".csv"): - df = pandas.read_csv(self.config.url) - elif self.config.url.endswith(".xlsx"): - df = pandas.read_excel(self.config.url) - - if n_samples is not None: - df = df.sample(n=n_samples) - - self.dataset = Dataset.from_pandas(df) - - -class HuggingfaceDataset(BaseDataset[DictSample]): - def __init__(self, config: HuggingfaceDatasetDef): - super().__init__() - self.config = config - self.dataset = None - - @property - def dataset_id(self) -> str: - return self.config.identifier - - def __iter__(self) -> Iterator[DictSample]: - if not self.dataset: - self.load() - return (DictSample(data=x) for x in self.dataset) - - def __str__(self): - return f"HuggingfaceDataset({self.config})" - - def __len__(self): - if not self.dataset: - self.load() - return len(self.dataset) - - def load(self): - if self.dataset: - return - self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs) + async def delete_dataset(self, dataset_identifier: str) -> None: + pass diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py new file mode 100644 index 0000000000..e18165a110 --- /dev/null +++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py @@ -0,0 +1,78 @@ +# Copyright (c) 
Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import pandas +from datasets import Dataset, load_dataset + +from llama_stack.apis.datasets import * # noqa: F403 + + +class CustomDataset(BaseDataset[DictSample]): + def __init__(self, config: CustomDatasetDef) -> None: + super().__init__() + self.config = config + self.dataset = None + self.index = 0 + + @property + def dataset_id(self) -> str: + return self.config.identifier + + def __iter__(self) -> Iterator[DictSample]: + if not self.dataset: + self.load() + return (DictSample(data=x) for x in self.dataset) + + def __str__(self) -> str: + return f"CustomDataset({self.config})" + + def __len__(self) -> int: + if not self.dataset: + self.load() + return len(self.dataset) + + def load(self, n_samples: Optional[int] = None) -> None: + if self.dataset: + return + + # TODO: better support w/ data url + if self.config.url.endswith(".csv"): + df = pandas.read_csv(self.config.url) + elif self.config.url.endswith(".xlsx"): + df = pandas.read_excel(self.config.url) + + if n_samples is not None: + df = df.sample(n=n_samples) + + self.dataset = Dataset.from_pandas(df) + + +class HuggingfaceDataset(BaseDataset[DictSample]): + def __init__(self, config: HuggingfaceDatasetDef): + super().__init__() + self.config = config + self.dataset = None + + @property + def dataset_id(self) -> str: + return self.config.identifier + + def __iter__(self) -> Iterator[DictSample]: + if not self.dataset: + self.load() + return (DictSample(data=x) for x in self.dataset) + + def __str__(self): + return f"HuggingfaceDataset({self.config})" + + def __len__(self): + if not self.dataset: + self.load() + return len(self.dataset) + + def load(self): + if self.dataset: + return + self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs) diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index 672a4ea60f..e71c3fd8ce 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -12,6 +12,7 @@ from llama_stack.distribution.datatypes import * # noqa: F403 from llama_stack.apis.agents import Agents +from llama_stack.apis.datasets import Datasets from llama_stack.apis.evals import Evals from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect @@ -23,6 +24,7 @@ from llama_stack.apis.telemetry import Telemetry from llama_stack.distribution.distribution import ( builtin_automatically_routed_apis, + builtin_registry_apis, get_provider_registry, ) from llama_stack.distribution.utils.dynamic import instantiate_class_type @@ -40,6 +42,7 @@ def api_protocol_map() -> Dict[Api, Any]: Api.shields: Shields, Api.telemetry: Telemetry, Api.evals: Evals, + Api.datasets: Datasets, } @@ -139,6 +142,20 @@ async def resolve_impls_with_routing(run_config: StackRunConfig) -> Dict[Api, An ) } + for info in builtin_registry_apis(): + providers_with_specs[info.registry_api.value] = { + "__builtin__": ProviderWithSpec( + provider_id="__registry__", + provider_type="__registry__", + config={}, + spec=RegistryProviderSpec( + api=info.registry_api, + module="llama_stack.distribution.registry", + deps__=[], + ), + ) + } + sorted_providers = topological_sort( {k: v.values() for k, v in providers_with_specs.items()} ) @@ -259,6 +276,12 @@ async def instantiate_provider( config = None args = [provider_spec.api, inner_impls, deps] + elif 
isinstance(provider_spec, RegistryProviderSpec): + print("ROUTER PROVIDER SPEC") + method = "get_registry_impl" + + config = None + args = [provider_spec.api, deps] else: method = "get_provider_impl" diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index 50ab0691b9..1d397c9e73 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -28,11 +28,13 @@ class Api(Enum): models = "models" shields = "shields" memory_banks = "memory_banks" - evals = "evals" # built-in API inspect = "inspect" + evals = "evals" + datasets = "datasets" + class ModelsProtocolPrivate(Protocol): async def list_models(self) -> List[ModelDef]: ... diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index f717fc9d8e..3ae988cbdc 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -9,11 +9,9 @@ from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.evals import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 from .config import MetaReferenceEvalsImplConfig - -# from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry from .tasks.run_eval_task import RunEvalTask @@ -47,7 +45,7 @@ async def run_eval_task( eval_task_config = EvaluateTaskConfig( dataset_config=EvaluateDatasetConfig( dataset_name=dataset, - row_limit=2, + row_limit=3, ), generation_config=EvaluateModelGenerationConfig( model=model, diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py index ff9639ecd7..47d41c6d61 100644 --- a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py @@ -6,7 +6,7 @@ import random from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult -from llama_stack.apis.dataset.dataset import * # noqa: F401 F403 +from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 class AggregateScorer(BaseScorer[ScorerInputSample]): diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml index 3c9f73e0b1..31fb726708 100644 --- a/tests/examples/local-run.yaml +++ b/tests/examples/local-run.yaml @@ -12,6 +12,7 @@ apis: - inference - safety - evals +- datasets providers: evals: - provider_id: meta-reference From a9210cd416ca81c74a0aa52ae22f18b615645e19 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 13:54:16 -0700 Subject: [PATCH 12/27] datasets api crud --- llama_stack/apis/datasets/client.py | 74 ++++++++++++++++--- llama_stack/apis/datasets/datasets.py | 32 +++++++- .../distribution/registry/datasets/dataset.py | 63 +++++++++++++--- llama_stack/distribution/registry/registry.py | 6 ++ 4 files changed, 151 insertions(+), 24 deletions(-) diff --git a/llama_stack/apis/datasets/client.py b/llama_stack/apis/datasets/client.py index 241db65689..476a5964a1 100644 --- a/llama_stack/apis/datasets/client.py +++ b/llama_stack/apis/datasets/client.py @@ -6,13 +6,26 @@ import asyncio import json +from typing import Optional import fire import httpx +from termcolor import cprint from .datasets import * # noqa: F403 +def deserialize_dataset_def(j: Optional[Dict[str, Any]]) -> Optional[DatasetDef]: + if not j: + return None + if j["type"] == "huggingface": + 
return HuggingfaceDatasetDef(**j) + elif j["type"] == "custom": + return CustomDatasetDef(**j) + else: + raise ValueError(f"Unknown dataset type: {j['type']}") + + class DatasetClient(Datasets): def __init__(self, base_url: str): self.base_url = base_url @@ -26,7 +39,7 @@ async def shutdown(self) -> None: async def create_dataset( self, dataset_def: DatasetDef, - ) -> None: + ) -> CreateDatasetResponse: async with httpx.AsyncClient() as client: response = await client.post( f"{self.base_url}/datasets/create", @@ -37,28 +50,31 @@ async def create_dataset( timeout=60, ) response.raise_for_status() - return None + return CreateDatasetResponse(**response.json()) async def get_dataset( self, dataset_identifier: str, - ) -> DatasetDef: + ) -> Optional[DatasetDef]: async with httpx.AsyncClient() as client: - response = await client.post( - f"{self.base_url}/datasets/create", - json={ + response = await client.get( + f"{self.base_url}/datasets/get", + params={ "dataset_identifier": dataset_identifier, }, headers={"Content-Type": "application/json"}, timeout=60, ) response.raise_for_status() - return DatasetDef(**response.json()) + if not response.json(): + return + + return deserialize_dataset_def(response.json()) async def delete_dataset( self, dataset_identifier: str, - ) -> DatasetDef: + ) -> DeleteDatasetResponse: async with httpx.AsyncClient() as client: response = await client.post( f"{self.base_url}/datasets/delete", @@ -69,19 +85,57 @@ async def delete_dataset( timeout=60, ) response.raise_for_status() - return None + return DeleteDatasetResponse(**response.json()) + + async def list_dataset( + self, + ) -> List[DatasetDef]: + async with httpx.AsyncClient() as client: + response = await client.get( + f"{self.base_url}/datasets/list", + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + if not response.json(): + return + + return [deserialize_dataset_def(x) for x in response.json()] async def run_main(host: str, port: int): client = DatasetClient(f"http://{host}:{port}") - # Custom Eval Task + # register dataset response = await client.create_dataset( dataset_def=CustomDatasetDef( identifier="test-dataset", url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", ), ) + cprint(response, "green") + + # get dataset + get_dataset = await client.get_dataset( + dataset_identifier="test-dataset", + ) + cprint(get_dataset, "cyan") + + # delete dataset + delete_dataset = await client.delete_dataset( + dataset_identifier="test-dataset", + ) + cprint(delete_dataset, "red") + + # get again after deletion + get_dataset = await client.get_dataset( + dataset_identifier="test-dataset", + ) + cprint(get_dataset, "yellow") + + # list datasets + list_dataset = await client.list_dataset() + cprint(list_dataset, "blue") def main(host: str, port: int): diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index c79301557c..11a3f60964 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -115,6 +115,27 @@ class CustomDatasetDef(BaseModel): ] +class DatasetsResponseStatus(Enum): + success = "success" + fail = "fail" + + +@json_schema_type +class CreateDatasetResponse(BaseModel): + status: DatasetsResponseStatus = Field( + description="Return status of the dataset creation", + ) + msg: Optional[str] = None + + +@json_schema_type +class DeleteDatasetResponse(BaseModel): + status: DatasetsResponseStatus = Field( + description="Return status of the dataset creation", + ) + msg: 
Optional[str] = None + + class BaseDataset(ABC, Generic[TDatasetSample]): def __init__(self) -> None: self.type: str = self.__class__.__name__ @@ -146,16 +167,19 @@ class Datasets(Protocol): async def create_dataset( self, dataset_def: DatasetDef, - ) -> None: ... + ) -> CreateDatasetResponse: ... - @webmethod(route="/datasets/get") + @webmethod(route="/datasets/get", method="GET") async def get_dataset( self, dataset_identifier: str, - ) -> DatasetDef: ... + ) -> Optional[DatasetDef]: ... @webmethod(route="/datasets/delete") async def delete_dataset( self, dataset_identifier: str, - ) -> None: ... + ) -> DeleteDatasetResponse: ... + + @webmethod(route="/datasets/list", method="GET") + async def list_datasets(self) -> List[DatasetDef]: ... diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py index 936fd0713b..838e8c65fa 100644 --- a/llama_stack/distribution/registry/datasets/dataset.py +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -4,10 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# from llama_stack.apis.datasets import * -# from llama_stack.distribution.registry.datasets import DatasetRegistry # noqa: F403 -# from ..registry import Registry -# from .dataset_wrappers import CustomDataset, HuggingfaceDataset +from llama_stack.apis.datasets import * # noqa: F403 +from llama_stack.distribution.registry.datasets import DatasetRegistry +from llama_stack.distribution.registry.datasets.dataset_wrappers import ( + CustomDataset, + HuggingfaceDataset, +) class DatasetRegistryImpl(Datasets): @@ -27,14 +29,55 @@ async def shutdown(self) -> None: async def create_dataset( self, dataset_def: DatasetDef, - ) -> None: - print(f"Creating dataset {dataset.identifier}") + ) -> CreateDatasetResponse: + if dataset_def.type == DatasetType.huggingface.value: + dataset_cls = HuggingfaceDataset(dataset_def) + else: + dataset_cls = CustomDataset(dataset_def) + + try: + DatasetRegistry.register( + dataset_def.identifier, + dataset_cls, + ) + except ValueError as e: + return CreateDatasetResponse( + status=DatasetsResponseStatus.fail, + msg=str(e), + ) + + return CreateDatasetResponse( + status=DatasetsResponseStatus.success, + msg=f"Dataset '{dataset_def.identifier}' registered", + ) async def get_dataset( self, dataset_identifier: str, - ) -> DatasetDef: - pass + ) -> Optional[DatasetDef]: + try: + dataset_ref = DatasetRegistry.get(dataset_identifier).config + except ValueError as e: + return None - async def delete_dataset(self, dataset_identifier: str) -> None: - pass + return dataset_ref + + async def delete_dataset(self, dataset_identifier: str) -> DeleteDatasetResponse: + try: + DatasetRegistry.delete(dataset_identifier) + except ValueError as e: + return DeleteDatasetResponse( + status=DatasetsResponseStatus.fail, + msg=str(e), + ) + + return DeleteDatasetResponse( + status=DatasetsResponseStatus.success, + msg=f"Dataset '{dataset_identifier}' deleted", + ) + + async def list_datasets(self) -> List[DatasetDef]: + return [ + DatasetRegistry.get(dataset_identifier).config + for dataset_identifier in DatasetRegistry.names() + ] diff --git a/llama_stack/distribution/registry/registry.py b/llama_stack/distribution/registry/registry.py index b4a5b626d9..313fb6d4e4 100644 --- a/llama_stack/distribution/registry/registry.py +++ b/llama_stack/distribution/registry/registry.py @@ -27,6 +27,12 @@ def get(name: str) -> TRegistry: raise 
ValueError(f"Dataset {name} not found.") return Registry._REGISTRY[name] + @staticmethod + def delete(name: str) -> None: + if name not in Registry._REGISTRY: + raise ValueError(f"Dataset {name} not found.") + del Registry._REGISTRY[name] + @staticmethod def reset() -> None: Registry._REGISTRY = {} From 9c501d042b0ca1f2a4bfd2848c1609af8bb46cb1 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 14:19:15 -0700 Subject: [PATCH 13/27] cleanup hardcoded dataset registry --- llama_stack/apis/datasets/client.py | 14 +++++++++-- llama_stack/apis/evals/client.py | 23 +++++++++++++++---- .../registry/datasets/__init__.py | 23 ------------------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/llama_stack/apis/datasets/client.py b/llama_stack/apis/datasets/client.py index 476a5964a1..e292b14d8c 100644 --- a/llama_stack/apis/datasets/client.py +++ b/llama_stack/apis/datasets/client.py @@ -26,7 +26,7 @@ def deserialize_dataset_def(j: Optional[Dict[str, Any]]) -> Optional[DatasetDef] raise ValueError(f"Unknown dataset type: {j['type']}") -class DatasetClient(Datasets): +class DatasetsClient(Datasets): def __init__(self, base_url: str): self.base_url = base_url @@ -104,7 +104,7 @@ async def list_dataset( async def run_main(host: str, port: int): - client = DatasetClient(f"http://{host}:{port}") + client = DatasetsClient(f"http://{host}:{port}") # register dataset response = await client.create_dataset( @@ -115,6 +115,16 @@ async def run_main(host: str, port: int): ) cprint(response, "green") + # register HF dataset + response = await client.create_dataset( + dataset_def=HuggingfaceDatasetDef( + identifier="hellaswag", + dataset_name="hellaswag", + kwargs={"split": "validation", "trust_remote_code": True}, + ) + ) + cprint(response, "green") + # get dataset get_dataset = await client.get_dataset( dataset_identifier="test-dataset", diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index b4d1c39fe7..d61de8c39b 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -12,6 +12,7 @@ from termcolor import cprint from .evals import * # noqa: F403 +from ..datasets.client import DatasetsClient class EvaluationClient(Evals): @@ -54,13 +55,31 @@ async def run_evals( async def run_main(host: str, port: int): client = EvaluationClient(f"http://{host}:{port}") + dataset_client = DatasetsClient(f"http://{host}:{port}") + # Custom Eval Task + + # 1. register custom dataset + response = await dataset_client.create_dataset( + dataset_def=CustomDatasetDef( + identifier="mmlu-simple-eval-en", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ), + ) + cprint(f"datasets/create: {response}", "cyan") + + # 2. 
run evals on the registered dataset response = await client.run_evals( model="Llama3.1-8B-Instruct", dataset="mmlu-simple-eval-en", task="mmlu", ) + if response.formatted_report: + cprint(response.formatted_report, "green") + else: + cprint(f"Response: {response}", "green") + # Eleuther Eval Task # response = await client.run_evals( # model="Llama3.1-8B-Instruct", @@ -70,10 +89,6 @@ async def run_main(host: str, port: int): # n_samples=2, # ), # ) - if response.formatted_report: - cprint(response.formatted_report, "green") - else: - cprint(f"Response: {response}", "green") def main(host: str, port: int): diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py index 384028b9e3..8164758120 100644 --- a/llama_stack/distribution/registry/datasets/__init__.py +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -3,32 +3,9 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. - -# TODO: make these import config based from llama_stack.apis.datasets import * # noqa: F403 from ..registry import Registry -from .dataset_wrappers import CustomDataset, HuggingfaceDataset class DatasetRegistry(Registry[BaseDataset]): _REGISTRY: Dict[str, BaseDataset] = {} - - -DATASETS_REGISTRY = [ - CustomDataset( - config=CustomDatasetDef( - identifier="mmlu-simple-eval-en", - url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", - ) - ), - HuggingfaceDataset( - config=HuggingfaceDatasetDef( - identifier="hellaswag", - dataset_name="hellaswag", - kwargs={"split": "validation", "trust_remote_code": True}, - ) - ), -] - -for d in DATASETS_REGISTRY: - DatasetRegistry.register(d.dataset_id, d) From c50686b6feadb1a15803a91b11650d9caf68514c Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 15:41:31 -0700 Subject: [PATCH 14/27] scorer registry --- .../distribution/registry/scorers/__init__.py | 10 ++++++ .../impls/meta_reference/evals/evals.py | 1 + .../evals/scorer/aggregate_scorer.py | 35 +++++++++++++++++++ .../evals/scorer/basic_scorers.py | 28 --------------- .../evals/tasks/run_eval_task.py | 13 ++++--- 5 files changed, 55 insertions(+), 32 deletions(-) create mode 100644 llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py index 3332b70527..084a620a74 100644 --- a/llama_stack/distribution/registry/scorers/__init__.py +++ b/llama_stack/distribution/registry/scorers/__init__.py @@ -5,9 +5,19 @@ # the root directory of this source tree. 
# TODO: make these import config based from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 from ..registry import Registry class ScorerRegistry(Registry[BaseScorer]): _REGISTRY: Dict[str, BaseScorer] = {} + + +SCORER_REGISTRY = { + "accuracy": AccuracyScorer, + "random": RandomScorer, +} + +for k, v in SCORER_REGISTRY.items(): + ScorerRegistry.register(k, v) diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 3ae988cbdc..1d703a27ce 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -53,6 +53,7 @@ async def run_eval_task( scoring_config=EvaluateScoringConfig( scorer_config_list=[ EvaluateSingleScorerConfig(scorer_name="accuracy"), + EvaluateSingleScorerConfig(scorer_name="random"), ] ), ) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py new file mode 100644 index 0000000000..1a0621960e --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult +from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 + + +class AggregateScorer(BaseScorer[ScorerInputSample]): + def __init__(self, scorers: List[BaseScorer[ScorerInputSample]]): + self.scorers = scorers + + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + all_score_data = {} + for scorer in self.scorers: + score_data = scorer.score_sample(scorer_input_sample).score_data + for k, v in score_data.items(): + all_score_data[k] = v + + return SingleEvalResult( + score_data=all_score_data, + ) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + all_metrics = {} + + for scorer in self.scorers: + metrics = scorer.aggregate_results(eval_results).metrics + for k, v in metrics.items(): + all_metrics[f"{scorer.__class__.__name__}:{k}"] = v + + return EvalResult( + metrics=all_metrics, + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py index 47d41c6d61..48d8caa3fa 100644 --- a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py @@ -9,34 +9,6 @@ from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 -class AggregateScorer(BaseScorer[ScorerInputSample]): - def __init__(self, scorers: List[BaseScorer[ScorerInputSample]]): - self.scorers = scorers - - def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: - all_score_data = {} - for scorer in self.scorers: - score_data = scorer.score_sample(scorer_input_sample).score_data - for k, v in score_data.items(): - all_score_data[k] = v - - return SingleEvalResult( - score_data=all_score_data, - ) - - def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: - all_metrics = {} - - for scorer in self.scorers: - metrics = 
scorer.aggregate_results(eval_results).metrics - for k, v in metrics.items(): - all_metrics[f"{scorer.__class__.__name__}:{k}"] = v - - return EvalResult( - metrics=all_metrics, - ) - - class RandomScorer(BaseScorer[ScorerInputSample]): def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: return SingleEvalResult(score_data={"random": random.random()}) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index fde2efdb08..48c4509141 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -4,6 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. from llama_stack.distribution.registry.datasets import DatasetRegistry +from llama_stack.distribution.registry.scorers import ScorerRegistry +from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import ( InferenceGenerator, @@ -59,11 +61,14 @@ async def run( cprint(postprocessed, "blue") # F3 - scorer + scorer_config_list = eval_task_config.scoring_config.scorer_config_list + scorer_list = [] + for s_conf in scorer_config_list: + scorer = ScorerRegistry.get(s_conf.scorer_name) + scorer_list.append(scorer()) + scorer = AggregateScorer( - scorers=[ - AccuracyScorer(), - RandomScorer(), - ] + scorers=scorer_list, ) scorer_results = scorer.score(postprocessed) From 95fd53d2921cec04e8c9e71fb49183ac29cf8071 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 16:09:55 -0700 Subject: [PATCH 15/27] registry refactor --- llama_stack/apis/datasets/datasets.py | 14 +++++-- llama_stack/apis/evals/evals.py | 12 +----- .../registry/datasets/__init__.py | 3 +- llama_stack/distribution/registry/registry.py | 38 +++++++++---------- .../distribution/registry/scorers/__init__.py | 5 +-- .../impls/meta_reference/evals/evals.py | 4 ++ .../evals/processor/mmlu_processor.py | 30 +-------------- .../evals/scorer/basic_scorers.py | 4 +- 8 files changed, 39 insertions(+), 71 deletions(-) diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 11a3f60964..0f4354c3fc 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -29,8 +29,7 @@ class GenerationOutput(BaseModel): @json_schema_type class PostprocessedGeneration(BaseModel): completion_message: str - # structured transformed output from raw_completion_message to compute scorer metrics - transformed_generation: Optional[Any] = None + logprobs: Optional[List[TokenLogProbs]] = None # A sample (row) from dataset @@ -70,8 +69,15 @@ class GenerationResponseSample(DatasetSample): @json_schema_type class ScorerInputSample(DatasetSample): - generation_output: PostprocessedGeneration - expected_output: Union[str, List[str]] + """ + A dataset is required to have the following columns to be used for scoring: + - generated_answer: str + - expected_answer: Union[str, List[str]] + """ + + generated_answer: str + expected_answer: Union[str, List[str]] + generation_output: Optional[PostprocessedGeneration] = None @json_schema_type diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py 
index af0b291e87..fb3aa6cd4d 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -217,18 +217,8 @@ def score( class BaseTask(ABC): - def __init__( - self, - generator_processor: Optional[BaseGeneratorProcessor] = None, - generator: Optional[BaseGenerator] = None, - scorer: Optional[BaseScorer] = None, - *args, - **kwargs - ) -> None: + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.generator_processor = generator_processor - self.generator = generator - self.scorer = scorer @abstractmethod async def run(self, *args, **kwargs) -> EvalResult: diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py index 8164758120..4474c8d7d8 100644 --- a/llama_stack/distribution/registry/datasets/__init__.py +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -7,5 +7,4 @@ from ..registry import Registry -class DatasetRegistry(Registry[BaseDataset]): - _REGISTRY: Dict[str, BaseDataset] = {} +DatasetRegistry = Registry[BaseDataset]() diff --git a/llama_stack/distribution/registry/registry.py b/llama_stack/distribution/registry/registry.py index 313fb6d4e4..702ed7d869 100644 --- a/llama_stack/distribution/registry/registry.py +++ b/llama_stack/distribution/registry/registry.py @@ -3,36 +3,34 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import AbstractSet, Dict, Generic, TypeVar +from typing import AbstractSet, Generic, TypeVar TRegistry = TypeVar("TRegistry") class Registry(Generic[TRegistry]): - _REGISTRY: Dict[str, TRegistry] = {} - @staticmethod - def names() -> AbstractSet[str]: - return Registry._REGISTRY.keys() + def __init__(self) -> None: + super().__init__() + self.registry = {} - @staticmethod - def register(name: str, task: TRegistry) -> None: - if name in Registry._REGISTRY: + def names(self) -> AbstractSet[str]: + return self.registry.keys() + + def register(self, name: str, task: TRegistry) -> None: + if name in self.registry: raise ValueError(f"Dataset {name} already exists.") - Registry._REGISTRY[name] = task + self.registry[name] = task - @staticmethod - def get(name: str) -> TRegistry: - if name not in Registry._REGISTRY: + def get(self, name: str) -> TRegistry: + if name not in self.registry: raise ValueError(f"Dataset {name} not found.") - return Registry._REGISTRY[name] + return self.registry[name] - @staticmethod - def delete(name: str) -> None: - if name not in Registry._REGISTRY: + def delete(self, name: str) -> None: + if name not in self.registry: raise ValueError(f"Dataset {name} not found.") - del Registry._REGISTRY[name] + del self.registry[name] - @staticmethod - def reset() -> None: - Registry._REGISTRY = {} + def reset(self) -> None: + self.registry = {} diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py index 084a620a74..dedf32ac3a 100644 --- a/llama_stack/distribution/registry/scorers/__init__.py +++ b/llama_stack/distribution/registry/scorers/__init__.py @@ -9,10 +9,7 @@ from ..registry import Registry - -class ScorerRegistry(Registry[BaseScorer]): - _REGISTRY: Dict[str, BaseScorer] = {} - +ScorerRegistry = Registry[BaseScorer]() SCORER_REGISTRY = { "accuracy": AccuracyScorer, diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 1d703a27ce..abd1938ada 100644 --- 
a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -71,6 +71,10 @@ async def run_scorer( dataset_config: EvaluateDatasetConfig, eval_scoring_config: EvaluateScoringConfig, ) -> EvaluateResponse: + cprint("run_scorer") + + # main logic, we need to convert the datset into List[ScorerInputSample] + return EvaluateResponse( eval_result={}, ) diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py index 83460bb0c5..fc2d9eb642 100644 --- a/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py +++ b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py @@ -153,35 +153,9 @@ def postprocess_sample( break return ScorerInputSample( + generated_answer=extracted_answer, + expected_answer=dataset_sample.data["Answer"], generation_output=PostprocessedGeneration( completion_message=response_text, - transformed_generation=extracted_answer, ), - expected_output=dataset_sample.data["Answer"], ) - - # def score_sample(self, sample: ProcessedDictSample) -> SingleEvalResult: - # postprocessed_output = sample.postprocessed["postprocessed"] - # expected_answer = sample.data["Answer"] - - # extracted_answer = None - # for answer_regex in MULTILINGUAL_ANSWER_REGEXES: - # regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) - # match = re.search(regex, postprocessed_output) - # if match: - # extracted_answer = normalize_extracted_answer(match.group(1)) - # break - - # score = 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 - - # return SingleEvalResult( - # score_data={ - # "score": score, - # }, - # ) - - # def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: - # print("aggregate_results", eval_results) - # sum_score = sum([result.score_data["score"] for result in eval_results]) - - # return EvalResult(metrics={"score": str(sum_score / len(eval_results))}) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py index 48d8caa3fa..6099353a87 100644 --- a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py @@ -28,8 +28,8 @@ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: class AccuracyScorer(BaseScorer[ScorerInputSample]): def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: - extracted_answer = scorer_input_sample.generation_output.transformed_generation - expected_answer = scorer_input_sample.expected_output + extracted_answer = scorer_input_sample.generated_answer + expected_answer = scorer_input_sample.expected_answer accuracy = ( 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 From a22c31b8a4329948d22154307e769cfb56fc870d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 16:25:06 -0700 Subject: [PATCH 16/27] processor registry --- llama_stack/apis/evals/evals.py | 13 +++++++++---- .../registry/generator_processors/__init__.py | 11 +++++++++-- .../providers/impls/meta_reference/evals/evals.py | 3 +++ .../meta_reference/evals/processor/__init__.py | 1 + .../impls/meta_reference/evals/scorer/__init__.py | 2 ++ .../meta_reference/evals/tasks/run_eval_task.py | 13 +++++++++---- 6 files changed, 33 insertions(+), 10 
deletions(-) diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index fb3aa6cd4d..ea985ad3b2 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -89,15 +89,21 @@ class EvaluatePostprocessConfig(BaseModel): kwargs: Optional[Dict[str, Any]] = None +@json_schema_type +class EvaluateProcessorConfig(BaseModel): + processor_identifier: str + preprocess_config: Optional[EvaluatePreprocessConfig] = None + postprocess_config: Optional[EvaluatePostprocessConfig] = None + + @json_schema_type class EvaluateJudgeScoringConfig(BaseModel): ... @json_schema_type class LLMJudgeConfig(BaseModel): - judge_preprocess_config: EvaluatePreprocessConfig + judge_processor_config: EvaluateProcessorConfig judge_model_generation_config: EvaluateModelGenerationConfig - judge_postprocess_config: EvaluatePostprocessConfig judge_scoring_config: EvaluateJudgeScoringConfig @@ -116,9 +122,8 @@ class EvaluateScoringConfig(BaseModel): @json_schema_type class EvaluateTaskConfig(BaseModel): dataset_config: EvaluateDatasetConfig - preprocess_config: Optional[EvaluatePreprocessConfig] = None + processor_config: EvaluateProcessorConfig generation_config: EvaluateModelGenerationConfig - postprocess_config: Optional[EvaluatePostprocessConfig] = None scoring_config: EvaluateScoringConfig diff --git a/llama_stack/distribution/registry/generator_processors/__init__.py b/llama_stack/distribution/registry/generator_processors/__init__.py index bb9d5c1824..44972cf03e 100644 --- a/llama_stack/distribution/registry/generator_processors/__init__.py +++ b/llama_stack/distribution/registry/generator_processors/__init__.py @@ -4,9 +4,16 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.processor import * # noqa: F403 from ..registry import Registry +# TODO: decide whether we should group dataset+processor together via Tasks +GeneratorProcessorRegistry = Registry[BaseGeneratorProcessor]() -class GeneratorProcessorRegistry(Registry[BaseGeneratorProcessor]): - _REGISTRY: Dict[str, BaseGeneratorProcessor] = {} +PROCESSOR_REGISTRY = { + "mmlu": MMLUProcessor, +} + +for k, v in PROCESSOR_REGISTRY.items(): + GeneratorProcessorRegistry.register(k, v) diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index abd1938ada..80bf2dd7ad 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -47,6 +47,9 @@ async def run_eval_task( dataset_name=dataset, row_limit=3, ), + processor_config=EvaluateProcessorConfig( + processor_identifier="mmlu", + ), generation_config=EvaluateModelGenerationConfig( model=model, ), diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py index 756f351d88..f782f9320a 100644 --- a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py +++ b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py @@ -3,3 +3,4 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
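# ---------------------------------------------------------------------------
# Editorial sketch (not part of this patch): how a processor registered under
# "mmlu" is resolved at eval time, mirroring run_eval_task.py in this commit.
# `dataset` is assumed to be a loaded dataset from DatasetRegistry; the
# postprocessing step is described only in comments since its exact call shape
# is an assumption here.
# ---------------------------------------------------------------------------
# processor_cls = GeneratorProcessorRegistry.get("mmlu")
# processor = processor_cls()
# preprocessed = processor.preprocess(dataset)     # dataset rows -> generation inputs
# ...generation then runs over `preprocessed` (e.g. via InferenceGenerator),
# after which the processor's postprocessing turns raw completions into a
# List[ScorerInputSample] for scoring (see run_eval_task.py below).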
+from .mmlu_processor import MMLUProcessor # noqa: F401 diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py index 756f351d88..6424963f87 100644 --- a/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py @@ -3,3 +3,5 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from .basic_scorers import * # noqa: F401 F403 +from .aggregate_scorer import * # noqa: F401 F403 diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index 48c4509141..83f6264c0d 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -4,15 +4,17 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. from llama_stack.distribution.registry.datasets import DatasetRegistry +from llama_stack.distribution.registry.generator_processors import ( + GeneratorProcessorRegistry, +) from llama_stack.distribution.registry.scorers import ScorerRegistry + from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import ( InferenceGenerator, ) -from llama_stack.providers.impls.meta_reference.evals.processor.mmlu_processor import ( - MMLUProcessor, -) + from llama_stack.apis.evals import * # noqa: F403 from llama_stack.apis.inference import * # noqa: F403 @@ -46,7 +48,10 @@ async def run( print(f"Running on {len(dataset)} samples") # F1 - processor = MMLUProcessor() + print(GeneratorProcessorRegistry.names()) + processor = GeneratorProcessorRegistry.get( + eval_task_config.processor_config.processor_identifier + )() preprocessed = processor.preprocess(dataset) # Generation From fcb8dea1ef8ba17b0f3df12e85f68d8f5a9e4b16 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 17:46:29 -0700 Subject: [PATCH 17/27] scorer only api --- llama_stack/apis/datasets/datasets.py | 10 ++- llama_stack/apis/evals/client.py | 80 +++++++++++++++---- llama_stack/apis/evals/evals.py | 2 +- .../registry/datasets/dataset_wrappers.py | 15 +++- .../impls/meta_reference/evals/evals.py | 9 ++- .../evals/scorer/basic_scorers.py | 11 ++- .../evals/tasks/run_eval_task.py | 4 +- .../evals/tasks/run_scoring_task.py | 80 +++++++++++++++++++ 8 files changed, 184 insertions(+), 27 deletions(-) create mode 100644 llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 0f4354c3fc..2b54ac8f66 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -92,8 +92,14 @@ class HuggingfaceDatasetDef(BaseModel): identifier: str = Field( description="A unique name for the dataset", ) - dataset_name: str = Field( - description="The name of the dataset into HF (e.g. hellawag)", + dataset_path: str = Field( + description="The name of the dataset into HF (e.g. 
meta-llama/Llama-3.1-8B-Instruct-evals)", + ) + dataset_name: Optional[str] = Field( + description="The name of the dataset into HF (e.g. Llama-3.1-8B-Instruct-evals__ifeval__strict__details)", + ) + rename_columns_map: Optional[Dict[str, str]] = Field( + description="A map of column names to rename to fit the schema of eval dataset for scoring", ) kwargs: Dict[str, Any] = Field( description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)", diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index d61de8c39b..e7c5a475df 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -51,34 +51,84 @@ async def run_evals( response.raise_for_status() return EvaluateResponse(**response.json()) + async def run_scorer( + self, + dataset_config: EvaluateDatasetConfig, + eval_scoring_config: EvaluateScoringConfig, + ) -> EvaluateResponse: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/evals/run_scorer", + json={ + "dataset_config": json.loads(dataset_config.json()), + "eval_scoring_config": json.loads(eval_scoring_config.json()), + }, + headers={"Content-Type": "application/json"}, + timeout=3600, + ) + response.raise_for_status() + return EvaluateResponse(**response.json()) + async def run_main(host: str, port: int): client = EvaluationClient(f"http://{host}:{port}") dataset_client = DatasetsClient(f"http://{host}:{port}") - # Custom Eval Task + # Full Eval Task - # 1. register custom dataset + # # 1. register custom dataset + # response = await dataset_client.create_dataset( + # dataset_def=CustomDatasetDef( + # identifier="mmlu-simple-eval-en", + # url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + # ), + # ) + # cprint(f"datasets/create: {response}", "cyan") + + # # 2. run evals on the registered dataset + # response = await client.run_evals( + # model="Llama3.1-8B-Instruct", + # dataset="mmlu-simple-eval-en", + # task="mmlu", + # ) + + # if response.formatted_report: + # cprint(response.formatted_report, "green") + # else: + # cprint(f"Response: {response}", "green") + + # Scoring Task + + # 1. register huggingface dataset response = await dataset_client.create_dataset( - dataset_def=CustomDatasetDef( - identifier="mmlu-simple-eval-en", - url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", - ), + dataset_def=HuggingfaceDatasetDef( + identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals", + dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + rename_columns_map={ + "output_parsed_answer": "generated_answer", + "input_correct_responses": "expected_answer", + }, + kwargs={"split": "latest"}, + ) ) - cprint(f"datasets/create: {response}", "cyan") + cprint(response, "cyan") # 2. 
run evals on the registered dataset - response = await client.run_evals( - model="Llama3.1-8B-Instruct", - dataset="mmlu-simple-eval-en", - task="mmlu", + response = await client.run_scorer( + dataset_config=EvaluateDatasetConfig( + dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + row_limit=10, + ), + eval_scoring_config=EvaluateScoringConfig( + scorer_config_list=[ + EvaluateSingleScorerConfig(scorer_name="accuracy"), + ] + ), ) - if response.formatted_report: - cprint(response.formatted_report, "green") - else: - cprint(f"Response: {response}", "green") + cprint(response, "green") # Eleuther Eval Task # response = await client.run_evals( diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index ea985ad3b2..6a3ed8ce28 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -66,7 +66,7 @@ class EvaluationJobCreateResponse(BaseModel): @json_schema_type class EvaluateDatasetConfig(BaseModel): # identifier to previously registered dataset via DatasetDef - dataset_name: str + dataset_identifier: str # limit number of rows to evaluate row_limit: Optional[int] = None kwargs: Optional[Dict[str, Any]] = None diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py index e18165a110..88a487d602 100644 --- a/llama_stack/distribution/registry/datasets/dataset_wrappers.py +++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py @@ -72,7 +72,18 @@ def __len__(self): self.load() return len(self.dataset) - def load(self): + def load(self, n_samples: Optional[int] = None): if self.dataset: return - self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs) + + if self.config.dataset_name: + self.config.kwargs["name"] = self.config.dataset_name + + self.dataset = load_dataset(self.config.dataset_path, **self.config.kwargs) + + if n_samples: + self.dataset = self.dataset.select(range(n_samples)) + + if self.config.rename_columns_map: + for k, v in self.config.rename_columns_map.items(): + self.dataset = self.dataset.rename_column(k, v) diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 80bf2dd7ad..916e40e3ac 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -13,6 +13,7 @@ from .config import MetaReferenceEvalsImplConfig from .tasks.run_eval_task import RunEvalTask +from .tasks.run_scoring_task import RunScoringTask class MetaReferenceEvalsImpl(Evals): @@ -44,7 +45,7 @@ async def run_eval_task( # construct eval task config from inputs eval_task_config = EvaluateTaskConfig( dataset_config=EvaluateDatasetConfig( - dataset_name=dataset, + dataset_identifier=dataset, row_limit=3, ), processor_config=EvaluateProcessorConfig( @@ -76,8 +77,10 @@ async def run_scorer( ) -> EvaluateResponse: cprint("run_scorer") - # main logic, we need to convert the datset into List[ScorerInputSample] + run_task = RunScoringTask() + eval_result = await run_task.run(dataset_config, eval_scoring_config) return EvaluateResponse( - eval_result={}, + eval_result=eval_result, + formatted_report=json.dumps(eval_result.json(), indent=4), ) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py index 6099353a87..748f9fc1f8 100644 --- 
a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py @@ -31,9 +31,14 @@ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResu extracted_answer = scorer_input_sample.generated_answer expected_answer = scorer_input_sample.expected_answer - accuracy = ( - 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 - ) + if isinstance(expected_answer, list): + accuracy = ( + 1.0 if extracted_answer and extracted_answer in expected_answer else 0.0 + ) + else: + accuracy = ( + 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 + ) return SingleEvalResult(score_data={"accuracy": accuracy}) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index 83f6264c0d..bcd842c420 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -43,7 +43,9 @@ async def run( print(f"Running eval task w/ {eval_task_config}") print(DatasetRegistry.names()) - dataset = DatasetRegistry.get(eval_task_config.dataset_config.dataset_name) + dataset = DatasetRegistry.get( + eval_task_config.dataset_config.dataset_identifier + ) dataset.load(n_samples=eval_task_config.dataset_config.row_limit) print(f"Running on {len(dataset)} samples") diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py new file mode 100644 index 0000000000..f856debe95 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py @@ -0,0 +1,80 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
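# ---------------------------------------------------------------------------
# Editorial note (illustration only, not part of this patch): scoring-only runs
# assume the registered dataset already contains model outputs. Per the
# ScorerInputSample schema in this series, a minimal row looks roughly like:
#
#   {
#       "input_query": "What is the capital of France?",   # optional
#       "generated_answer": "Paris",
#       "expected_answer": "Paris",
#   }
#
# The concrete values are invented; only the column names come from the series.
# ---------------------------------------------------------------------------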
+from llama_stack.distribution.registry.datasets import DatasetRegistry +from llama_stack.distribution.registry.scorers import ScorerRegistry + +from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 + +from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.inference import * # noqa: F403 +from termcolor import cprint + + +class RunScoringTask(BaseTask): + """ + RunScoringTask - only run scoring (F3) based on dataset and scoring config + """ + + def __init__( + self, + *args, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + + def transform_score_input_sample( + self, dataset: BaseDataset + ) -> List[ScorerInputSample]: + scorer_inputs = [] + for x in dataset: + expected_answer = x.data["expected_answer"] + generated_answer = x.data["generated_answer"] + + scorer_inputs.append( + ScorerInputSample( + expected_answer=expected_answer, + generated_answer=generated_answer, + ) + ) + + return scorer_inputs + + async def run( + self, + dataset_config: EvaluateDatasetConfig, + eval_scoring_config: EvaluateScoringConfig, + *args, + **kwargs, + ) -> EvalResult: + print( + f"Running scoring task w/ dataset={dataset_config} scoring={eval_scoring_config}" + ) + + dataset = DatasetRegistry.get(dataset_config.dataset_identifier) + dataset.load(n_samples=dataset_config.row_limit) + print(f"Running on {len(dataset)} samples") + + # transform dataset into + postprocessed = self.transform_score_input_sample(dataset) + cprint(postprocessed, "blue") + + # F3 - scorer + scorer_config_list = eval_scoring_config.scorer_config_list + scorer_list = [] + for s_conf in scorer_config_list: + scorer = ScorerRegistry.get(s_conf.scorer_name) + scorer_list.append(scorer()) + + scorer = AggregateScorer( + scorers=scorer_list, + ) + + scorer_results = scorer.score(postprocessed) + cprint(scorer_results, "magenta") + eval_result = scorer.aggregate_results(scorer_results) + + return eval_result From c8f6849291eb7db098930fc81e462e8956688a44 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 20:42:22 -0700 Subject: [PATCH 18/27] full accuracy --- llama_stack/apis/evals/client.py | 6 +++--- llama_stack/apis/evals/evals.py | 12 ------------ 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index e7c5a475df..1db7afac19 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -99,7 +99,6 @@ async def run_main(host: str, port: int): # cprint(f"Response: {response}", "green") # Scoring Task - # 1. 
register huggingface dataset response = await dataset_client.create_dataset( dataset_def=HuggingfaceDatasetDef( @@ -119,7 +118,7 @@ async def run_main(host: str, port: int): response = await client.run_scorer( dataset_config=EvaluateDatasetConfig( dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", - row_limit=10, + # row_limit=10, ), eval_scoring_config=EvaluateScoringConfig( scorer_config_list=[ @@ -128,7 +127,8 @@ async def run_main(host: str, port: int): ), ) - cprint(response, "green") + for k, v in response.eval_result.metrics.items(): + cprint(f"{k}: {v}", "green") # Eleuther Eval Task # response = await client.run_evals( diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 6a3ed8ce28..a02394ee40 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -49,13 +49,6 @@ class EvaluationJobStatusResponse(BaseModel): job_uuid: str -@json_schema_type -class EvaluationJobArtifactsResponse(BaseModel): - """Artifacts of a evaluation job.""" - - job_uuid: str - - @json_schema_type class EvaluationJobCreateResponse(BaseModel): """Response to create a evaluation job.""" @@ -267,8 +260,3 @@ async def run_scorer( # @webmethod(route="/evals/job/cancel") # def cancel_evaluation_job(self, job_uuid: str) -> None: ... - - # @webmethod(route="/evals/job/artifacts") - # def get_evaluation_job_artifacts( - # self, job_uuid: str - # ) -> EvaluationJobArtifactsResponse: ... From 7b5895003ab2c6feed29e3e960e400b9cc0ab15d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 21:09:59 -0700 Subject: [PATCH 19/27] braintrust scorer --- llama_stack/apis/datasets/datasets.py | 3 + .../distribution/registry/scorers/__init__.py | 4 ++ .../evals/scorer/braintrust_scorer.py | 57 +++++++++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 2b54ac8f66..ee270b2910 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -73,10 +73,13 @@ class ScorerInputSample(DatasetSample): A dataset is required to have the following columns to be used for scoring: - generated_answer: str - expected_answer: Union[str, List[str]] + - (optional) input_query: str + - (optional) generation_output: PostprocessedGeneration """ generated_answer: str expected_answer: Union[str, List[str]] + input_query: Optional[str] = None generation_output: Optional[PostprocessedGeneration] = None diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py index dedf32ac3a..60e03b2fef 100644 --- a/llama_stack/distribution/registry/scorers/__init__.py +++ b/llama_stack/distribution/registry/scorers/__init__.py @@ -6,14 +6,18 @@ # TODO: make these import config based from llama_stack.apis.evals import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.braintrust_scorer import * # noqa: F403 from ..registry import Registry +# TODO: make these import config based ScorerRegistry = Registry[BaseScorer]() SCORER_REGISTRY = { "accuracy": AccuracyScorer, "random": RandomScorer, + "braintrust::factuality": BrainTrustFactualityScorer, + "braintrust::answer-correctness": BraintrustAnswerCorrectnessScorer, } for k, v in SCORER_REGISTRY.items(): diff --git 
a/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py new file mode 100644 index 0000000000..5dd4eb383e --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py @@ -0,0 +1,57 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import numpy as np + +from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult +from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 +from autoevals.llm import * # noqa: F403 +from autoevals.ragas import * # noqa: F403 + + +class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]): + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + input_query = scorer_input_sample.input_query + extracted_answer = scorer_input_sample.generated_answer + expected_answer = scorer_input_sample.expected_answer + + evaluator = Factuality() + result = evaluator(output, expected, input=input_query) + factuality = result.score + return SingleEvalResult(score_data={"factuality": factuality}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + avg_score = np.average( + [result.score_data["factuality"] for result in eval_results] + ) + + return EvalResult( + metrics={ + "avg_factuality_score": avg_score, + } + ) + + +class BraintrustAnswerCorrectnessScorer(BaseScorer[ScorerInputSample]): + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + input_query = scorer_input_sample.input_query + extracted_answer = scorer_input_sample.generated_answer + expected_answer = scorer_input_sample.expected_answer + + evaluator = AnswerCorrectness() + result = evaluator(output, expected, input=input_query) + correctness = result.score + return SingleEvalResult(score_data={"answer_correctness": correctness}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + avg_score = np.average( + [result.score_data["answer_correctness"] for result in eval_results] + ) + + return EvalResult( + metrics={ + "avg_correctness_score": avg_score, + } + ) From 3c29108b6ed107b41e1c887f9276ebad95f267be Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 21:17:16 -0700 Subject: [PATCH 20/27] input query optional input for braintrust scorer --- llama_stack/apis/evals/client.py | 2 +- llama_stack/distribution/registry/scorers/__init__.py | 2 +- .../impls/meta_reference/evals/tasks/run_scoring_task.py | 5 ++++- llama_stack/providers/registry/evals.py | 2 ++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index 1db7afac19..b795477132 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -118,7 +118,7 @@ async def run_main(host: str, port: int): response = await client.run_scorer( dataset_config=EvaluateDatasetConfig( dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", - # row_limit=10, + row_limit=10, ), eval_scoring_config=EvaluateScoringConfig( scorer_config_list=[ diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py index 60e03b2fef..7cbe2a4262 100644 --- a/llama_stack/distribution/registry/scorers/__init__.py +++ 
b/llama_stack/distribution/registry/scorers/__init__.py @@ -16,7 +16,7 @@ SCORER_REGISTRY = { "accuracy": AccuracyScorer, "random": RandomScorer, - "braintrust::factuality": BrainTrustFactualityScorer, + "braintrust::factuality": BraintrustFactualityScorer, "braintrust::answer-correctness": BraintrustAnswerCorrectnessScorer, } diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py index f856debe95..9e4821a73b 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py @@ -33,11 +33,15 @@ def transform_score_input_sample( for x in dataset: expected_answer = x.data["expected_answer"] generated_answer = x.data["generated_answer"] + input_query = None + if "input_query" in x.data: + input_query = x.data["input_query"] scorer_inputs.append( ScorerInputSample( expected_answer=expected_answer, generated_answer=generated_answer, + input_query=input_query, ) ) @@ -74,7 +78,6 @@ async def run( ) scorer_results = scorer.score(postprocessed) - cprint(scorer_results, "magenta") eval_result = scorer.aggregate_results(scorer_results) return eval_result diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py index 8693ec603a..6ea4c16f56 100644 --- a/llama_stack/providers/registry/evals.py +++ b/llama_stack/providers/registry/evals.py @@ -20,6 +20,8 @@ def available_providers() -> List[ProviderSpec]: "pandas", "scikit-learn", "datasets", + "numpy", + "autoevals", ], module="llama_stack.providers.impls.meta_reference.evals", config_class="llama_stack.providers.impls.meta_reference.evals.MetaReferenceEvalsImplConfig", From ec6c63ba5713533991080026de917afa24bd1284 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 23:36:15 -0700 Subject: [PATCH 21/27] dataset accept file uploads --- llama_stack/apis/evals/client.py | 37 ++++++++++++++++--- .../registry/datasets/dataset_wrappers.py | 25 ++++++++++++- .../evals/tasks/run_scoring_task.py | 4 +- llama_stack/providers/registry/evals.py | 1 + 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index b795477132..7d812817b4 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -12,9 +12,28 @@ from termcolor import cprint from .evals import * # noqa: F403 +import base64 +import mimetypes +import os + from ..datasets.client import DatasetsClient +def data_url_from_file(file_path: str) -> str: + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + with open(file_path, "rb") as file: + file_content = file.read() + + base64_content = base64.b64encode(file_content).decode("utf-8") + mime_type, _ = mimetypes.guess_type(file_path) + + data_url = f"data:{mime_type};base64,{base64_content}" + + return data_url + + class EvaluationClient(Evals): def __init__(self, base_url: str): self.base_url = base_url @@ -70,9 +89,8 @@ async def run_scorer( return EvaluateResponse(**response.json()) -async def run_main(host: str, port: int): +async def run_main(host: str, port: int, eval_dataset_path: str = ""): client = EvaluationClient(f"http://{host}:{port}") - dataset_client = DatasetsClient(f"http://{host}:{port}") # Full Eval Task @@ -114,10 +132,19 @@ async def run_main(host: str, port: int): ) cprint(response, "cyan") + response = await dataset_client.create_dataset( 
+ dataset_def=CustomDatasetDef( + identifier="rag-evals", + url=data_url_from_file(eval_dataset_path), + ) + ) + cprint(response, "cyan") + # 2. run evals on the registered dataset response = await client.run_scorer( dataset_config=EvaluateDatasetConfig( - dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + dataset_identifier="rag-evals", + # dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", row_limit=10, ), eval_scoring_config=EvaluateScoringConfig( @@ -141,8 +168,8 @@ async def run_main(host: str, port: int): # ) -def main(host: str, port: int): - asyncio.run(run_main(host, port)) +def main(host: str, port: int, eval_dataset_path: str = ""): + asyncio.run(run_main(host, port, eval_dataset_path)) if __name__ == "__main__": diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py index 88a487d602..410ad394a3 100644 --- a/llama_stack/distribution/registry/datasets/dataset_wrappers.py +++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py @@ -3,10 +3,13 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import io + import pandas from datasets import Dataset, load_dataset from llama_stack.apis.datasets import * # noqa: F403 +from llama_stack.providers.utils.memory.vector_store import parse_data_url class CustomDataset(BaseDataset[DictSample]): @@ -37,11 +40,31 @@ def load(self, n_samples: Optional[int] = None) -> None: if self.dataset: return - # TODO: better support w/ data url + # TODO: more robust support w/ data url if self.config.url.endswith(".csv"): df = pandas.read_csv(self.config.url) elif self.config.url.endswith(".xlsx"): df = pandas.read_excel(self.config.url) + elif self.config.url.startswith("data:"): + parts = parse_data_url(self.config.url) + data = parts["data"] + if parts["is_base64"]: + data = base64.b64decode(data) + else: + data = unquote(data) + encoding = parts["encoding"] or "utf-8" + data = data.encode(encoding) + + mime_type = parts["mimetype"] + mime_category = mime_type.split("/")[0] + data_bytes = io.BytesIO(data) + + if mime_category == "text": + df = pandas.read_csv(data_bytes) + else: + df = pandas.read_excel(data_bytes) + else: + raise ValueError(f"Unsupported file type: {self.config.url}") if n_samples is not None: df = df.sample(n=n_samples) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py index 9e4821a73b..9ff6cde4d6 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py @@ -11,7 +11,6 @@ from llama_stack.apis.evals import * # noqa: F403 from llama_stack.apis.inference import * # noqa: F403 -from termcolor import cprint class RunScoringTask(BaseTask): @@ -62,9 +61,8 @@ async def run( dataset.load(n_samples=dataset_config.row_limit) print(f"Running on {len(dataset)} samples") - # transform dataset into + # transform dataset into List[ScorerInputSample] postprocessed = self.transform_score_input_sample(dataset) - cprint(postprocessed, "blue") # F3 - scorer scorer_config_list = eval_scoring_config.scorer_config_list diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py index 6ea4c16f56..a8a7e735ff 100644 --- a/llama_stack/providers/registry/evals.py +++ 
b/llama_stack/providers/registry/evals.py @@ -22,6 +22,7 @@ def available_providers() -> List[ProviderSpec]: "datasets", "numpy", "autoevals", + "openpyxl", ], module="llama_stack.providers.impls.meta_reference.evals", config_class="llama_stack.providers.impls.meta_reference.evals.MetaReferenceEvalsImplConfig", From 9cc0a54f0be8c31061ff0ed19e866bcb5fb7bdbc Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 15 Oct 2024 00:42:03 -0700 Subject: [PATCH 22/27] rag correctness scorer w/ custom dataset --- llama_stack/apis/datasets/datasets.py | 5 +++++ llama_stack/apis/evals/client.py | 6 ++++++ .../distribution/registry/datasets/dataset_wrappers.py | 3 +++ .../meta_reference/evals/scorer/braintrust_scorer.py | 8 ++++---- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index ee270b2910..c0aa4d161e 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -103,6 +103,7 @@ class HuggingfaceDatasetDef(BaseModel): ) rename_columns_map: Optional[Dict[str, str]] = Field( description="A map of column names to rename to fit the schema of eval dataset for scoring", + default=None, ) kwargs: Dict[str, Any] = Field( description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)", @@ -119,6 +120,10 @@ class CustomDatasetDef(BaseModel): url: str = Field( description="The URL to the dataset", ) + rename_columns_map: Optional[Dict[str, str]] = Field( + description="A map of column names to rename to fit the schema of eval dataset for scoring", + default=None, + ) DatasetDef = Annotated[ diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index 7d812817b4..07877c13e5 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -136,6 +136,9 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""): dataset_def=CustomDatasetDef( identifier="rag-evals", url=data_url_from_file(eval_dataset_path), + rename_columns_map={ + "query": "input_query", + }, ) ) cprint(response, "cyan") @@ -150,6 +153,9 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""): eval_scoring_config=EvaluateScoringConfig( scorer_config_list=[ EvaluateSingleScorerConfig(scorer_name="accuracy"), + EvaluateSingleScorerConfig( + scorer_name="braintrust::answer-correctness" + ), ] ), ) diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py index 410ad394a3..93cbd9ab21 100644 --- a/llama_stack/distribution/registry/datasets/dataset_wrappers.py +++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py @@ -70,6 +70,9 @@ def load(self, n_samples: Optional[int] = None) -> None: df = df.sample(n=n_samples) self.dataset = Dataset.from_pandas(df) + if self.config.rename_columns_map: + for k, v in self.config.rename_columns_map.items(): + self.dataset = self.dataset.rename_column(k, v) class HuggingfaceDataset(BaseDataset[DictSample]): diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py index 5dd4eb383e..c124aaad6a 100644 --- a/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py @@ -14,11 +14,11 @@ class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]): def score_sample(self, 
scorer_input_sample: ScorerInputSample) -> SingleEvalResult: input_query = scorer_input_sample.input_query - extracted_answer = scorer_input_sample.generated_answer + generated_answer = scorer_input_sample.generated_answer expected_answer = scorer_input_sample.expected_answer evaluator = Factuality() - result = evaluator(output, expected, input=input_query) + result = evaluator(generated_answer, expected_answer, input=input_query) factuality = result.score return SingleEvalResult(score_data={"factuality": factuality}) @@ -37,11 +37,11 @@ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: class BraintrustAnswerCorrectnessScorer(BaseScorer[ScorerInputSample]): def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: input_query = scorer_input_sample.input_query - extracted_answer = scorer_input_sample.generated_answer + generated_answer = scorer_input_sample.generated_answer expected_answer = scorer_input_sample.expected_answer evaluator = AnswerCorrectness() - result = evaluator(output, expected, input=input_query) + result = evaluator(generated_answer, expected_answer, input=input_query) correctness = result.score return SingleEvalResult(score_data={"answer_correctness": correctness}) From d2b62157a3ef7aa6461e5c1857924578890355d9 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 15 Oct 2024 00:44:54 -0700 Subject: [PATCH 23/27] openapi gen --- docs/openapi_generator/generate.py | 4 +- docs/resources/llama-stack-spec.html | 1771 +++++++++-------- docs/resources/llama-stack-spec.yaml | 594 +++--- llama_stack/apis/datasets/datasets.py | 20 + .../apis/post_training/post_training.py | 2 +- 5 files changed, 1370 insertions(+), 1021 deletions(-) diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py index 871c01a80f..994b06e583 100644 --- a/docs/openapi_generator/generate.py +++ b/docs/openapi_generator/generate.py @@ -33,7 +33,7 @@ from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.agents import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 from llama_stack.apis.evals import * # noqa: F403 from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.batch_inference import * # noqa: F403 @@ -61,7 +61,7 @@ class LlamaStack( Telemetry, PostTraining, Memory, - Evaluations, + Evals, Models, Shields, Inspect, diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index 96ef7e4bb0..ac75dbf049 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "0.0.1", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-09 21:10:09.073430" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. 
The specification is still in draft and subject to change.\n Generated at 2024-10-15 00:44:26.278642" }, "servers": [ { @@ -109,39 +109,6 @@ } } }, - "/evaluate/job/cancel": { - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Evaluations" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CancelEvaluationJobRequest" - } - } - }, - "required": true - } - } - }, "/post_training/job/cancel": { "post": { "responses": { @@ -355,7 +322,7 @@ "200": { "description": "OK", "content": { - "application/json": { + "text/event-stream": { "schema": { "$ref": "#/components/schemas/AgentTurnResponseStreamChunk" } @@ -393,7 +360,14 @@ "post": { "responses": { "200": { - "description": "OK" + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateDatasetResponse" + } + } + } } }, "tags": [ @@ -489,119 +463,6 @@ } }, "/datasets/delete": { - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Datasets" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/DeleteDatasetRequest" - } - } - }, - "required": true - } - } - }, - "/inference/embeddings": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EmbeddingsResponse" - } - } - } - } - }, - "tags": [ - "Inference" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EmbeddingsRequest" - } - } - }, - "required": true - } - } - }, - "/evaluate/question_answering/": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluationJob" - } - } - } - } - }, - "tags": [ - "Evaluations" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateQuestionAnsweringRequest" - } - } - }, - "required": true - } - } - }, - "/evaluate/summarization/": { "post": { "responses": { "200": { @@ -609,14 +470,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJob" + "$ref": "#/components/schemas/DeleteDatasetResponse" } } } } }, "tags": [ - "Evaluations" + "Datasets" ], "parameters": [ { @@ -633,7 +494,7 @@ "content": { "application/json": { "schema": { - "$ref": 
"#/components/schemas/EvaluateSummarizationRequest" + "$ref": "#/components/schemas/DeleteDatasetRequest" } } }, @@ -641,7 +502,7 @@ } } }, - "/evaluate/text_generation/": { + "/inference/embeddings": { "post": { "responses": { "200": { @@ -649,14 +510,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJob" + "$ref": "#/components/schemas/EmbeddingsResponse" } } } } }, "tags": [ - "Evaluations" + "Inference" ], "parameters": [ { @@ -673,7 +534,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluateTextGenerationRequest" + "$ref": "#/components/schemas/EmbeddingsRequest" } } }, @@ -845,7 +706,21 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/TrainEvalDataset" + "oneOf": [ + { + "oneOf": [ + { + "$ref": "#/components/schemas/HuggingfaceDatasetDef" + }, + { + "$ref": "#/components/schemas/CustomDatasetDef" + } + ] + }, + { + "type": "null" + } + ] } } } @@ -856,7 +731,7 @@ ], "parameters": [ { - "name": "dataset_uuid", + "name": "dataset_identifier", "in": "query", "required": true, "schema": { @@ -875,7 +750,7 @@ ] } }, - "/evaluate/job/artifacts": { + "/memory_banks/get": { "get": { "responses": { "200": { @@ -883,18 +758,38 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJobArtifactsResponse" + "oneOf": [ + { + "oneOf": [ + { + "$ref": "#/components/schemas/VectorMemoryBankDef" + }, + { + "$ref": "#/components/schemas/KeyValueMemoryBankDef" + }, + { + "$ref": "#/components/schemas/KeywordMemoryBankDef" + }, + { + "$ref": "#/components/schemas/GraphMemoryBankDef" + } + ] + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "Evaluations" + "MemoryBanks" ], "parameters": [ { - "name": "job_uuid", + "name": "identifier", "in": "query", "required": true, "schema": { @@ -913,7 +808,7 @@ ] } }, - "/evaluate/job/logs": { + "/models/get": { "get": { "responses": { "200": { @@ -921,18 +816,25 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJobLogStream" + "oneOf": [ + { + "$ref": "#/components/schemas/ModelDefWithProvider" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "Evaluations" + "Models" ], "parameters": [ { - "name": "job_uuid", + "name": "identifier", "in": "query", "required": true, "schema": { @@ -951,7 +853,7 @@ ] } }, - "/evaluate/job/status": { + "/shields/get": { "get": { "responses": { "200": { @@ -959,18 +861,25 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJobStatusResponse" + "oneOf": [ + { + "$ref": "#/components/schemas/ShieldDefWithProvider" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "Evaluations" + "Shields" ], "parameters": [ { - "name": "job_uuid", + "name": "shield_type", "in": "query", "required": true, "schema": { @@ -989,24 +898,32 @@ ] } }, - "/evaluate/jobs": { + "/telemetry/get_trace": { "get": { "responses": { "200": { "description": "OK", "content": { - "application/jsonl": { + "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJob" + "$ref": "#/components/schemas/Trace" } } } } }, "tags": [ - "Evaluations" + "Telemetry" ], "parameters": [ + { + "name": "trace_id", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + }, { "name": "X-LlamaStack-ProviderData", "in": "header", @@ -1019,7 +936,7 @@ ] } }, - "/memory_banks/get": { + "/post_training/job/artifacts": { "get": { "responses": { "200": { @@ -1027,200 +944,14 @@ "content": { 
"application/json": { "schema": { - "oneOf": [ - { - "oneOf": [ - { - "$ref": "#/components/schemas/VectorMemoryBankDef" - }, - { - "$ref": "#/components/schemas/KeyValueMemoryBankDef" - }, - { - "$ref": "#/components/schemas/KeywordMemoryBankDef" - }, - { - "$ref": "#/components/schemas/GraphMemoryBankDef" - } - ] - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse" } } } } }, "tags": [ - "MemoryBanks" - ], - "parameters": [ - { - "name": "identifier", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/models/get": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/ModelDefWithProvider" - }, - { - "type": "null" - } - ] - } - } - } - } - }, - "tags": [ - "Models" - ], - "parameters": [ - { - "name": "identifier", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/shields/get": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/ShieldDefWithProvider" - }, - { - "type": "null" - } - ] - } - } - } - } - }, - "tags": [ - "Shields" - ], - "parameters": [ - { - "name": "shield_type", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/telemetry/get_trace": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Trace" - } - } - } - } - }, - "tags": [ - "Telemetry" - ], - "parameters": [ - { - "name": "trace_id", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/post_training/job/artifacts": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse" - } - } - } - } - }, - "tags": [ - "PostTraining" + "PostTraining" ], "parameters": [ { @@ -1412,6 +1143,43 @@ } } }, + "/datasets/list": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/jsonl": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/HuggingfaceDatasetDef" + }, + { + "$ref": "#/components/schemas/CustomDatasetDef" + } + ] + } + } + } + } + }, + "tags": [ + "Datasets" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be 
made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ] + } + }, "/memory_banks/list": { "get": { "responses": { @@ -1836,7 +1604,7 @@ } } }, - "/safety/run_shield": { + "/evals/run_eval_task": { "post": { "responses": { "200": { @@ -1844,14 +1612,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RunShieldResponse" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "Safety" + "Evals" ], "parameters": [ { @@ -1868,7 +1636,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RunShieldRequest" + "$ref": "#/components/schemas/RunEvalTaskRequest" } } }, @@ -1876,7 +1644,7 @@ } } }, - "/post_training/supervised_fine_tune": { + "/evals/run_scorer": { "post": { "responses": { "200": { @@ -1884,14 +1652,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/PostTrainingJob" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "PostTraining" + "Evals" ], "parameters": [ { @@ -1908,7 +1676,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SupervisedFineTuneRequest" + "$ref": "#/components/schemas/RunScorerRequest" } } }, @@ -1916,7 +1684,7 @@ } } }, - "/synthetic_data_generation/generate": { + "/safety/run_shield": { "post": { "responses": { "200": { @@ -1924,14 +1692,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SyntheticDataGenerationResponse" + "$ref": "#/components/schemas/RunShieldResponse" } } } } }, "tags": [ - "SyntheticDataGeneration" + "Safety" ], "parameters": [ { @@ -1948,54 +1716,134 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SyntheticDataGenerateRequest" + "$ref": "#/components/schemas/RunShieldRequest" } } }, "required": true } } - } - }, - "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", - "components": { - "schemas": { - "BuiltinTool": { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - "CompletionMessage": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "assistant", - "default": "assistant" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/components/schemas/ImageMedia" - }, - { - "type": "array", - "items": { - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/components/schemas/ImageMedia" - } - ] + }, + "/post_training/supervised_fine_tune": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/PostTrainingJob" + } + } + } + } + }, + "tags": [ + "PostTraining" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupervisedFineTuneRequest" + } + } + }, + "required": true + } + } + }, + "/synthetic_data_generation/generate": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SyntheticDataGenerationResponse" + } + } + } + } + }, + "tags": [ + "SyntheticDataGeneration" + ], + "parameters": [ + { + "name": 
"X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SyntheticDataGenerateRequest" + } + } + }, + "required": true + } + } + } + }, + "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", + "components": { + "schemas": { + "BuiltinTool": { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + "CompletionMessage": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "assistant", + "default": "assistant" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/ImageMedia" + }, + { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/ImageMedia" + } + ] } } ] @@ -2571,18 +2419,6 @@ "completion_message_batch" ] }, - "CancelEvaluationJobRequest": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, "CancelTrainingJobRequest": { "type": "object", "properties": { @@ -4090,19 +3926,58 @@ "error" ] }, - "TrainEvalDataset": { + "CustomDatasetDef": { "type": "object", "properties": { - "columns": { + "type": { + "type": "string", + "const": "custom", + "default": "custom" + }, + "identifier": { + "type": "string" + }, + "url": { + "type": "string" + }, + "rename_columns_map": { "type": "object", "additionalProperties": { - "$ref": "#/components/schemas/TrainEvalDatasetColumnType" + "type": "string" } + } + }, + "additionalProperties": false, + "required": [ + "type", + "identifier", + "url" + ] + }, + "HuggingfaceDatasetDef": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "huggingface", + "default": "huggingface" }, - "content_url": { - "$ref": "#/components/schemas/URL" + "identifier": { + "type": "string" }, - "metadata": { + "dataset_path": { + "type": "string" + }, + "dataset_name": { + "type": "string" + }, + "rename_columns_map": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "kwargs": { "type": "object", "additionalProperties": { "oneOf": [ @@ -4130,35 +4005,48 @@ }, "additionalProperties": false, "required": [ - "columns", - "content_url" - ], - "title": "Dataset to be used for training or evaluating language models." 
- }, - "TrainEvalDatasetColumnType": { - "type": "string", - "enum": [ - "dialog", - "text", - "media", - "number", - "json" + "type", + "identifier", + "dataset_path", + "kwargs" ] }, "CreateDatasetRequest": { "type": "object", "properties": { - "uuid": { - "type": "string" + "dataset_def": { + "oneOf": [ + { + "$ref": "#/components/schemas/HuggingfaceDatasetDef" + }, + { + "$ref": "#/components/schemas/CustomDatasetDef" + } + ] + } + }, + "additionalProperties": false, + "required": [ + "dataset_def" + ] + }, + "CreateDatasetResponse": { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "success", + "fail" + ] }, - "dataset": { - "$ref": "#/components/schemas/TrainEvalDataset" + "msg": { + "type": "string" } }, "additionalProperties": false, "required": [ - "uuid", - "dataset" + "status" ] }, "DeleteAgentsRequest": { @@ -4192,13 +4080,32 @@ "DeleteDatasetRequest": { "type": "object", "properties": { - "dataset_uuid": { + "dataset_identifier": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "dataset_identifier" + ] + }, + "DeleteDatasetResponse": { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "success", + "fail" + ] + }, + "msg": { "type": "string" } }, "additionalProperties": false, "required": [ - "dataset_uuid" + "status" ] }, "EmbeddingsRequest": { @@ -4258,112 +4165,42 @@ "embeddings" ] }, - "EvaluateQuestionAnsweringRequest": { + "GetAgentsSessionRequest": { "type": "object", "properties": { - "metrics": { + "turn_ids": { "type": "array", "items": { - "type": "string", - "enum": [ - "em", - "f1" - ] + "type": "string" } } }, - "additionalProperties": false, - "required": [ - "metrics" - ] + "additionalProperties": false }, - "EvaluationJob": { + "GraphMemoryBankDef": { "type": "object", "properties": { - "job_uuid": { + "identifier": { "type": "string" + }, + "provider_id": { + "type": "string", + "default": "" + }, + "type": { + "type": "string", + "const": "graph", + "default": "graph" } }, "additionalProperties": false, "required": [ - "job_uuid" + "identifier", + "provider_id", + "type" ] }, - "EvaluateSummarizationRequest": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "rouge", - "bleu" - ] - } - } - }, - "additionalProperties": false, - "required": [ - "metrics" - ] - }, - "EvaluateTextGenerationRequest": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "perplexity", - "rouge", - "bleu" - ] - } - } - }, - "additionalProperties": false, - "required": [ - "metrics" - ] - }, - "GetAgentsSessionRequest": { - "type": "object", - "properties": { - "turn_ids": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false - }, - "GraphMemoryBankDef": { - "type": "object", - "properties": { - "identifier": { - "type": "string" - }, - "provider_id": { - "type": "string", - "default": "" - }, - "type": { - "type": "string", - "const": "graph", - "default": "graph" - } - }, - "additionalProperties": false, - "required": [ - "identifier", - "provider_id", - "type" - ] - }, - "KeyValueMemoryBankDef": { + "KeyValueMemoryBankDef": { "type": "object", "properties": { "identifier": { @@ -4513,43 +4350,6 @@ "step" ] }, - "EvaluationJobArtifactsResponse": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ], - 
"title": "Artifacts of a evaluation job." - }, - "EvaluationJobLogStream": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, - "EvaluationJobStatusResponse": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, "ModelDefWithProvider": { "type": "object", "properties": { @@ -5265,6 +5065,61 @@ "dpo" ] }, + "TrainEvalDataset": { + "type": "object", + "properties": { + "columns": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/TrainEvalDatasetColumnType" + } + }, + "content_url": { + "$ref": "#/components/schemas/URL" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "columns", + "content_url" + ], + "title": "Dataset to be used for training or evaluating language models." + }, + "TrainEvalDatasetColumnType": { + "type": "string", + "enum": [ + "dialog", + "text", + "media", + "number", + "json" + ] + }, "TrainingConfig": { "type": "object", "properties": { @@ -5491,222 +5346,530 @@ "document_id": { "type": "string" } - }, - "additionalProperties": false, - "required": [ - "content", - "token_count", - "document_id" + }, + "additionalProperties": false, + "required": [ + "content", + "token_count", + "document_id" + ] + } + }, + "scores": { + "type": "array", + "items": { + "type": "number" + } + } + }, + "additionalProperties": false, + "required": [ + "chunks", + "scores" + ] + }, + "RegisterMemoryBankRequest": { + "type": "object", + "properties": { + "memory_bank": { + "oneOf": [ + { + "$ref": "#/components/schemas/VectorMemoryBankDef" + }, + { + "$ref": "#/components/schemas/KeyValueMemoryBankDef" + }, + { + "$ref": "#/components/schemas/KeywordMemoryBankDef" + }, + { + "$ref": "#/components/schemas/GraphMemoryBankDef" + } + ] + } + }, + "additionalProperties": false, + "required": [ + "memory_bank" + ] + }, + "RegisterModelRequest": { + "type": "object", + "properties": { + "model": { + "$ref": "#/components/schemas/ModelDefWithProvider" + } + }, + "additionalProperties": false, + "required": [ + "model" + ] + }, + "RegisterShieldRequest": { + "type": "object", + "properties": { + "shield": { + "$ref": "#/components/schemas/ShieldDefWithProvider" + } + }, + "additionalProperties": false, + "required": [ + "shield" + ] + }, + "DialogGenerations": { + "type": "object", + "properties": { + "dialog": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ] + } + }, + "sampled_generations": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "dialog", + "sampled_generations" + ] + }, + "RewardScoreRequest": { + "type": "object", + "properties": { + "dialog_generations": { + "type": 
"array", + "items": { + "$ref": "#/components/schemas/DialogGenerations" + } + }, + "model": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "dialog_generations", + "model" + ] + }, + "RewardScoringResponse": { + "type": "object", + "properties": { + "scored_generations": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ScoredDialogGenerations" + } + } + }, + "additionalProperties": false, + "required": [ + "scored_generations" + ], + "title": "Response from the reward scoring. Batch of (prompt, response, score) tuples that pass the threshold." + }, + "ScoredDialogGenerations": { + "type": "object", + "properties": { + "dialog": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ] + } + }, + "scored_generations": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ScoredMessage" + } + } + }, + "additionalProperties": false, + "required": [ + "dialog", + "scored_generations" + ] + }, + "ScoredMessage": { + "type": "object", + "properties": { + "message": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ] + }, + "score": { + "type": "number" + } + }, + "additionalProperties": false, + "required": [ + "message", + "score" + ] + }, + "EvaluateDatasetConfig": { + "type": "object", + "properties": { + "dataset_identifier": { + "type": "string" + }, + "row_limit": { + "type": "integer" + }, + "kwargs": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "dataset_identifier" + ] + }, + "EvaluateJudgeScoringConfig": { + "type": "object" + }, + "EvaluateModelGenerationConfig": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "kwargs": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "model", + "sampling_params" + ] + }, + "EvaluatePostprocessConfig": { + "type": "object", + "properties": { + "kwargs": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false + }, + "EvaluatePreprocessConfig": { + "type": "object", + "properties": { + "kwargs": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } ] } + } + }, + "additionalProperties": false + }, + "EvaluateProcessorConfig": { + "type": "object", + "properties": { + "processor_identifier": { + 
"type": "string" }, - "scores": { - "type": "array", - "items": { - "type": "number" - } + "preprocess_config": { + "$ref": "#/components/schemas/EvaluatePreprocessConfig" + }, + "postprocess_config": { + "$ref": "#/components/schemas/EvaluatePostprocessConfig" } }, "additionalProperties": false, "required": [ - "chunks", - "scores" + "processor_identifier" ] }, - "RegisterMemoryBankRequest": { + "EvaluateScoringConfig": { "type": "object", "properties": { - "memory_bank": { - "oneOf": [ - { - "$ref": "#/components/schemas/VectorMemoryBankDef" - }, - { - "$ref": "#/components/schemas/KeyValueMemoryBankDef" - }, - { - "$ref": "#/components/schemas/KeywordMemoryBankDef" - }, - { - "$ref": "#/components/schemas/GraphMemoryBankDef" - } - ] + "scorer_config_list": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EvaluateSingleScorerConfig" + } } }, "additionalProperties": false, "required": [ - "memory_bank" + "scorer_config_list" ] }, - "RegisterModelRequest": { + "EvaluateSingleScorerConfig": { "type": "object", "properties": { - "model": { - "$ref": "#/components/schemas/ModelDefWithProvider" + "scorer_name": { + "type": "string" + }, + "llm_judge_config": { + "$ref": "#/components/schemas/LLMJudgeConfig" } }, "additionalProperties": false, "required": [ - "model" + "scorer_name" ] }, - "RegisterShieldRequest": { + "EvaluateTaskConfig": { "type": "object", "properties": { - "shield": { - "$ref": "#/components/schemas/ShieldDefWithProvider" + "dataset_config": { + "$ref": "#/components/schemas/EvaluateDatasetConfig" + }, + "processor_config": { + "$ref": "#/components/schemas/EvaluateProcessorConfig" + }, + "generation_config": { + "$ref": "#/components/schemas/EvaluateModelGenerationConfig" + }, + "scoring_config": { + "$ref": "#/components/schemas/EvaluateScoringConfig" } }, "additionalProperties": false, "required": [ - "shield" + "dataset_config", + "processor_config", + "generation_config", + "scoring_config" ] }, - "DialogGenerations": { + "LLMJudgeConfig": { "type": "object", "properties": { - "dialog": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - } + "judge_processor_config": { + "$ref": "#/components/schemas/EvaluateProcessorConfig" }, - "sampled_generations": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - } + "judge_model_generation_config": { + "$ref": "#/components/schemas/EvaluateModelGenerationConfig" + }, + "judge_scoring_config": { + "$ref": "#/components/schemas/EvaluateJudgeScoringConfig" } }, "additionalProperties": false, "required": [ - "dialog", - "sampled_generations" + "judge_processor_config", + "judge_model_generation_config", + "judge_scoring_config" ] }, - "RewardScoreRequest": { + "RunEvalTaskRequest": { "type": "object", "properties": { - "dialog_generations": { - "type": "array", - "items": { - "$ref": "#/components/schemas/DialogGenerations" - } - }, "model": { "type": "string" + }, + "task": { + "type": "string" + }, + "dataset": { + "type": "string" + }, + "eval_task_config": { + "$ref": "#/components/schemas/EvaluateTaskConfig" } }, "additionalProperties": false, "required": 
[ - "dialog_generations", - "model" + "model", + "task" ] }, - "RewardScoringResponse": { + "EvalResult": { "type": "object", "properties": { - "scored_generations": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ScoredDialogGenerations" + "metrics": { + "type": "object", + "additionalProperties": { + "type": "number" } } }, "additionalProperties": false, "required": [ - "scored_generations" + "metrics" ], - "title": "Response from the reward scoring. Batch of (prompt, response, score) tuples that pass the threshold." + "title": "Aggregated final evaluation result." }, - "ScoredDialogGenerations": { + "EvaluateResponse": { "type": "object", "properties": { - "dialog": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - } + "eval_result": { + "$ref": "#/components/schemas/EvalResult" }, - "scored_generations": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ScoredMessage" - } + "formatted_report": { + "type": "string" } }, "additionalProperties": false, "required": [ - "dialog", - "scored_generations" - ] + "eval_result" + ], + "title": "Scores for evaluation." }, - "ScoredMessage": { + "RunScorerRequest": { "type": "object", "properties": { - "message": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] + "dataset_config": { + "$ref": "#/components/schemas/EvaluateDatasetConfig" }, - "score": { - "type": "number" + "eval_scoring_config": { + "$ref": "#/components/schemas/EvaluateScoringConfig" } }, "additionalProperties": false, "required": [ - "message", - "score" + "dataset_config", + "eval_scoring_config" ] }, "RunShieldRequest": { @@ -6075,46 +6238,46 @@ ], "tags": [ { - "name": "RewardScoring" + "name": "Models" }, { - "name": "Memory" + "name": "BatchInference" }, { - "name": "SyntheticDataGeneration" + "name": "Inspect" }, { - "name": "Models" + "name": "Evals" }, { "name": "Safety" }, { - "name": "BatchInference" + "name": "Shields" }, { - "name": "Agents" + "name": "Telemetry" }, { - "name": "MemoryBanks" + "name": "Agents" }, { - "name": "Shields" + "name": "Memory" }, { - "name": "Datasets" + "name": "SyntheticDataGeneration" }, { - "name": "Evaluations" + "name": "PostTraining" }, { - "name": "Inspect" + "name": "Datasets" }, { - "name": "PostTraining" + "name": "MemoryBanks" }, { - "name": "Telemetry" + "name": "RewardScoring" }, { "name": "Inference" @@ -6195,10 +6358,6 @@ "name": "BatchCompletionResponse", "description": "" }, - { - "name": "CancelEvaluationJobRequest", - "description": "" - }, { "name": "CancelTrainingJobRequest", "description": "" @@ -6368,17 +6527,21 @@ "description": "" }, { - "name": "TrainEvalDataset", - "description": "Dataset to be used for training or evaluating language models.\n\n" + "name": "CustomDatasetDef", + "description": "" }, { - "name": "TrainEvalDatasetColumnType", - "description": "" + "name": "HuggingfaceDatasetDef", + "description": "" }, { "name": "CreateDatasetRequest", "description": "" }, + { + "name": "CreateDatasetResponse", + "description": "" + }, { "name": "DeleteAgentsRequest", "description": "" @@ -6391,6 +6554,10 @@ "name": "DeleteDatasetRequest", "description": 
"" }, + { + "name": "DeleteDatasetResponse", + "description": "" + }, { "name": "EmbeddingsRequest", "description": "" @@ -6399,22 +6566,6 @@ "name": "EmbeddingsResponse", "description": "" }, - { - "name": "EvaluateQuestionAnsweringRequest", - "description": "" - }, - { - "name": "EvaluationJob", - "description": "" - }, - { - "name": "EvaluateSummarizationRequest", - "description": "" - }, - { - "name": "EvaluateTextGenerationRequest", - "description": "" - }, { "name": "GetAgentsSessionRequest", "description": "" @@ -6443,18 +6594,6 @@ "name": "AgentStepResponse", "description": "" }, - { - "name": "EvaluationJobArtifactsResponse", - "description": "Artifacts of a evaluation job.\n\n" - }, - { - "name": "EvaluationJobLogStream", - "description": "" - }, - { - "name": "EvaluationJobStatusResponse", - "description": "" - }, { "name": "ModelDefWithProvider", "description": "" @@ -6555,6 +6694,14 @@ "name": "RLHFAlgorithm", "description": "" }, + { + "name": "TrainEvalDataset", + "description": "Dataset to be used for training or evaluating language models.\n\n" + }, + { + "name": "TrainEvalDatasetColumnType", + "description": "" + }, { "name": "TrainingConfig", "description": "" @@ -6603,6 +6750,62 @@ "name": "ScoredMessage", "description": "" }, + { + "name": "EvaluateDatasetConfig", + "description": "" + }, + { + "name": "EvaluateJudgeScoringConfig", + "description": "" + }, + { + "name": "EvaluateModelGenerationConfig", + "description": "" + }, + { + "name": "EvaluatePostprocessConfig", + "description": "" + }, + { + "name": "EvaluatePreprocessConfig", + "description": "" + }, + { + "name": "EvaluateProcessorConfig", + "description": "" + }, + { + "name": "EvaluateScoringConfig", + "description": "" + }, + { + "name": "EvaluateSingleScorerConfig", + "description": "" + }, + { + "name": "EvaluateTaskConfig", + "description": "" + }, + { + "name": "LLMJudgeConfig", + "description": "" + }, + { + "name": "RunEvalTaskRequest", + "description": "" + }, + { + "name": "EvalResult", + "description": "Aggregated final evaluation result.\n\n" + }, + { + "name": "EvaluateResponse", + "description": "Scores for evaluation.\n\n" + }, + { + "name": "RunScorerRequest", + "description": "" + }, { "name": "RunShieldRequest", "description": "" @@ -6647,7 +6850,7 @@ "Agents", "BatchInference", "Datasets", - "Evaluations", + "Evals", "Inference", "Inspect", "Memory", @@ -6681,7 +6884,6 @@ "BatchCompletionRequest", "BatchCompletionResponse", "BuiltinTool", - "CancelEvaluationJobRequest", "CancelTrainingJobRequest", "ChatCompletionRequest", "ChatCompletionResponse", @@ -6698,31 +6900,40 @@ "CreateAgentSessionRequest", "CreateAgentTurnRequest", "CreateDatasetRequest", + "CreateDatasetResponse", + "CustomDatasetDef", "DPOAlignmentConfig", "DeleteAgentsRequest", "DeleteAgentsSessionRequest", "DeleteDatasetRequest", + "DeleteDatasetResponse", "DialogGenerations", "DoraFinetuningConfig", "EmbeddingsRequest", "EmbeddingsResponse", - "EvaluateQuestionAnsweringRequest", - "EvaluateSummarizationRequest", - "EvaluateTextGenerationRequest", - "EvaluationJob", - "EvaluationJobArtifactsResponse", - "EvaluationJobLogStream", - "EvaluationJobStatusResponse", + "EvalResult", + "EvaluateDatasetConfig", + "EvaluateJudgeScoringConfig", + "EvaluateModelGenerationConfig", + "EvaluatePostprocessConfig", + "EvaluatePreprocessConfig", + "EvaluateProcessorConfig", + "EvaluateResponse", + "EvaluateScoringConfig", + "EvaluateSingleScorerConfig", + "EvaluateTaskConfig", "FinetuningAlgorithm", "FunctionCallToolDefinition", 
"GetAgentsSessionRequest", "GraphMemoryBankDef", "HealthInfo", + "HuggingfaceDatasetDef", "ImageMedia", "InferenceStep", "InsertDocumentsRequest", "KeyValueMemoryBankDef", "KeywordMemoryBankDef", + "LLMJudgeConfig", "LogEventRequest", "LogSeverity", "LoraFinetuningConfig", @@ -6752,6 +6963,8 @@ "RewardScoreRequest", "RewardScoringResponse", "RouteInfo", + "RunEvalTaskRequest", + "RunScorerRequest", "RunShieldRequest", "RunShieldResponse", "SafetyViolation", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 9307ee47b2..ab54c4c09e 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -315,14 +315,6 @@ components: - photogen - code_interpreter type: string - CancelEvaluationJobRequest: - additionalProperties: false - properties: - job_uuid: - type: string - required: - - job_uuid - type: object CancelTrainingJobRequest: additionalProperties: false properties: @@ -572,13 +564,45 @@ components: CreateDatasetRequest: additionalProperties: false properties: - dataset: - $ref: '#/components/schemas/TrainEvalDataset' - uuid: + dataset_def: + oneOf: + - $ref: '#/components/schemas/HuggingfaceDatasetDef' + - $ref: '#/components/schemas/CustomDatasetDef' + required: + - dataset_def + type: object + CreateDatasetResponse: + additionalProperties: false + properties: + msg: + type: string + status: + enum: + - success + - fail type: string required: - - uuid - - dataset + - status + type: object + CustomDatasetDef: + additionalProperties: false + properties: + identifier: + type: string + rename_columns_map: + additionalProperties: + type: string + type: object + type: + const: custom + default: custom + type: string + url: + type: string + required: + - type + - identifier + - url type: object DPOAlignmentConfig: additionalProperties: false @@ -619,10 +643,23 @@ components: DeleteDatasetRequest: additionalProperties: false properties: - dataset_uuid: + dataset_identifier: type: string required: - - dataset_uuid + - dataset_identifier + type: object + DeleteDatasetResponse: + additionalProperties: false + properties: + msg: + type: string + status: + enum: + - success + - fail + type: string + required: + - status type: object DialogGenerations: additionalProperties: false @@ -701,78 +738,147 @@ components: required: - embeddings type: object - EvaluateQuestionAnsweringRequest: + EvalResult: additionalProperties: false properties: metrics: - items: - enum: - - em - - f1 - type: string - type: array + additionalProperties: + type: number + type: object required: - metrics + title: Aggregated final evaluation result. 
type: object - EvaluateSummarizationRequest: + EvaluateDatasetConfig: additionalProperties: false properties: - metrics: - items: - enum: - - rouge - - bleu - type: string - type: array + dataset_identifier: + type: string + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + row_limit: + type: integer required: - - metrics + - dataset_identifier + type: object + EvaluateJudgeScoringConfig: type: object - EvaluateTextGenerationRequest: + EvaluateModelGenerationConfig: additionalProperties: false properties: - metrics: - items: - enum: - - perplexity - - rouge - - bleu - type: string - type: array + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + model: + type: string + sampling_params: + $ref: '#/components/schemas/SamplingParams' required: - - metrics + - model + - sampling_params type: object - EvaluationJob: + EvaluatePostprocessConfig: additionalProperties: false properties: - job_uuid: + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: object + EvaluatePreprocessConfig: + additionalProperties: false + properties: + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: object + EvaluateProcessorConfig: + additionalProperties: false + properties: + postprocess_config: + $ref: '#/components/schemas/EvaluatePostprocessConfig' + preprocess_config: + $ref: '#/components/schemas/EvaluatePreprocessConfig' + processor_identifier: type: string required: - - job_uuid + - processor_identifier type: object - EvaluationJobArtifactsResponse: + EvaluateResponse: additionalProperties: false properties: - job_uuid: + eval_result: + $ref: '#/components/schemas/EvalResult' + formatted_report: type: string required: - - job_uuid - title: Artifacts of a evaluation job. + - eval_result + title: Scores for evaluation. 
type: object - EvaluationJobLogStream: + EvaluateScoringConfig: additionalProperties: false properties: - job_uuid: - type: string + scorer_config_list: + items: + $ref: '#/components/schemas/EvaluateSingleScorerConfig' + type: array required: - - job_uuid + - scorer_config_list type: object - EvaluationJobStatusResponse: + EvaluateSingleScorerConfig: additionalProperties: false properties: - job_uuid: + llm_judge_config: + $ref: '#/components/schemas/LLMJudgeConfig' + scorer_name: type: string required: - - job_uuid + - scorer_name + type: object + EvaluateTaskConfig: + additionalProperties: false + properties: + dataset_config: + $ref: '#/components/schemas/EvaluateDatasetConfig' + generation_config: + $ref: '#/components/schemas/EvaluateModelGenerationConfig' + processor_config: + $ref: '#/components/schemas/EvaluateProcessorConfig' + scoring_config: + $ref: '#/components/schemas/EvaluateScoringConfig' + required: + - dataset_config + - processor_config + - generation_config + - scoring_config type: object FinetuningAlgorithm: enum: @@ -845,6 +951,39 @@ components: required: - status type: object + HuggingfaceDatasetDef: + additionalProperties: false + properties: + dataset_name: + type: string + dataset_path: + type: string + identifier: + type: string + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + rename_columns_map: + additionalProperties: + type: string + type: object + type: + const: huggingface + default: huggingface + type: string + required: + - type + - identifier + - dataset_path + - kwargs + type: object ImageMedia: additionalProperties: false properties: @@ -936,6 +1075,20 @@ components: - provider_id - type type: object + LLMJudgeConfig: + additionalProperties: false + properties: + judge_model_generation_config: + $ref: '#/components/schemas/EvaluateModelGenerationConfig' + judge_processor_config: + $ref: '#/components/schemas/EvaluateProcessorConfig' + judge_scoring_config: + $ref: '#/components/schemas/EvaluateJudgeScoringConfig' + required: + - judge_processor_config + - judge_model_generation_config + - judge_scoring_config + type: object LogEventRequest: additionalProperties: false properties: @@ -1629,6 +1782,32 @@ components: - method - provider_types type: object + RunEvalTaskRequest: + additionalProperties: false + properties: + dataset: + type: string + eval_task_config: + $ref: '#/components/schemas/EvaluateTaskConfig' + model: + type: string + task: + type: string + required: + - model + - task + type: object + RunScorerRequest: + additionalProperties: false + properties: + dataset_config: + $ref: '#/components/schemas/EvaluateDatasetConfig' + eval_scoring_config: + $ref: '#/components/schemas/EvaluateScoringConfig' + required: + - dataset_config + - eval_scoring_config + type: object RunShieldRequest: additionalProperties: false properties: @@ -2507,7 +2686,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. 
The specification is still in\ - \ draft and subject to change.\n Generated at 2024-10-09 21:10:09.073430" + \ draft and subject to change.\n Generated at 2024-10-15 00:44:26.278642" title: '[DRAFT] Llama Stack Specification' version: 0.0.1 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema @@ -2693,7 +2872,7 @@ paths: responses: '200': content: - application/json: + text/event-stream: schema: $ref: '#/components/schemas/AgentTurnResponseStreamChunk' description: OK @@ -2794,81 +2973,16 @@ paths: schema: $ref: '#/components/schemas/CreateDatasetRequest' required: true - responses: - '200': - description: OK - tags: - - Datasets - /datasets/delete: - post: - parameters: - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/DeleteDatasetRequest' - required: true - responses: - '200': - description: OK - tags: - - Datasets - /datasets/get: - get: - parameters: - - in: query - name: dataset_uuid - required: true - schema: - type: string - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string responses: '200': content: application/json: schema: - $ref: '#/components/schemas/TrainEvalDataset' + $ref: '#/components/schemas/CreateDatasetResponse' description: OK tags: - Datasets - /evaluate/job/artifacts: - get: - parameters: - - in: query - name: job_uuid - required: true - schema: - type: string - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - responses: - '200': - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluationJobArtifactsResponse' - description: OK - tags: - - Evaluations - /evaluate/job/cancel: + /datasets/delete: post: parameters: - description: JSON-encoded provider data which will be made available to the @@ -2882,42 +2996,22 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/CancelEvaluationJobRequest' - required: true - responses: - '200': - description: OK - tags: - - Evaluations - /evaluate/job/logs: - get: - parameters: - - in: query - name: job_uuid + $ref: '#/components/schemas/DeleteDatasetRequest' required: true - schema: - type: string - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string responses: '200': content: application/json: schema: - $ref: '#/components/schemas/EvaluationJobLogStream' + $ref: '#/components/schemas/DeleteDatasetResponse' description: OK tags: - - Evaluations - /evaluate/job/status: + - Datasets + /datasets/get: get: parameters: - in: query - name: job_uuid + name: dataset_identifier required: true schema: type: string @@ -2933,11 +3027,15 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluationJobStatusResponse' + oneOf: + - oneOf: + - $ref: '#/components/schemas/HuggingfaceDatasetDef' + - $ref: '#/components/schemas/CustomDatasetDef' + - type: 'null' description: OK tags: - - Evaluations - /evaluate/jobs: + - Datasets + /datasets/list: get: parameters: - description: JSON-encoded provider data which will 
be made available to the @@ -2952,36 +3050,13 @@ paths: content: application/jsonl: schema: - $ref: '#/components/schemas/EvaluationJob' - description: OK - tags: - - Evaluations - /evaluate/question_answering/: - post: - parameters: - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateQuestionAnsweringRequest' - required: true - responses: - '200': - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluationJob' + oneOf: + - $ref: '#/components/schemas/HuggingfaceDatasetDef' + - $ref: '#/components/schemas/CustomDatasetDef' description: OK tags: - - Evaluations - /evaluate/summarization/: + - Datasets + /evals/run_eval_task: post: parameters: - description: JSON-encoded provider data which will be made available to the @@ -2995,18 +3070,18 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluateSummarizationRequest' + $ref: '#/components/schemas/RunEvalTaskRequest' required: true responses: '200': content: application/json: schema: - $ref: '#/components/schemas/EvaluationJob' + $ref: '#/components/schemas/EvaluateResponse' description: OK tags: - - Evaluations - /evaluate/text_generation/: + - Evals + /evals/run_scorer: post: parameters: - description: JSON-encoded provider data which will be made available to the @@ -3020,17 +3095,17 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluateTextGenerationRequest' + $ref: '#/components/schemas/RunScorerRequest' required: true responses: '200': content: application/json: schema: - $ref: '#/components/schemas/EvaluationJob' + $ref: '#/components/schemas/EvaluateResponse' description: OK tags: - - Evaluations + - Evals /health: get: parameters: @@ -3712,20 +3787,20 @@ security: servers: - url: http://any-hosted-llama-stack.com tags: -- name: RewardScoring -- name: Memory -- name: SyntheticDataGeneration - name: Models -- name: Safety - name: BatchInference -- name: Agents -- name: MemoryBanks -- name: Shields -- name: Datasets -- name: Evaluations - name: Inspect -- name: PostTraining +- name: Evals +- name: Safety +- name: Shields - name: Telemetry +- name: Agents +- name: Memory +- name: SyntheticDataGeneration +- name: PostTraining +- name: Datasets +- name: MemoryBanks +- name: RewardScoring - name: Inference - description: name: BuiltinTool @@ -3782,9 +3857,6 @@ tags: - description: name: BatchCompletionResponse -- description: - name: CancelEvaluationJobRequest - description: name: CancelTrainingJobRequest @@ -3919,17 +3991,18 @@ tags: name: Turn - description: name: ViolationLevel -- description: 'Dataset to be used for training or evaluating language models. 
- - - ' - name: TrainEvalDataset -- description: - name: TrainEvalDatasetColumnType + name: CustomDatasetDef +- description: + name: HuggingfaceDatasetDef - description: name: CreateDatasetRequest +- description: + name: CreateDatasetResponse - description: name: DeleteAgentsRequest @@ -3939,23 +4012,15 @@ tags: - description: name: DeleteDatasetRequest +- description: + name: DeleteDatasetResponse - description: name: EmbeddingsRequest - description: name: EmbeddingsResponse -- description: - name: EvaluateQuestionAnsweringRequest -- description: - name: EvaluationJob -- description: - name: EvaluateSummarizationRequest -- description: - name: EvaluateTextGenerationRequest - description: name: GetAgentsSessionRequest @@ -3979,18 +4044,6 @@ tags: - description: name: AgentStepResponse -- description: 'Artifacts of a evaluation job. - - - ' - name: EvaluationJobArtifactsResponse -- description: - name: EvaluationJobLogStream -- description: - name: EvaluationJobStatusResponse - description: name: ModelDefWithProvider @@ -4067,6 +4120,14 @@ tags: name: OptimizerConfig - description: name: RLHFAlgorithm +- description: 'Dataset to be used for training or evaluating language models. + + + ' + name: TrainEvalDataset +- description: + name: TrainEvalDatasetColumnType - description: name: TrainingConfig - description: name: ScoredMessage +- description: + name: EvaluateDatasetConfig +- description: + name: EvaluateJudgeScoringConfig +- description: + name: EvaluateModelGenerationConfig +- description: + name: EvaluatePostprocessConfig +- description: + name: EvaluatePreprocessConfig +- description: + name: EvaluateProcessorConfig +- description: + name: EvaluateScoringConfig +- description: + name: EvaluateSingleScorerConfig +- description: + name: EvaluateTaskConfig +- description: + name: LLMJudgeConfig +- description: + name: RunEvalTaskRequest +- description: 'Aggregated final evaluation result. + + + ' + name: EvalResult +- description: 'Scores for evaluation. 
+ + + ' + name: EvaluateResponse +- description: + name: RunScorerRequest - description: name: RunShieldRequest @@ -4141,7 +4247,7 @@ x-tagGroups: - Agents - BatchInference - Datasets - - Evaluations + - Evals - Inference - Inspect - Memory @@ -4172,7 +4278,6 @@ x-tagGroups: - BatchCompletionRequest - BatchCompletionResponse - BuiltinTool - - CancelEvaluationJobRequest - CancelTrainingJobRequest - ChatCompletionRequest - ChatCompletionResponse @@ -4189,31 +4294,40 @@ x-tagGroups: - CreateAgentSessionRequest - CreateAgentTurnRequest - CreateDatasetRequest + - CreateDatasetResponse + - CustomDatasetDef - DPOAlignmentConfig - DeleteAgentsRequest - DeleteAgentsSessionRequest - DeleteDatasetRequest + - DeleteDatasetResponse - DialogGenerations - DoraFinetuningConfig - EmbeddingsRequest - EmbeddingsResponse - - EvaluateQuestionAnsweringRequest - - EvaluateSummarizationRequest - - EvaluateTextGenerationRequest - - EvaluationJob - - EvaluationJobArtifactsResponse - - EvaluationJobLogStream - - EvaluationJobStatusResponse + - EvalResult + - EvaluateDatasetConfig + - EvaluateJudgeScoringConfig + - EvaluateModelGenerationConfig + - EvaluatePostprocessConfig + - EvaluatePreprocessConfig + - EvaluateProcessorConfig + - EvaluateResponse + - EvaluateScoringConfig + - EvaluateSingleScorerConfig + - EvaluateTaskConfig - FinetuningAlgorithm - FunctionCallToolDefinition - GetAgentsSessionRequest - GraphMemoryBankDef - HealthInfo + - HuggingfaceDatasetDef - ImageMedia - InferenceStep - InsertDocumentsRequest - KeyValueMemoryBankDef - KeywordMemoryBankDef + - LLMJudgeConfig - LogEventRequest - LogSeverity - LoraFinetuningConfig @@ -4243,6 +4357,8 @@ x-tagGroups: - RewardScoreRequest - RewardScoringResponse - RouteInfo + - RunEvalTaskRequest + - RunScorerRequest - RunShieldRequest - RunShieldResponse - SafetyViolation diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index c0aa4d161e..f5991c52e1 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -15,6 +15,26 @@ from typing_extensions import Annotated +@json_schema_type +class TrainEvalDatasetColumnType(Enum): + dialog = "dialog" + text = "text" + media = "media" + number = "number" + json = "json" + + +@json_schema_type +class TrainEvalDataset(BaseModel): + """Dataset to be used for training or evaluating language models.""" + + # TODO(ashwin): figure out if we need to add an enum for a "dataset type" + + columns: Dict[str, TrainEvalDatasetColumnType] + content_url: URL + metadata: Optional[Dict[str, Any]] = None + + @json_schema_type class GenerationInput(BaseModel): messages: List[Message] diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py index d943f48b20..cdfe5c4673 100644 --- a/llama_stack/apis/post_training/post_training.py +++ b/llama_stack/apis/post_training/post_training.py @@ -14,7 +14,7 @@ from pydantic import BaseModel, Field from llama_models.llama3.api.datatypes import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 from llama_stack.apis.common.training_types import * # noqa: F403 From cccd5be090b36fadf68f1b355556a4820dac2397 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 15 Oct 2024 10:14:35 -0700 Subject: [PATCH 24/27] move eval_task_config to client --- llama_stack/apis/evals/client.py | 141 +++++++++--------- llama_stack/apis/evals/evals.py | 5 +- .../impls/meta_reference/evals/evals.py | 36 +---- 3 files changed, 71 
insertions(+), 111 deletions(-) diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index 07877c13e5..1e76812c6c 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -46,23 +46,13 @@ async def shutdown(self) -> None: async def run_evals( self, - model: str, - task: str, - dataset: Optional[str] = None, - eval_task_config: Optional[EvaluateTaskConfig] = None, + eval_task_config: EvaluateTaskConfig, ) -> EvaluateResponse: async with httpx.AsyncClient() as client: response = await client.post( f"{self.base_url}/evals/run_eval_task", json={ - "model": model, - "task": task, - "dataset": dataset, - "eval_task_config": ( - json.loads(eval_task_config.json()) - if eval_task_config - else None - ), + "eval_task_config": json.loads(eval_task_config.json()), }, headers={"Content-Type": "application/json"}, timeout=3600, @@ -94,85 +84,88 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""): dataset_client = DatasetsClient(f"http://{host}:{port}") # Full Eval Task - - # # 1. register custom dataset - # response = await dataset_client.create_dataset( - # dataset_def=CustomDatasetDef( - # identifier="mmlu-simple-eval-en", - # url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", - # ), - # ) - # cprint(f"datasets/create: {response}", "cyan") - - # # 2. run evals on the registered dataset - # response = await client.run_evals( - # model="Llama3.1-8B-Instruct", - # dataset="mmlu-simple-eval-en", - # task="mmlu", - # ) - - # if response.formatted_report: - # cprint(response.formatted_report, "green") - # else: - # cprint(f"Response: {response}", "green") - - # Scoring Task - # 1. register huggingface dataset - response = await dataset_client.create_dataset( - dataset_def=HuggingfaceDatasetDef( - identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", - dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals", - dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", - rename_columns_map={ - "output_parsed_answer": "generated_answer", - "input_correct_responses": "expected_answer", - }, - kwargs={"split": "latest"}, - ) - ) - cprint(response, "cyan") - + # 1. register custom dataset response = await dataset_client.create_dataset( dataset_def=CustomDatasetDef( - identifier="rag-evals", - url=data_url_from_file(eval_dataset_path), - rename_columns_map={ - "query": "input_query", - }, - ) + identifier="mmlu-simple-eval-en", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ), ) - cprint(response, "cyan") + cprint(f"datasets/create: {response}", "cyan") - # 2. run evals on the registered dataset - response = await client.run_scorer( + # # 2. 
run evals on the registered dataset + eval_task_config = EvaluateTaskConfig( dataset_config=EvaluateDatasetConfig( - dataset_identifier="rag-evals", - # dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", - row_limit=10, + dataset_identifier="mmlu-simple-eval-en", + row_limit=3, ), - eval_scoring_config=EvaluateScoringConfig( + processor_config=EvaluateProcessorConfig( + processor_identifier="mmlu", + ), + generation_config=EvaluateModelGenerationConfig( + model="Llama3.1-8B-Instruct", + ), + scoring_config=EvaluateScoringConfig( scorer_config_list=[ EvaluateSingleScorerConfig(scorer_name="accuracy"), - EvaluateSingleScorerConfig( - scorer_name="braintrust::answer-correctness" - ), + EvaluateSingleScorerConfig(scorer_name="random"), ] ), ) - + response = await client.run_evals( + eval_task_config=eval_task_config, + ) for k, v in response.eval_result.metrics.items(): cprint(f"{k}: {v}", "green") - # Eleuther Eval Task - # response = await client.run_evals( - # model="Llama3.1-8B-Instruct", - # # task="meta_mmlu_pro_instruct", - # task="meta_ifeval", - # eval_task_config=EvaluateTaskConfig( - # n_samples=2, + # Scoring Task + # # 1. register huggingface dataset + # response = await dataset_client.create_dataset( + # dataset_def=HuggingfaceDatasetDef( + # identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + # dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals", + # dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + # rename_columns_map={ + # "output_parsed_answer": "generated_answer", + # "input_correct_responses": "expected_answer", + # }, + # kwargs={"split": "latest"}, + # ) + # ) + # cprint(response, "cyan") + + # # register custom dataset from file path + # response = await dataset_client.create_dataset( + # dataset_def=CustomDatasetDef( + # identifier="rag-evals", + # url=data_url_from_file(eval_dataset_path), + # rename_columns_map={ + # "query": "input_query", + # }, + # ) + # ) + # cprint(response, "cyan") + + # # 2. run evals on the registered dataset + # response = await client.run_scorer( + # dataset_config=EvaluateDatasetConfig( + # dataset_identifier="rag-evals", + # # dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + # row_limit=10, + # ), + # eval_scoring_config=EvaluateScoringConfig( + # scorer_config_list=[ + # EvaluateSingleScorerConfig(scorer_name="accuracy"), + # EvaluateSingleScorerConfig( + # scorer_name="braintrust::answer-correctness" + # ), + # ] # ), # ) + # for k, v in response.eval_result.metrics.items(): + # cprint(f"{k}: {v}", "green") + def main(host: str, port: int, eval_dataset_path: str = ""): asyncio.run(run_main(host, port, eval_dataset_path)) diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index a02394ee40..c484db734f 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -228,10 +228,7 @@ class Evals(Protocol): @webmethod(route="/evals/run_eval_task") async def run_eval_task( self, - model: str, - task: str, - dataset: Optional[str] = None, - eval_task_config: Optional[EvaluateTaskConfig] = None, + eval_task_config: EvaluateTaskConfig, ) -> EvaluateResponse: ... 
@webmethod(route="/evals/run_scorer") diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 916e40e3ac..a9e2c641f9 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -28,39 +28,9 @@ async def shutdown(self) -> None: async def run_eval_task( self, - model: str, - task: str, - dataset: Optional[str] = None, - eval_task_config: Optional[EvaluateTaskConfig] = None, + eval_task_config: EvaluateTaskConfig, ) -> EvaluateResponse: - cprint( - f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}", - "red", - ) - - if not dataset: - raise ValueError("dataset must be specified for mete-reference evals") - - if not eval_task_config: - # construct eval task config from inputs - eval_task_config = EvaluateTaskConfig( - dataset_config=EvaluateDatasetConfig( - dataset_identifier=dataset, - row_limit=3, - ), - processor_config=EvaluateProcessorConfig( - processor_identifier="mmlu", - ), - generation_config=EvaluateModelGenerationConfig( - model=model, - ), - scoring_config=EvaluateScoringConfig( - scorer_config_list=[ - EvaluateSingleScorerConfig(scorer_name="accuracy"), - EvaluateSingleScorerConfig(scorer_name="random"), - ] - ), - ) + cprint(f"run_eval_task: on {eval_task_config}", "green") run_task = RunEvalTask() eval_result = await run_task.run(eval_task_config, self.inference_api) @@ -75,7 +45,7 @@ async def run_scorer( dataset_config: EvaluateDatasetConfig, eval_scoring_config: EvaluateScoringConfig, ) -> EvaluateResponse: - cprint("run_scorer") + cprint(f"run_scorer: on {dataset_config} with {eval_scoring_config}", "green") run_task = RunScoringTask() eval_result = await run_task.run(dataset_config, eval_scoring_config) From be4f395032930f8ba9b7a21da6d8a9644396a631 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 15 Oct 2024 10:17:45 -0700 Subject: [PATCH 25/27] full evals / full scoring flow --- docs/resources/llama-stack-spec.html | 44 ++++------ docs/resources/llama-stack-spec.yaml | 31 +++---- llama_stack/apis/evals/client.py | 86 +++++++++---------- .../registry/datasets/dataset_wrappers.py | 2 +- 4 files changed, 71 insertions(+), 92 deletions(-) diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index ac75dbf049..7787001ffb 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "0.0.1", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-15 00:44:26.278642" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. 
The specification is still in draft and subject to change.\n Generated at 2024-10-15 10:15:15.195382" }, "servers": [ { @@ -5805,23 +5805,13 @@ "RunEvalTaskRequest": { "type": "object", "properties": { - "model": { - "type": "string" - }, - "task": { - "type": "string" - }, - "dataset": { - "type": "string" - }, "eval_task_config": { "$ref": "#/components/schemas/EvaluateTaskConfig" } }, "additionalProperties": false, "required": [ - "model", - "task" + "eval_task_config" ] }, "EvalResult": { @@ -6238,49 +6228,49 @@ ], "tags": [ { - "name": "Models" + "name": "Inference" }, { - "name": "BatchInference" + "name": "PostTraining" }, { - "name": "Inspect" + "name": "Agents" }, { - "name": "Evals" + "name": "MemoryBanks" }, { - "name": "Safety" + "name": "Inspect" }, { - "name": "Shields" + "name": "Models" }, { - "name": "Telemetry" + "name": "Safety" }, { - "name": "Agents" + "name": "Evals" }, { - "name": "Memory" + "name": "BatchInference" }, { - "name": "SyntheticDataGeneration" + "name": "Shields" }, { - "name": "PostTraining" + "name": "SyntheticDataGeneration" }, { - "name": "Datasets" + "name": "Telemetry" }, { - "name": "MemoryBanks" + "name": "RewardScoring" }, { - "name": "RewardScoring" + "name": "Datasets" }, { - "name": "Inference" + "name": "Memory" }, { "name": "BuiltinTool", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index ab54c4c09e..d601435d79 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -1785,17 +1785,10 @@ components: RunEvalTaskRequest: additionalProperties: false properties: - dataset: - type: string eval_task_config: $ref: '#/components/schemas/EvaluateTaskConfig' - model: - type: string - task: - type: string required: - - model - - task + - eval_task_config type: object RunScorerRequest: additionalProperties: false @@ -2686,7 +2679,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. 
The specification is still in\ - \ draft and subject to change.\n Generated at 2024-10-15 00:44:26.278642" + \ draft and subject to change.\n Generated at 2024-10-15 10:15:15.195382" title: '[DRAFT] Llama Stack Specification' version: 0.0.1 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema @@ -3787,21 +3780,21 @@ security: servers: - url: http://any-hosted-llama-stack.com tags: -- name: Models -- name: BatchInference +- name: Inference +- name: PostTraining +- name: Agents +- name: MemoryBanks - name: Inspect -- name: Evals +- name: Models - name: Safety +- name: Evals +- name: BatchInference - name: Shields -- name: Telemetry -- name: Agents -- name: Memory - name: SyntheticDataGeneration -- name: PostTraining -- name: Datasets -- name: MemoryBanks +- name: Telemetry - name: RewardScoring -- name: Inference +- name: Datasets +- name: Memory - description: name: BuiltinTool - description: None: raise ValueError(f"Unsupported file type: {self.config.url}") if n_samples is not None: - df = df.sample(n=n_samples) + df = df.sample(n=min(n_samples, len(df))) self.dataset = Dataset.from_pandas(df) if self.config.rename_columns_map: From 0c4ed66ecc512b9cf4a1b55315d4f602e3d0f9a8 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 15 Oct 2024 10:20:30 -0700 Subject: [PATCH 26/27] regen openapi --- docs/resources/llama-stack-spec.html | 46 ++++++++++++++++++++++++++-- docs/resources/llama-stack-spec.yaml | 19 ++++++------ 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index c845309964..7ce99db3a7 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "0.0.1", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-10 15:29:56.831109" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-15 10:20:19.984531" }, "servers": [ { @@ -6228,7 +6228,49 @@ ], "tags": [ { - + "name": "Agents" + }, + { + "name": "Telemetry" + }, + { + "name": "Safety" + }, + { + "name": "MemoryBanks" + }, + { + "name": "Datasets" + }, + { + "name": "Shields" + }, + { + "name": "RewardScoring" + }, + { + "name": "PostTraining" + }, + { + "name": "Models" + }, + { + "name": "Inspect" + }, + { + "name": "Evals" + }, + { + "name": "BatchInference" + }, + { + "name": "Inference" + }, + { + "name": "Memory" + }, + { + "name": "SyntheticDataGeneration" }, { "name": "BuiltinTool", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 8dab4f31d9..c116742243 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -2679,6 +2679,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. 
The specification is still in\ + \ draft and subject to change.\n Generated at 2024-10-15 10:20:19.984531" title: '[DRAFT] Llama Stack Specification' version: 0.0.1 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema @@ -3779,21 +3780,21 @@ security: servers: - url: http://any-hosted-llama-stack.com tags: -- name: Inference -- name: PostTraining - name: Agents +- name: Telemetry +- name: Safety - name: MemoryBanks -- name: Inspect +- name: Datasets +- name: Shields +- name: RewardScoring +- name: PostTraining - name: Models -- name: Safety +- name: Inspect - name: Evals - name: BatchInference -- name: Shields -- name: SyntheticDataGeneration -- name: Telemetry -- name: RewardScoring -- name: Datasets +- name: Inference - name: Memory +- name: SyntheticDataGeneration - description: name: BuiltinTool - description: Date: Tue, 15 Oct 2024 13:25:46 -0700 Subject: [PATCH 27/27] llm judge llamastack scorer --- llama_stack/apis/evals/client.py | 18 +++- .../registry/generator_processors/__init__.py | 1 + .../distribution/registry/scorers/__init__.py | 2 + .../impls/meta_reference/evals/evals.py | 4 +- .../evals/generator/inference_generator.py | 1 - .../evals/processor/__init__.py | 1 + .../evals/processor/judge_processor.py | 75 +++++++++++++++++ .../evals/scorer/llm_judge_scorer.py | 83 +++++++++++++++++++ .../evals/tasks/run_eval_task.py | 10 ++- .../evals/tasks/run_scoring_task.py | 11 ++- 10 files changed, 199 insertions(+), 7 deletions(-) create mode 100644 llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index 4756a570ae..fc4820232f 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -93,7 +93,7 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""): ) cprint(f"datasets/create: {response}", "cyan") - # # 2. run evals on the registered dataset + # 2. 
run evals on the registered dataset eval_task_config = EvaluateTaskConfig( dataset_config=EvaluateDatasetConfig( dataset_identifier="mmlu-simple-eval-en", @@ -151,9 +151,21 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""): ), eval_scoring_config=EvaluateScoringConfig( scorer_config_list=[ - EvaluateSingleScorerConfig(scorer_name="accuracy"), + # EvaluateSingleScorerConfig(scorer_name="accuracy"), + # EvaluateSingleScorerConfig( + # scorer_name="braintrust::answer-correctness" + # ), EvaluateSingleScorerConfig( - scorer_name="braintrust::answer-correctness" + scorer_name="llamastack-llm-judge", + llm_judge_config=LLMJudgeConfig( + judge_processor_config=EvaluateProcessorConfig( + processor_identifier="judge", + ), + judge_model_generation_config=EvaluateModelGenerationConfig( + model="Llama3.1-8B-Instruct", + ), + judge_scoring_config=EvaluateJudgeScoringConfig(), + ), ), ] ), diff --git a/llama_stack/distribution/registry/generator_processors/__init__.py b/llama_stack/distribution/registry/generator_processors/__init__.py index 44972cf03e..862984f548 100644 --- a/llama_stack/distribution/registry/generator_processors/__init__.py +++ b/llama_stack/distribution/registry/generator_processors/__init__.py @@ -13,6 +13,7 @@ PROCESSOR_REGISTRY = { "mmlu": MMLUProcessor, + "judge": JudgeProcessor, } for k, v in PROCESSOR_REGISTRY.items(): diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py index 7cbe2a4262..dda71d4e00 100644 --- a/llama_stack/distribution/registry/scorers/__init__.py +++ b/llama_stack/distribution/registry/scorers/__init__.py @@ -7,6 +7,7 @@ from llama_stack.apis.evals import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.scorer.braintrust_scorer import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.llm_judge_scorer import * # noqa: F403 from ..registry import Registry @@ -16,6 +17,7 @@ SCORER_REGISTRY = { "accuracy": AccuracyScorer, "random": RandomScorer, + "llamastack-llm-judge": LlamaStackLLMJudgeScorer, "braintrust::factuality": BraintrustFactualityScorer, "braintrust::answer-correctness": BraintrustAnswerCorrectnessScorer, } diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index a9e2c641f9..7d3eaa85d8 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -48,7 +48,9 @@ async def run_scorer( cprint(f"run_scorer: on {dataset_config} with {eval_scoring_config}", "green") run_task = RunScoringTask() - eval_result = await run_task.run(dataset_config, eval_scoring_config) + eval_result = await run_task.run( + dataset_config, eval_scoring_config, self.inference_api + ) return EvaluateResponse( eval_result=eval_result, diff --git a/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py index adc181e237..dafbb16f5b 100644 --- a/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py +++ b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py @@ -30,7 +30,6 @@ async def generate( ) -> List[GenerationResponseSample]: generation_outputs = [] for sample in preprocessed_dataset: - print("generation: ", sample) response = 
await self.inference_api.chat_completion( model=self.model, messages=sample.generation_input.messages, diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py index f782f9320a..5a7ca27958 100644 --- a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py +++ b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py @@ -3,4 +3,5 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from .judge_processor import JudgeProcessor # noqa: F401 from .mmlu_processor import MMLUProcessor # noqa: F401 diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py b/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py new file mode 100644 index 0000000000..d7d6ae3eb2 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import re + +from llama_stack.apis.evals import * # noqa: F403 + +JUDGE_PROMPT = """ +You will be given a question, an expected_answer, and a system_answer. +Your task is to provide a 'total rating' scoring how well the system_answer answers the question compared with the ground truth in expected_answer, in terms of factual correctness. +Give your answer as an integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question. + +Provide your feedback as follows: + +Feedback::: +Total rating: (your rating, as an int between 0 and 5) + +Now here are the question, expected_answer, and system_answer. 
+ +Question: {question} +Expected Answer: {expected_answer} +System Answer: {answer} + +Feedback::: +Total rating: +""" + + +class JudgeProcessor( + BaseGeneratorProcessor[ + DictSample, PreprocessedSample, GenerationResponseSample, ScorerInputSample + ] +): + """ + Generator processor for LLM Judge + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def preprocess_sample(self, sample: DictSample) -> PreprocessedSample: + content = JUDGE_PROMPT.format( + question=sample.data["input_query"], + expected_answer=sample.data["expected_answer"], + answer=sample.data["generated_answer"], + ) + preprocessed_msgs = [ + { + "role": "user", + "content": content, + } + ] + processed_sample = PreprocessedSample( + generation_input=GenerationInput( + messages=preprocessed_msgs, + ) + ) + return processed_sample + + def postprocess_sample( + self, generation_sample: GenerationResponseSample, dataset_sample: DictSample + ) -> ScorerInputSample: + response_text = generation_sample.generation_output.completion_message + match = re.search(r"Total rating: (\d+)", response_text) + judge_rating = int(match.group(1)) + + return ScorerInputSample( + generated_answer=str(judge_rating), + expected_answer=dataset_sample.data["expected_answer"], + generation_output=PostprocessedGeneration( + completion_message=response_text, + ), + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py new file mode 100644 index 0000000000..f5f56b435f --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+import asyncio +import threading + +import numpy as np + +from llama_stack.distribution.registry.generator_processors import ( + GeneratorProcessorRegistry, +) +from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import ( + InferenceGenerator, +) + +from llama_stack.apis.evals.evals import * # noqa: F401 F403 +from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 +from llama_stack.apis.inference import * # noqa: F403 + + +class LlamaStackLLMJudgeScorer(BaseScorer[ScorerInputSample]): + def __init__(self, llm_judge_config: LLMJudgeConfig, inference_api: Inference): + self.llm_judge_config = llm_judge_config + self.inference_api = inference_api + # https://stackoverflow.com/questions/74703727/how-to-call-async-function-from-sync-funcion-and-get-result-while-a-loop-is-alr + # We will use another thread with its own event loop to run the async api within the sync function + self._loop = asyncio.new_event_loop() + self._thr = threading.Thread( + target=self._loop.run_forever, name="Async Runner", daemon=True + ) + if not self._thr.is_alive(): + self._thr.start() + + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + input_query = scorer_input_sample.input_query + generated_answer = scorer_input_sample.generated_answer + expected_answer = scorer_input_sample.expected_answer + + # Judge F1: preprocess the sample into judge prompt messages + processor = GeneratorProcessorRegistry.get( + self.llm_judge_config.judge_processor_config.processor_identifier + )() + data_sample = DictSample( + data={ + "input_query": input_query, + "generated_answer": generated_answer, + "expected_answer": expected_answer, + } + ) + preprocessed_sample = processor.preprocess_sample(data_sample) + + # Judge Generation + generator = InferenceGenerator( + model=self.llm_judge_config.judge_model_generation_config.model, + inference_api=self.inference_api, + ) + + future = asyncio.run_coroutine_threadsafe( + generator.generate([preprocessed_sample]), self._loop + ) + generation_outputs = future.result() + # Judge F2: postprocess the judge generation to extract the rating + postprocessed_sample = processor.postprocess_sample( + generation_outputs[0], data_sample + ) + + # Judge F3: convert the extracted rating into a numeric score + score = float(postprocessed_sample.generated_answer) + + return SingleEvalResult(score_data={"judge_score": score}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + avg_score = np.average( + [result.score_data["judge_score"] for result in eval_results] + ) + + return EvalResult( + metrics={ + "avg_judge_score": avg_score, + } + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index bcd842c420..fbd98128f1 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -72,7 +72,15 @@ async def run( scorer_list = [] for s_conf in scorer_config_list: scorer = ScorerRegistry.get(s_conf.scorer_name) - scorer_list.append(scorer()) + if s_conf.llm_judge_config: + scorer_list.append( + scorer( + llm_judge_config=s_conf.llm_judge_config, + inference_api=inference_api, + ) + ) + else: + scorer_list.append(scorer()) scorer = AggregateScorer( scorers=scorer_list, diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py index 9ff6cde4d6..6b11191f1e 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py 
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py @@ -50,6 +50,7 @@ async def run( self, dataset_config: EvaluateDatasetConfig, eval_scoring_config: EvaluateScoringConfig, + inference_api: Inference, *args, **kwargs, ) -> EvalResult: @@ -69,7 +70,15 @@ async def run( scorer_list = [] for s_conf in scorer_config_list: scorer = ScorerRegistry.get(s_conf.scorer_name) - scorer_list.append(scorer()) + if s_conf.llm_judge_config: + scorer_list.append( + scorer( + llm_judge_config=s_conf.llm_judge_config, + inference_api=inference_api, + ) + ) + else: + scorer_list.append(scorer()) scorer = AggregateScorer( scorers=scorer_list,
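A note on the dataset_wrappers change in PATCH 25 above: pandas' DataFrame.sample raises a ValueError when asked for more rows than the frame contains (sampling without replacement), so clamping with min(n_samples, len(df)) avoids crashing on small datasets. A minimal standalone sketch of that behavior, not part of the patch:

import pandas as pd

df = pd.DataFrame({"input_query": ["q1", "q2"], "expected_answer": ["a1", "a2"]})

n_samples = 5
# df.sample(n=5) on a 2-row frame would raise ValueError, since sampling
# without replacement cannot exceed the population size.
subset = df.sample(n=min(n_samples, len(df)))
print(len(subset))  # 2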
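The JudgeProcessor.postprocess_sample added in PATCH 27 extracts the score with re.search(r"Total rating: (\d+)", ...) and calls int(match.group(1)) directly, so a judge completion that omits the "Total rating:" line would raise AttributeError. A hedged sketch of a more defensive parser; the helper name, the None fallback, and the clamping are illustrative assumptions, not part of the patch:

import re
from typing import Optional

def extract_judge_rating(response_text: str) -> Optional[int]:
    """Pull the 0-5 rating out of a judge completion formatted per JUDGE_PROMPT."""
    match = re.search(r"Total rating:\s*(\d+)", response_text)
    if match is None:
        # Caller decides how to score unparseable judge output (e.g. skip the sample).
        return None
    rating = int(match.group(1))
    # Clamp to the 0-5 scale the prompt asks for, in case the judge drifts.
    return max(0, min(5, rating))

print(extract_judge_rating("Feedback:::\nTotal rating: 4"))  # 4
print(extract_judge_rating("I cannot rate this answer."))    # None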
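LlamaStackLLMJudgeScorer calls the async inference API from the synchronous score_sample by running a dedicated event loop on a daemon thread and submitting coroutines with asyncio.run_coroutine_threadsafe. A self-contained sketch of that pattern; fake_judge_call is a stand-in for the real chat_completion call, not part of the patch:

import asyncio
import threading

# Dedicated event loop running forever on a background daemon thread.
loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, name="Async Runner", daemon=True).start()

async def fake_judge_call(prompt: str) -> str:
    # Stand-in for an async inference call such as inference_api.chat_completion.
    await asyncio.sleep(0.01)
    return "Feedback:::\nTotal rating: 5"

def score_sync(prompt: str) -> int:
    # Submit the coroutine to the background loop and block until it finishes.
    future = asyncio.run_coroutine_threadsafe(fake_judge_call(prompt), loop)
    response = future.result(timeout=30)
    return int(response.rsplit(":", 1)[1])

print(score_sync("Q: capital of France? A: Paris"))  # 5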