From 31c046dcdf8bf9d09fbe0c56a567de07e6e3b525 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 10 Oct 2024 11:35:26 -0700 Subject: [PATCH 01/27] evals new rebase --- llama_stack/apis/dataset/dataset.py | 16 +- llama_stack/apis/evals/client.py | 85 ++++++++ llama_stack/apis/evals/evals.py | 123 +++++------ llama_stack/distribution/registry/__init__.py | 5 + .../registry/datasets/__init__.py | 23 +++ .../distribution/registry/datasets/dataset.py | 62 ++++++ .../registry/datasets/dataset_registry.py | 32 +++ .../distribution/registry/tasks/__init__.py | 13 ++ .../distribution/registry/tasks/task.py | 49 +++++ .../registry/tasks/task_registry.py | 32 +++ llama_stack/distribution/resolver.py | 2 + llama_stack/providers/datatypes.py | 1 + .../impls/meta_reference/evals/__init__.py | 19 ++ .../impls/meta_reference/evals/config.py | 10 + .../impls/meta_reference/evals/evals.py | 71 +++++++ .../meta_reference/evals/tasks/__init__.py | 5 + .../meta_reference/evals/tasks/mmlu_task.py | 150 ++++++++++++++ .../impls/third_party/evals/__init__.py | 5 + .../third_party/evals/eleuther/__init__.py | 19 ++ .../third_party/evals/eleuther/config.py | 10 + .../third_party/evals/eleuther/eleuther.py | 168 +++++++++++++++ .../eleuther/tasks/meta_ifeval/ifeval.yaml | 32 +++ .../evals/eleuther/tasks/meta_ifeval/utils.py | 191 ++++++++++++++++++ .../mmlu_pro_5shot_cot_instruct.yaml | 29 +++ .../eleuther/tasks/meta_mmlu_pro/utils.py | 35 ++++ llama_stack/providers/registry/evals.py | 42 ++++ .../providers/utils/telemetry/tracing.py | 2 +- tests/examples/local-run.yaml | 5 + 28 files changed, 1145 insertions(+), 91 deletions(-) create mode 100644 llama_stack/apis/evals/client.py create mode 100644 llama_stack/distribution/registry/__init__.py create mode 100644 llama_stack/distribution/registry/datasets/__init__.py create mode 100644 llama_stack/distribution/registry/datasets/dataset.py create mode 100644 llama_stack/distribution/registry/datasets/dataset_registry.py create mode 100644 llama_stack/distribution/registry/tasks/__init__.py create mode 100644 llama_stack/distribution/registry/tasks/task.py create mode 100644 llama_stack/distribution/registry/tasks/task_registry.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/__init__.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/config.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/evals.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py create mode 100644 llama_stack/providers/impls/third_party/evals/__init__.py create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/__init__.py create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/config.py create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml create mode 100644 llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py create mode 100644 llama_stack/providers/registry/evals.py diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py index 2fa8bb4e5e..ba2cb88110 100644 --- 
a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/dataset/dataset.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from enum import Enum +# from enum import Enum from typing import Any, Dict, Optional, Protocol from llama_models.llama3.api.datatypes import URL @@ -14,22 +14,12 @@ from pydantic import BaseModel -@json_schema_type -class TrainEvalDatasetColumnType(Enum): - dialog = "dialog" - text = "text" - media = "media" - number = "number" - json = "json" - - @json_schema_type class TrainEvalDataset(BaseModel): """Dataset to be used for training or evaluating language models.""" - # TODO(ashwin): figure out if we need to add an enum for a "dataset type" - - columns: Dict[str, TrainEvalDatasetColumnType] + # unique identifier associated with the dataset + dataset_id: str content_url: URL metadata: Optional[Dict[str, Any]] = None diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py new file mode 100644 index 0000000000..ad4a471455 --- /dev/null +++ b/llama_stack/apis/evals/client.py @@ -0,0 +1,85 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import asyncio +import json + +import fire +import httpx +from termcolor import cprint + +from .evals import * # noqa: F403 + + +class EvaluationClient(Evals): + def __init__(self, base_url: str): + self.base_url = base_url + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def run_evals( + self, + model: str, + task: str, + dataset: Optional[str] = None, + eval_task_config: Optional[EvaluateTaskConfig] = None, + ) -> EvaluateResponse: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/evals/run", + json={ + "model": model, + "task": task, + "dataset": dataset, + "eval_task_config": ( + json.loads(eval_task_config.json()) + if eval_task_config + else None + ), + }, + headers={"Content-Type": "application/json"}, + timeout=3600, + ) + response.raise_for_status() + return EvaluateResponse(**response.json()) + + +async def run_main(host: str, port: int): + client = EvaluationClient(f"http://{host}:{port}") + + # CustomDataset + response = await client.run_evals( + model="Llama3.1-8B-Instruct", + dataset="mmlu-simple-eval-en", + task="mmlu", + eval_task_config=EvaluateTaskConfig( + n_samples=2, + ), + ) + cprint(f"evaluate response={response}", "green") + + # Eleuther Eval Task + # response = await client.run_evals( + # model="Llama3.1-8B-Instruct", + # task="meta_mmlu_pro_instruct", + # # task="meta_ifeval", + # eval_task_config=EvaluateTaskConfig( + # n_samples=2, + # ) + # ) + # cprint(response.metrics["metrics_table"], "red") + + +def main(host: str, port: int): + asyncio.run(run_main(host, port)) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 0be2243ab1..dbb1348a53 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -4,8 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from enum import Enum -from typing import List, Protocol +from typing import Protocol from llama_models.schema_utils import webmethod @@ -13,23 +12,6 @@ from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.dataset import * # noqa: F403 -from llama_stack.apis.common.training_types import * # noqa: F403 - - -class TextGenerationMetric(Enum): - perplexity = "perplexity" - rouge = "rouge" - bleu = "bleu" - - -class QuestionAnsweringMetric(Enum): - em = "em" - f1 = "f1" - - -class SummarizationMetric(Enum): - rouge = "rouge" - bleu = "bleu" class EvaluationJob(BaseModel): @@ -40,37 +22,21 @@ class EvaluationJobLogStream(BaseModel): job_uuid: str -class EvaluateTaskRequestCommon(BaseModel): - job_uuid: str - dataset: TrainEvalDataset - - checkpoint: Checkpoint - - # generation params +class EvaluateTaskConfig(BaseModel): + # num examples to evaluate, evaluate all if None + n_samples: Optional[int] = None + # model evaluation params sampling_params: SamplingParams = SamplingParams() @json_schema_type -class EvaluateTextGenerationRequest(EvaluateTaskRequestCommon): - """Request to evaluate text generation.""" - - metrics: List[TextGenerationMetric] - - -@json_schema_type -class EvaluateQuestionAnsweringRequest(EvaluateTaskRequestCommon): - """Request to evaluate question answering.""" +class EvaluateResponse(BaseModel): + """Scores for evaluation.""" - metrics: List[QuestionAnsweringMetric] + metrics: Dict[str, str] @json_schema_type -class EvaluateSummarizationRequest(EvaluateTaskRequestCommon): - """Request to evaluate summarization.""" - - metrics: List[SummarizationMetric] - - class EvaluationJobStatusResponse(BaseModel): job_uuid: str @@ -82,41 +48,44 @@ class EvaluationJobArtifactsResponse(BaseModel): job_uuid: str -class Evaluations(Protocol): - @webmethod(route="/evaluate/text_generation/") - def evaluate_text_generation( - self, - metrics: List[TextGenerationMetric], - ) -> EvaluationJob: ... - - @webmethod(route="/evaluate/question_answering/") - def evaluate_question_answering( - self, - metrics: List[QuestionAnsweringMetric], - ) -> EvaluationJob: ... - - @webmethod(route="/evaluate/summarization/") - def evaluate_summarization( - self, - metrics: List[SummarizationMetric], - ) -> EvaluationJob: ... - - @webmethod(route="/evaluate/jobs") - def get_evaluation_jobs(self) -> List[EvaluationJob]: ... - - @webmethod(route="/evaluate/job/status") - def get_evaluation_job_status( - self, job_uuid: str - ) -> EvaluationJobStatusResponse: ... +@json_schema_type +class EvaluationJobCreateResponse(BaseModel): + """Response to create a evaluation job.""" - # sends SSE stream of logs - @webmethod(route="/evaluate/job/logs") - def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ... + job_uuid: str - @webmethod(route="/evaluate/job/cancel") - def cancel_evaluation_job(self, job_uuid: str) -> None: ... - @webmethod(route="/evaluate/job/artifacts") - def get_evaluation_job_artifacts( - self, job_uuid: str - ) -> EvaluationJobArtifactsResponse: ... +class Evals(Protocol): + @webmethod(route="/evals/run") + async def run_evals( + self, + model: str, + task: str, + dataset: Optional[str] = None, + eval_task_config: Optional[EvaluateTaskConfig] = None, + ) -> EvaluateResponse: ... + + # @webmethod(route="/evals/jobs") + # def get_evaluation_jobs(self) -> List[EvaluationJob]: ... + + # @webmethod(route="/evals/job/create") + # async def create_evaluation_job( + # self, model: str, dataset: str, task: str + # ) -> EvaluationJob: ... 
+ + # @webmethod(route="/evals/job/status") + # def get_evaluation_job_status( + # self, job_uuid: str + # ) -> EvaluationJobStatusResponse: ... + + # # sends SSE stream of logs + # @webmethod(route="/evals/job/logs") + # def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ... + + # @webmethod(route="/evals/job/cancel") + # def cancel_evaluation_job(self, job_uuid: str) -> None: ... + + # @webmethod(route="/evals/job/artifacts") + # def get_evaluation_job_artifacts( + # self, job_uuid: str + # ) -> EvaluationJobArtifactsResponse: ... diff --git a/llama_stack/distribution/registry/__init__.py b/llama_stack/distribution/registry/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/llama_stack/distribution/registry/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py new file mode 100644 index 0000000000..0b7a843953 --- /dev/null +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +# TODO: make these import config based +from .dataset import CustomDataset, HFDataset +from .dataset_registry import DatasetRegistry + +DATASETS_REGISTRY = { + "mmlu-simple-eval-en": CustomDataset( + name="mmlu_eval", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ), + "hellaswag": HFDataset( + name="hellaswag", + url="hf://hellaswag?split=validation&trust_remote_code=True", + ), +} + +for k, v in DATASETS_REGISTRY.items(): + DatasetRegistry.register(k, v) diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py new file mode 100644 index 0000000000..1a16a5c51b --- /dev/null +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -0,0 +1,62 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
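For orientation, a minimal sketch of how an extra dataset could be registered against the registry initialized above, assuming the CustomDataset and DatasetRegistry classes added in this patch; the dataset name and URL below are invented placeholders.

# Illustrative only (not part of this patch): registering an additional CSV-backed dataset.
from llama_stack.distribution.registry.datasets.dataset import CustomDataset
from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry

my_eval_set = CustomDataset(
    name="my-eval-set",                      # unique dataset identifier (placeholder)
    url="https://example.com/my_eval.csv",   # placeholder URL; any .csv or .xlsx works
)
DatasetRegistry.register("my-eval-set", my_eval_set)

# A provider can later look the dataset up by name and load it lazily:
ds = DatasetRegistry.get_dataset("my-eval-set")
ds.load()
print(len(ds.dataset))  # underlying HuggingFace Dataset built from the CSV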
+ +from abc import ABC, abstractmethod +from urllib.parse import parse_qs, urlparse + +import pandas +from datasets import Dataset, load_dataset + + +class BaseDataset(ABC): + def __init__(self, name: str): + self.dataset = None + self.dataset_id = name + self.type = self.__class__.__name__ + + def __iter__(self): + return iter(self.dataset) + + @abstractmethod + def load(self): + pass + + +class CustomDataset(BaseDataset): + def __init__(self, name, url): + super().__init__(name) + self.url = url + + def load(self): + if self.dataset: + return + # TODO: better support w/ data url + if self.url.endswith(".csv"): + df = pandas.read_csv(self.url) + elif self.url.endswith(".xlsx"): + df = pandas.read_excel(self.url) + + self.dataset = Dataset.from_pandas(df) + + +class HFDataset(BaseDataset): + def __init__(self, name, url): + super().__init__(name) + self.url = url + + def load(self): + if self.dataset: + return + + parsed = urlparse(self.url) + + if parsed.scheme != "hf": + raise ValueError(f"Unknown HF dataset: {self.url}") + + query = parse_qs(parsed.query) + query = {k: v[0] for k, v in query.items()} + path = parsed.netloc + self.dataset = load_dataset(path, **query) diff --git a/llama_stack/distribution/registry/datasets/dataset_registry.py b/llama_stack/distribution/registry/datasets/dataset_registry.py new file mode 100644 index 0000000000..9ddaa8bb7a --- /dev/null +++ b/llama_stack/distribution/registry/datasets/dataset_registry.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import AbstractSet, Dict + +from .dataset import BaseDataset + + +class DatasetRegistry: + _REGISTRY: Dict[str, BaseDataset] = {} + + @staticmethod + def names() -> AbstractSet[str]: + return DatasetRegistry._REGISTRY.keys() + + @staticmethod + def register(name: str, task: BaseDataset) -> None: + if name in DatasetRegistry._REGISTRY: + raise ValueError(f"Dataset {name} already exists.") + DatasetRegistry._REGISTRY[name] = task + + @staticmethod + def get_dataset(name: str) -> BaseDataset: + if name not in DatasetRegistry._REGISTRY: + raise ValueError(f"Dataset {name} not found.") + return DatasetRegistry._REGISTRY[name] + + @staticmethod + def reset() -> None: + DatasetRegistry._REGISTRY = {} diff --git a/llama_stack/distribution/registry/tasks/__init__.py b/llama_stack/distribution/registry/tasks/__init__.py new file mode 100644 index 0000000000..01ccb18aee --- /dev/null +++ b/llama_stack/distribution/registry/tasks/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +# TODO: make these import config based +from llama_stack.providers.impls.meta_reference.evals.tasks.mmlu_task import MMLUTask +from .task_registry import TaskRegistry + +TaskRegistry.register( + "mmlu", + MMLUTask, +) diff --git a/llama_stack/distribution/registry/tasks/task.py b/llama_stack/distribution/registry/tasks/task.py new file mode 100644 index 0000000000..a92e6241b6 --- /dev/null +++ b/llama_stack/distribution/registry/tasks/task.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
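As a side note on the hf:// convention consumed by HFDataset.load() above, here is a small standard-library-only sketch of how such a URL decomposes into load_dataset arguments, reusing the hellaswag entry from the registry.

# Illustration of the hf:// URL parsing done by HFDataset.load(); stdlib only.
from urllib.parse import parse_qs, urlparse

url = "hf://hellaswag?split=validation&trust_remote_code=True"
parsed = urlparse(url)
kwargs = {k: v[0] for k, v in parse_qs(parsed.query).items()}

print(parsed.netloc)  # "hellaswag" -> passed to datasets.load_dataset as the path
print(kwargs)         # {"split": "validation", "trust_remote_code": "True"}
                      # note: query values arrive as strings, not booleans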
+from abc import ABC, abstractmethod + + +class BaseTask(ABC): + """ + A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. + Base class for all evaluation tasks. Each task needs to implement the following methods: + - F1: preprocess_sample(self) + - F2: postprocess_sample(self) + - F3: score_sample(self) + """ + + def __init__(self, dataset, *args, **kwargs): + super().__init__(*args, **kwargs) + self._name = self.__class__.__name__ + self.dataset = dataset + + @abstractmethod + def preprocess_sample(self, sample): + raise NotImplementedError() + + @abstractmethod + def postprocess_sample(self, sample): + raise NotImplementedError() + + @abstractmethod + def score_sample(self, sample, ground_truth): + raise NotImplementedError() + + @abstractmethod + def aggregate_results(self, eval_results): + raise NotImplementedError() + + def preprocess(self): + return [self.preprocess_sample(sample) for sample in self.dataset] + + def postprocess(self, generation): + return [self.postprocess_sample(sample) for sample in generation] + + def score(self, postprocessed): + return [ + self.score_sample(sample, ground_truth) + for sample, ground_truth in zip(postprocessed, self.dataset) + ] diff --git a/llama_stack/distribution/registry/tasks/task_registry.py b/llama_stack/distribution/registry/tasks/task_registry.py new file mode 100644 index 0000000000..063894e482 --- /dev/null +++ b/llama_stack/distribution/registry/tasks/task_registry.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import AbstractSet, Dict + +from .task import BaseTask + + +class TaskRegistry: + _REGISTRY: Dict[str, BaseTask] = {} + + @staticmethod + def names() -> AbstractSet[str]: + return TaskRegistry._REGISTRY.keys() + + @staticmethod + def register(name: str, task: BaseTask) -> None: + if name in TaskRegistry._REGISTRY: + raise ValueError(f"Task {name} already exists.") + TaskRegistry._REGISTRY[name] = task + + @staticmethod + def get_task(name: str) -> BaseTask: + if name not in TaskRegistry._REGISTRY: + raise ValueError(f"Task {name} not found.") + return TaskRegistry._REGISTRY[name] + + @staticmethod + def reset() -> None: + TaskRegistry._REGISTRY = {} diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index a05e08cd7c..672a4ea60f 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -12,6 +12,7 @@ from llama_stack.distribution.datatypes import * # noqa: F403 from llama_stack.apis.agents import Agents +from llama_stack.apis.evals import Evals from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.memory import Memory @@ -38,6 +39,7 @@ def api_protocol_map() -> Dict[Api, Any]: Api.safety: Safety, Api.shields: Shields, Api.telemetry: Telemetry, + Api.evals: Evals, } diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index 777cd855b7..50ab0691b9 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -28,6 +28,7 @@ class Api(Enum): models = "models" shields = "shields" memory_banks = "memory_banks" + evals = "evals" # built-in API inspect = "inspect" diff --git a/llama_stack/providers/impls/meta_reference/evals/__init__.py 
b/llama_stack/providers/impls/meta_reference/evals/__init__.py new file mode 100644 index 0000000000..f4dd4b79d6 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .config import MetaReferenceEvalsImplConfig # noqa +from llama_stack.apis.inference import * # noqa: F403 +from llama_stack.distribution.datatypes import Api, ProviderSpec + + +async def get_provider_impl( + config: MetaReferenceEvalsImplConfig, deps: Dict[Api, ProviderSpec] +): + from .evals import MetaReferenceEvalsImpl + + impl = MetaReferenceEvalsImpl(config, deps[Api.inference]) + await impl.initialize() + return impl diff --git a/llama_stack/providers/impls/meta_reference/evals/config.py b/llama_stack/providers/impls/meta_reference/evals/config.py new file mode 100644 index 0000000000..05dee366ed --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/config.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pydantic import BaseModel + + +class MetaReferenceEvalsImplConfig(BaseModel): ... diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py new file mode 100644 index 0000000000..5f475c5395 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
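For context on how the task registry above is meant to be used, a hedged sketch of registering a custom task next to the built-in "mmlu" entry; the ExactMatchTask class, its row keys, and the sample row are invented for illustration.

# Hypothetical example (not part of this patch): defining and registering a trivial task.
from llama_stack.apis.evals import *  # noqa: F403
from llama_stack.distribution.registry.tasks.task import BaseTask
from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry


class ExactMatchTask(BaseTask):
    def preprocess_sample(self, sample):
        # assumes each dataset row carries "question" / "answer" keys
        return {"role": "user", "content": sample["question"]}

    def postprocess_sample(self, sample):
        return sample.strip().lower()

    def score_sample(self, sample, ground_truth):
        return 1.0 if sample == ground_truth["answer"].strip().lower() else 0.0

    def aggregate_results(self, eval_results):
        return EvaluateResponse(
            metrics={"accuracy": str(sum(eval_results) / len(eval_results))}
        )


TaskRegistry.register("exact_match", ExactMatchTask)

# The registry stores the class; providers instantiate it with a loaded dataset:
task = TaskRegistry.get_task("exact_match")(
    dataset=[{"question": "1+1?", "answer": "2"}]
)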
+
+from llama_stack.apis.inference import *  # noqa: F403
+from llama_stack.apis.evals import *  # noqa: F403
+from termcolor import cprint
+
+from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry
+
+from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry
+
+from .config import MetaReferenceEvalsImplConfig
+
+
+class MetaReferenceEvalsImpl(Evals):
+    def __init__(self, config: MetaReferenceEvalsImplConfig, inference_api: Inference):
+        self.inference_api = inference_api
+
+    async def initialize(self) -> None:
+        pass
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def run_evals(
+        self,
+        model: str,
+        task: str,
+        dataset: Optional[str] = None,
+        eval_task_config: Optional[EvaluateTaskConfig] = None,
+    ) -> EvaluateResponse:
+        cprint(
+            f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}",
+            "red",
+        )
+        if not dataset:
+            raise ValueError("dataset must be specified for meta-reference evals")
+
+        dataset = DatasetRegistry.get_dataset(dataset)
+        dataset.load()
+
+        task_impl = TaskRegistry.get_task(task)(dataset)
+        x1 = task_impl.preprocess()
+
+        # TODO: replace w/ batch inference & async return eval job
+        generation_outputs = []
+        if eval_task_config is None:
+            eval_task_config = EvaluateTaskConfig(n_samples=len(x1))
+        if eval_task_config.n_samples is None or eval_task_config.n_samples > len(x1):
+            eval_task_config.n_samples = len(x1)
+
+        print(
+            f"Eval generation start, generate on {eval_task_config.n_samples} samples"
+        )
+
+        for msg in x1[: eval_task_config.n_samples]:
+            print("generation for msg: ", msg)
+            response = await self.inference_api.chat_completion(
+                model=model,
+                messages=[msg],
+                stream=False,
+            )
+            generation_outputs.append(response.completion_message.content)
+
+        x2 = task_impl.postprocess(generation_outputs)
+        eval_results = task_impl.score(x2)
+        eval_response = task_impl.aggregate_results(eval_results)
+        return eval_response
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py b/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py
new file mode 100644
index 0000000000..756f351d88
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py
new file mode 100644
index 0000000000..673a953791
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py
@@ -0,0 +1,150 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import re
+
+from llama_stack.apis.evals import *  # noqa: F403
+from llama_stack.distribution.registry.tasks.task import BaseTask
+
+QUERY_TEMPLATE_MULTICHOICE = """
+Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD.
+ +{Question} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + +MULTILINGUAL_ANSWER_REGEXES = [ + r"Answer\s*:", + r"Answer\s*:​​​​​​", # Korean invisible character + r"উত্তর\s*:", + r"उत्तर\s*:", + r"উত্তরঃ", + r"উত্তর\s*:", + r"Antwort\s*:", + r"답변\s*:", + r"정답\s*:", + r"답\s*:", + r"答案\s*:", + r"答案\s*:", + r"答\s*:", + r"答\s*:", + r"答复\s*:", + r"答曰\s*:", + r"الإجابة:", + r"الجواب:", + r"إجابة:", + r"الإجابة النهائية:", + r"الإجابة الصحيحة:", + r"الإجابة الصحيحة هي:", + r"الإجابة هي:", + r"Respuesta\s*:", + r"Risposta\s*:", + r"答え\s*:", + r"答え\s*:", + r"回答\s*:", + r"回答\s*:", + r"解答\s*:", + r"Jawaban\s*:", + r"Réponse\s*:", + r"Resposta\s*:", + r"Jibu\s*:", + r"Idahun\s*:", + r"Ìdáhùn\s*:", + r"Idáhùn\s*:", + r"Àmọ̀nà\s*:", + r"Àdáhùn\s*:", + r"Ànúgọ\s*:", + r"Àṣàyàn\s*:", +] + +MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = ( + r"(?i){}\s*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[A]|[B]|[C]|[D])" +) + + +def normalize_response(response: str) -> str: + """ + Normalize the response by removing markdown and LaTeX formatting that may prevent a match. + """ + + return ( + response.replace("**", "") + .replace("$\\boxed{", "") + .replace("}$", "") + .replace("\\$", "") + .replace("$\\text{", "") + .replace("$", "") + .replace("\\mathrm{", "") + .replace("\\{", "") + .replace("\\text", "") + .replace("\\(", "") + .replace("\\mathbf{", "") + .replace("{", "") + .replace("\\boxed", "") + ) + + +def normalize_extracted_answer(extracted_answer: str) -> str: + return ( + # In arabic these are the letters used for A-D in multiple choice questions + extracted_answer.replace("أ", " A") + .replace("ب", " B") + .replace("ج", " C") + .replace("د", " D") + # In Bengali these are the letters used for A-D in multiple choice questions + .replace("অ", " A") + .replace("ব", " B") + .replace("ড", " C") + .replace("ঢ", " D") + # In Japanese these are the letters sometimes used for A-D in multiple choice questions + .replace("A", " A") + .replace("B", " B") + .replace("C", " C") + .replace("D", " D") + .strip() + ) + + +class MMLUTask(BaseTask): + """ + MMLU Task. + """ + + def __init__(self, dataset, *args, **kwargs): + super().__init__(dataset, *args, **kwargs) + + def preprocess_sample(self, sample): + content = QUERY_TEMPLATE_MULTICHOICE.format(**sample) + return { + "role": "user", + "content": content, + } + + def postprocess_sample(self, sample): + normalized = normalize_response(sample) + return normalized + + def score_sample(self, sample, expected): + extracted_answer = None + for answer_regex in MULTILINGUAL_ANSWER_REGEXES: + regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) + match = re.search(regex, sample) + if match: + extracted_answer = normalize_extracted_answer(match.group(1)) + break + score = ( + 1.0 if extracted_answer and extracted_answer == expected["Answer"] else 0.0 + ) + # TODO: generalize this into SingleEvalResult + return score + + def aggregate_results(self, eval_results): + return EvaluateResponse( + metrics={"score": str(sum(eval_results) / len(eval_results))} + ) diff --git a/llama_stack/providers/impls/third_party/evals/__init__.py b/llama_stack/providers/impls/third_party/evals/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
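To make the MMLU scoring path above concrete, a short walkthrough of one sample through preprocess, postprocess, and score; the question row and the model output are fabricated purely for illustration.

# Hypothetical walkthrough (not part of this patch) of MMLUTask on a single made-up row.
from llama_stack.providers.impls.meta_reference.evals.tasks.mmlu_task import MMLUTask

row = {
    "Question": "What is the capital of France?",
    "A": "Berlin",
    "B": "Paris",
    "C": "Rome",
    "D": "Madrid",
    "Answer": "B",
}
task = MMLUTask(dataset=[row])

message = task.preprocess_sample(row)        # chat message that would go to the inference API
raw_output = "Thinking about capitals...\nAnswer: B"   # pretend model generation
cleaned = task.postprocess_sample(raw_output)           # strips markdown/LaTeX wrappers
print(task.score_sample(cleaned, row))                  # 1.0, extracted letter matches row["Answer"]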
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py b/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py new file mode 100644 index 0000000000..9886ed6d6c --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .config import EleutherEvalsImplConfig # noqa +from llama_stack.apis.inference import * # noqa: F403 +from llama_stack.distribution.datatypes import Api, ProviderSpec + + +async def get_provider_impl( + config: EleutherEvalsImplConfig, deps: Dict[Api, ProviderSpec] +): + from .eleuther import EleutherEvalsAdapter + + impl = EleutherEvalsAdapter(config, deps[Api.inference]) + await impl.initialize() + return impl diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/config.py b/llama_stack/providers/impls/third_party/evals/eleuther/config.py new file mode 100644 index 0000000000..a9ab297b42 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/config.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pydantic import BaseModel + + +class EleutherEvalsImplConfig(BaseModel): ... diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py new file mode 100644 index 0000000000..b9f9505e93 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py @@ -0,0 +1,168 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import asyncio +from llama_stack.apis.inference import * # noqa: F403 +from llama_stack.apis.evals import * # noqa: F403 +import os +import random +import threading +from pathlib import Path + +import lm_eval +import tqdm +from lm_eval.api.model import LM +from lm_eval.evaluator import evaluate, get_task_list +from lm_eval.tasks import get_task_dict, TaskManager +from termcolor import cprint + +from .config import EleutherEvalsImplConfig + + +# https://stackoverflow.com/questions/74703727/how-to-call-async-function-from-sync-funcion-and-get-result-while-a-loop-is-alr +# We will use another thread wih its own event loop to run the async api within sync function +_loop = asyncio.new_event_loop() +_thr = threading.Thread(target=_loop.run_forever, name="Async Runner", daemon=True) + + +class EleutherEvalsWrapper(LM): + def __init__( + self, + inference_api: Inference, + model: str, + **kwargs, + ): + super().__init__(**kwargs) + self.inference_api = inference_api + self.model = model + self.tokenizer = None + self.tokenized_requests = False + self.kwargs = kwargs + + @property + def eot_token_id(self): + raise NotImplementedError("Not implemented") + + @property + def max_length(self) -> int: + return NotImplementedError("Not implemented") + + @property + def max_gen_toks(self) -> int: + return NotImplementedError("Not implemented") + + @property + def batch_size(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError("No support for logits.") + + @property + def device(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError("No support for logits.") + + @property + def world_size(self): + return 1 + + def tok_encode(self, string: str) -> List[int]: + return NotImplementedError("Not implemented") + + def tok_decode(self, tokens: List[int]) -> str: + return NotImplementedError("Not implemented") + + def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("No support for logits.") + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override generate_until + raise NotImplementedError() + + def loglikelihood(self, requests, disable_tqdm: bool = False): + # TODO: implement inference completion with loglikelihood + res = [] + for req in requests: + res.append((-random.random(), False)) + + return res + + def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("No support for logits.") + + def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: + res = [] + if not _thr.is_alive(): + _thr.start() + for req in tqdm.tqdm(requests): + chat_completion_coro_fn = self.inference_api.chat_completion( + model=self.model, + messages=[ + { + "role": "user", + "content": req.args[0], + } + ], + stream=False, + ) + future = asyncio.run_coroutine_threadsafe(chat_completion_coro_fn, _loop) + response = future.result() + res.append(response.completion_message.content) + + return res + + +class EleutherEvalsAdapter(Evals): + def __init__(self, config: EleutherEvalsImplConfig, inference_api: Inference): + self.inference_api = inference_api + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def run_evals( + self, + model: str, + task: str, + dataset: Optional[str] = None, + eval_task_config: Optional[EvaluateTaskConfig] = None, + ) -> 
EvaluateResponse: + cprint(f"Eleuther Evals: {model} {dataset} {task}", "red") + + eluther_wrapper = EleutherEvalsWrapper(self.inference_api, model) + current_dir = Path(os.path.dirname(os.path.abspath(__file__))) + + # custom registry of harness tasks + task_manager = TaskManager( + include_path=str(current_dir / "tasks"), + ) + + task_dict = get_task_dict(task, task_manager) + cprint(task_dict, "blue") + + task_types = set([t.task.OUTPUT_TYPE for t in get_task_list(task_dict)]) + cprint(task_types, "cyan") + + output = evaluate( + eluther_wrapper, + task_dict, + limit=eval_task_config.n_samples, + ) + + formatted_output = lm_eval.utils.make_table(output) + + cprint(formatted_output, "green") + + return EvaluateResponse( + metrics={ + "metrics_table": formatted_output, + }, + ) diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml new file mode 100644 index 0000000000..e10277a314 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml @@ -0,0 +1,32 @@ +task: meta_ifeval +dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals +dataset_name: Llama-3.1-8B-Instruct-evals__ifeval__strict__details +output_type: generate_until +test_split: latest +process_docs: !function utils.process_docs +num_fewshot: 0 +doc_to_text: prompt +doc_to_target: 0 +generation_kwargs: + until: [] + do_sample: false + temperature: 0.0 + max_gen_toks: 1280 +process_results: !function utils.process_results +metric_list: + - metric: prompt_level_strict_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_strict_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true + - metric: prompt_level_loose_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_loose_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true +metadata: + version: 2.0 +fewshot_config: + sampler: first_n diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py new file mode 100644 index 0000000000..aa171343fd --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py @@ -0,0 +1,191 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
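For readers who want to exercise the Eleuther-backed provider above end to end, a hedged sketch that mirrors the commented-out example in the client added earlier in this patch; the endpoint, port, and model name are placeholders and assume a running llama-stack server with this provider configured.

# Mirrors the commented-out Eleuther example in llama_stack/apis/evals/client.py.
import asyncio

from llama_stack.apis.evals.client import EvaluationClient
from llama_stack.apis.evals.evals import EvaluateTaskConfig


async def main() -> None:
    client = EvaluationClient("http://localhost:5000")  # placeholder endpoint
    response = await client.run_evals(
        model="Llama3.1-8B-Instruct",
        task="meta_mmlu_pro_instruct",   # or "meta_ifeval"; both ship as YAML tasks here
        eval_task_config=EvaluateTaskConfig(n_samples=2),
    )
    # The Eleuther adapter returns lm-eval's formatted results table as a metric.
    print(response.metrics["metrics_table"])


asyncio.run(main())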
+ +import dataclasses +from typing import Dict, Optional, Union + +import datasets + +from lm_eval.tasks.ifeval import instructions_registry + + +@dataclasses.dataclass +class InputExample: + key: int + instruction_id_list: list[str] + prompt: str + kwargs: list[Dict[str, Optional[Union[str, int]]]] + + +@dataclasses.dataclass +class OutputExample: + instruction_id_list: list[str] + prompt: str + response: str + follow_all_instructions: bool + follow_instruction_list: list[bool] + + +def test_instruction_following_strict( + inp, + response, +): + """Tests response to see if instructions are followed.""" + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. + kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + if response.strip() and instruction.check_following(response): + is_following_list.append(True) + else: + is_following_list.append(False) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def test_instruction_following_loose( + inp, + response, +): + """Tests response for an upper bound for following instructions.""" + r = response.split("\n") + response_remove_first = "\n".join(r[1:]).strip() + response_remove_last = "\n".join(r[:-1]).strip() + response_remove_both = "\n".join(r[1:-1]).strip() + revised_response = response.replace("*", "") + revised_response_remove_first = response_remove_first.replace("*", "") + revised_response_remove_last = response_remove_last.replace("*", "") + revised_response_remove_both = response_remove_both.replace("*", "") + all_responses = [ + response, + revised_response, + response_remove_first, + response_remove_last, + response_remove_both, + revised_response_remove_first, + revised_response_remove_last, + revised_response_remove_both, + ] + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. 
+ kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + is_following = False + for r in all_responses: + if r.strip() and instruction.check_following(r): + is_following = True + break + + is_following_list.append(is_following) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def process_results(doc, results): + new_kwargs = [] + for item in doc["kwargs"]: + if item["nth_paragraph"]: + item["nth_paragraph"] = int(item["nth_paragraph"]) + new_kwargs.append(item) + inp = InputExample( + key=doc["key"], + instruction_id_list=doc["instruction_id_list"], + prompt=doc["prompt"], + kwargs=new_kwargs, + ) + response = results[0] + + out_strict = test_instruction_following_strict(inp, response) + out_loose = test_instruction_following_loose(inp, response) + + return { + "prompt_level_strict_acc": out_strict.follow_all_instructions, + "inst_level_strict_acc": out_strict.follow_instruction_list, + "prompt_level_loose_acc": out_loose.follow_all_instructions, + "inst_level_loose_acc": out_loose.follow_instruction_list, + } + + +def agg_inst_level_acc(items): + flat_items = [item for sublist in items for item in sublist] + inst_level_acc = sum(flat_items) / len(flat_items) + return inst_level_acc + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _get_question(example: dict) -> dict: + # get the question from the ifeval dataset + example["input_question"] = ( + eval( + example["input_question"] + .replace("null", "None") + .replace("true", "True") + .replace("false", "False") + )["dialog"][0]["body"] + .replace("Is it True that the first song", "Is it true that the first song") + .replace("Is the following True", "Is the following true") + ) + example["input_final_prompts"] = example["input_final_prompts"][0] + return example + + original_dataset_name = "wis-k/instruction-following-eval" + ifeval_data = datasets.load_dataset(original_dataset_name, split="train") + ifeval_df = ifeval_data.to_pandas() + ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"}) + + meta_dataset = dataset.map(_get_question) + meta_df = meta_dataset.to_pandas() + + # join the two datasets on the input_question column + joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question") + joined = joined.rename(columns={"input_final_prompts": "prompt"}) + joined = joined.rename(columns={"is_correct": "previous_is_correct"}) + joined = datasets.Dataset.from_pandas(joined) + joined = joined.select_columns( + [ + "input_question", + "prompt", + "previous_is_correct", + "instruction_id_list", + "kwargs", + "output_prediction_text", + "key", + ] + ) + joined.rename_column("output_prediction_text", "previous_output_prediction_text") + return joined diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml new file mode 100644 index 0000000000..1ec3c107d8 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml @@ -0,0 +1,29 @@ +task: meta_mmlu_pro_instruct +dataset_path: 
meta-llama/Llama-3.1-8B-Instruct-evals +dataset_name: Llama-3.1-8B-Instruct-evals__mmlu_pro__details +test_split: latest +output_type: generate_until +process_docs: !function utils.process_docs +doc_to_text: !function utils.doc_to_text +doc_to_target: gold +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: 'best answer is ([A-Z])' + - function: "take_first" +generation_kwargs: + until: [] + do_sample: false + temperature: 0 + max_gen_toks: 1024 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py new file mode 100644 index 0000000000..6b8bc3e5b2 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import datasets + + +def doc_to_text(doc: dict) -> str: + return doc["input_final_prompts"][0] + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "gold": doc["input_correct_responses"][0], + } + return out_doc + + dataset = dataset.select_columns( + [ + "input_question", + "input_correct_responses", + "input_final_prompts", + "is_correct", + "input_question_hash", + "input_choice_list", + "output_prediction_text", + ], + ) + dataset = dataset.rename_column("is_correct", "previously_is_correct") + dataset = dataset.map(_process_doc) + return dataset diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py new file mode 100644 index 0000000000..8693ec603a --- /dev/null +++ b/llama_stack/providers/registry/evals.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import List + +from llama_stack.distribution.datatypes import * # noqa: F403 + + +def available_providers() -> List[ProviderSpec]: + return [ + InlineProviderSpec( + api=Api.evals, + provider_type="meta-reference", + pip_packages=[ + "matplotlib", + "pillow", + "pandas", + "scikit-learn", + "datasets", + ], + module="llama_stack.providers.impls.meta_reference.evals", + config_class="llama_stack.providers.impls.meta_reference.evals.MetaReferenceEvalsImplConfig", + api_dependencies=[ + Api.inference, + ], + ), + InlineProviderSpec( + api=Api.evals, + provider_type="eleuther", + pip_packages=[ + "lm-eval", + ], + module="llama_stack.providers.impls.third_party.evals.eleuther", + config_class="llama_stack.providers.impls.third_party.evals.eleuther.EleutherEvalsImplConfig", + api_dependencies=[ + Api.inference, + ], + ), + ] diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index 9fffc0f99a..2070649043 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -152,7 +152,7 @@ def severity(levelname: str) -> LogSeverity: elif levelname == "INFO": return LogSeverity.INFO elif levelname == "WARNING": - return LogSeverity.WARNING + return LogSeverity.WARN elif levelname == "ERROR": return LogSeverity.ERROR elif levelname == "CRITICAL": diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml index e12f6e8528..1422d6ee20 100644 --- a/tests/examples/local-run.yaml +++ b/tests/examples/local-run.yaml @@ -11,7 +11,12 @@ apis: - memory_banks - inference - safety +- evals providers: + evals: + - provider_id: meta-reference + provider_type: meta-reference + config: {} inference: - provider_id: meta-reference provider_type: meta-reference From c8de439d9fc7303e704fa88002457a08dfb0674d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 10 Oct 2024 11:38:37 -0700 Subject: [PATCH 02/27] clean --- llama_stack/apis/dataset/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py index ba2cb88110..8ab135b6a5 100644 --- a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/dataset/dataset.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# from enum import Enum from typing import Any, Dict, Optional, Protocol from llama_models.llama3.api.datatypes import URL From 99ed1425fc4db16973fc6224e22caeeb9f2b19dc Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 10 Oct 2024 17:19:18 -0700 Subject: [PATCH 03/27] add dataset datatypes --- llama_stack/apis/dataset/dataset.py | 95 +++++++++++++++---- llama_stack/apis/evals/evals.py | 1 + .../registry/datasets/__init__.py | 28 +++--- .../distribution/registry/datasets/dataset.py | 94 +++++++++++------- .../registry/datasets/dataset_registry.py | 2 +- 5 files changed, 154 insertions(+), 66 deletions(-) diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py index 8ab135b6a5..164e16be44 100644 --- a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/dataset/dataset.py @@ -4,46 +4,105 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Any, Dict, Optional, Protocol - -from llama_models.llama3.api.datatypes import URL +from abc import ABC, abstractmethod +from enum import Enum +from typing import Any, Dict, Generic, Iterator, Literal, Protocol, TypeVar, Union from llama_models.schema_utils import json_schema_type, webmethod -from pydantic import BaseModel +from pydantic import BaseModel, Field +from typing_extensions import Annotated + +TDatasetRow = TypeVar("TDatasetRow") @json_schema_type -class TrainEvalDataset(BaseModel): - """Dataset to be used for training or evaluating language models.""" +class DatasetRow(BaseModel): ... + - # unique identifier associated with the dataset - dataset_id: str - content_url: URL - metadata: Optional[Dict[str, Any]] = None +@json_schema_type +class DictSample(DatasetRow): + data: Dict[str, Any] @json_schema_type -class CreateDatasetRequest(BaseModel): - """Request to create a dataset.""" +class Generation(BaseModel): ... + - uuid: str - dataset: TrainEvalDataset +@json_schema_type +class DatasetType(Enum): + custom = "custom" + huggingface = "huggingface" + + +@json_schema_type +class HuggingfaceDatasetDef(BaseModel): + type: Literal[DatasetType.huggingface.value] = DatasetType.huggingface.value + identifier: str = Field( + description="A unique name for the dataset", + ) + dataset_name: str = Field( + description="The name of the dataset into HF (e.g. hellawag)", + ) + kwargs: Dict[str, Any] = Field( + description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)", + default_factory=dict, + ) + + +@json_schema_type +class CustomDatasetDef(BaseModel): + type: Literal[DatasetType.custom.value] = DatasetType.custom.value + identifier: str = Field( + description="A unique name for the dataset", + ) + url: str = Field( + description="The URL to the dataset", + ) + + +DatasetDef = Annotated[ + Union[ + HuggingfaceDatasetDef, + CustomDatasetDef, + ], + Field(discriminator="type"), +] + + +class BaseDataset(ABC, Generic[TDatasetRow]): + def __init__(self) -> None: + self.type: str = self.__class__.__name__ + + @abstractmethod + def __iter__(self) -> Iterator[TDatasetRow]: + raise NotImplementedError() + + @abstractmethod + def load(self) -> None: + raise NotImplementedError() + + @abstractmethod + def __str__(self) -> str: + raise NotImplementedError() + + @abstractmethod + def __len__(self) -> int: + raise NotImplementedError() class Datasets(Protocol): @webmethod(route="/datasets/create") def create_dataset( self, - uuid: str, - dataset: TrainEvalDataset, + dataset: DatasetDef, ) -> None: ... @webmethod(route="/datasets/get") def get_dataset( self, - dataset_uuid: str, - ) -> TrainEvalDataset: ... + dataset_identifier: str, + ) -> DatasetDef: ... @webmethod(route="/datasets/delete") def delete_dataset( diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index dbb1348a53..629e68d32b 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -33,6 +33,7 @@ class EvaluateTaskConfig(BaseModel): class EvaluateResponse(BaseModel): """Scores for evaluation.""" + preprocess_output: GenerationOutput metrics: Dict[str, str] diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py index 0b7a843953..3a60d6a5e7 100644 --- a/llama_stack/distribution/registry/datasets/__init__.py +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -5,19 +5,19 @@ # the root directory of this source tree. 
# TODO: make these import config based -from .dataset import CustomDataset, HFDataset -from .dataset_registry import DatasetRegistry +# from .dataset import CustomDataset, HFDataset +# from .dataset_registry import DatasetRegistry -DATASETS_REGISTRY = { - "mmlu-simple-eval-en": CustomDataset( - name="mmlu_eval", - url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", - ), - "hellaswag": HFDataset( - name="hellaswag", - url="hf://hellaswag?split=validation&trust_remote_code=True", - ), -} +# DATASETS_REGISTRY = { +# "mmlu-simple-eval-en": CustomDataset( +# name="mmlu_eval", +# url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", +# ), +# "hellaswag": HFDataset( +# name="hellaswag", +# url="hf://hellaswag?split=validation&trust_remote_code=True", +# ), +# } -for k, v in DATASETS_REGISTRY.items(): - DatasetRegistry.register(k, v) +# for k, v in DATASETS_REGISTRY.items(): +# DatasetRegistry.register(k, v) diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py index 1a16a5c51b..e3a2de3994 100644 --- a/llama_stack/distribution/registry/datasets/dataset.py +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -3,60 +3,88 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. - -from abc import ABC, abstractmethod -from urllib.parse import parse_qs, urlparse - import pandas from datasets import Dataset, load_dataset +from llama_stack.apis.dataset import * # noqa: F403 + -class BaseDataset(ABC): - def __init__(self, name: str): +class CustomDataset(BaseDataset[DictSample]): + def __init__(self, config: CustomDatasetDef) -> None: + super().__init__() + self.config = config self.dataset = None - self.dataset_id = name - self.type = self.__class__.__name__ + self.index = 0 - def __iter__(self): - return iter(self.dataset) + def __iter__(self) -> Iterator[DictSample]: + return self - @abstractmethod - def load(self): - pass + def __next__(self) -> DictSample: + if not self.dataset: + self.load() + if self.index >= len(self.dataset): + raise StopIteration + sample = DictSample(data=self.dataset[self.index]) + self.index += 1 + return sample + def __str__(self): + return f"CustomDataset({self.config})" -class CustomDataset(BaseDataset): - def __init__(self, name, url): - super().__init__(name) - self.url = url + def __len__(self): + if not self.dataset: + self.load() + return len(self.dataset) def load(self): if self.dataset: return # TODO: better support w/ data url - if self.url.endswith(".csv"): - df = pandas.read_csv(self.url) - elif self.url.endswith(".xlsx"): - df = pandas.read_excel(self.url) + if self.config.url.endswith(".csv"): + df = pandas.read_csv(self.config.url) + elif self.config.url.endswith(".xlsx"): + df = pandas.read_excel(self.config.url) self.dataset = Dataset.from_pandas(df) -class HFDataset(BaseDataset): - def __init__(self, name, url): - super().__init__(name) - self.url = url +class HuggingfaceDataset(BaseDataset[DictSample]): + def __init__(self, config: HuggingfaceDatasetDef): + super().__init__() + self.config = config + self.dataset = None + self.index = 0 + + def __iter__(self) -> Iterator[DictSample]: + return self + + def __next__(self) -> DictSample: + if not self.dataset: + self.load() + if self.index >= len(self.dataset): + raise StopIteration + sample = DictSample(data=self.dataset[self.index]) + self.index += 1 + return sample + + def __str__(self): + return 
f"HuggingfaceDataset({self.config})" + + def __len__(self): + if not self.dataset: + self.load() + return len(self.dataset) def load(self): if self.dataset: return + self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs) + # parsed = urlparse(self.url) - parsed = urlparse(self.url) - - if parsed.scheme != "hf": - raise ValueError(f"Unknown HF dataset: {self.url}") + # if parsed.scheme != "hf": + # raise ValueError(f"Unknown HF dataset: {self.url}") - query = parse_qs(parsed.query) - query = {k: v[0] for k, v in query.items()} - path = parsed.netloc - self.dataset = load_dataset(path, **query) + # query = parse_qs(parsed.query) + # query = {k: v[0] for k, v in query.items()} + # path = parsed.netloc + # self.dataset = load_dataset(path, **query) diff --git a/llama_stack/distribution/registry/datasets/dataset_registry.py b/llama_stack/distribution/registry/datasets/dataset_registry.py index 9ddaa8bb7a..8e9b22266a 100644 --- a/llama_stack/distribution/registry/datasets/dataset_registry.py +++ b/llama_stack/distribution/registry/datasets/dataset_registry.py @@ -5,7 +5,7 @@ # the root directory of this source tree. from typing import AbstractSet, Dict -from .dataset import BaseDataset +from llama_stack.apis.dataset import BaseDataset class DatasetRegistry: From 9816c9aae69803e880377ee97db517c1c0dfea0c Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 10 Oct 2024 19:56:19 -0700 Subject: [PATCH 04/27] wip add datatypes --- llama_stack/apis/dataset/dataset.py | 58 ++++++++++-- llama_stack/apis/evals/evals.py | 94 +++++++++++++++++-- .../registry/datasets/__init__.py | 34 ++++--- .../distribution/registry/datasets/dataset.py | 36 ++----- .../meta_reference/evals/tasks/mmlu_task.py | 10 +- 5 files changed, 175 insertions(+), 57 deletions(-) diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py index 164e16be44..9a4f442e52 100644 --- a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/dataset/dataset.py @@ -13,20 +13,59 @@ from pydantic import BaseModel, Field from typing_extensions import Annotated -TDatasetRow = TypeVar("TDatasetRow") +# A sample (row) from raw dataset +TDatasetSample = TypeVar("TDatasetSample") @json_schema_type -class DatasetRow(BaseModel): ... +class DatasetSample(BaseModel): ... @json_schema_type -class DictSample(DatasetRow): +class DictSample(DatasetSample): data: Dict[str, Any] @json_schema_type -class Generation(BaseModel): ... +class ProcessedDictSample(DatasetSample): + data: Dict[str, Any] + preprocessed: Dict[str, Any] + prediction: Dict[str, Any] + postprocessed: Dict[str, Any] + + +# # A sample (row) after preprocessing the raw dataset +# TPreprocessedSample = TypeVar("TPreprocessedSample") + +# @json_schema_type +# class PreprocessedSample(BaseModel): ... + +# @json_schema_type +# class InferencePreprocessedSample(PreprocessedSample): +# # TODO: either keep it generic or specific to inference API +# # messages: List[Message] +# data: Dict[str, Any] + +# # A sample (row) from model prediction output +# TPredictionSample = TypeVar("TPredictionSample") + +# @json_schema_type +# class PredictionSample(BaseModel): ... + +# @json_schema_type +# class InferencePredictionSample(PredictionSample): +# data: Dict[str, Any] + + +# # A sample (row) from post-processed output +# TPostprocessedSample = TypeVar("TPostprocessedSample") + +# @json_schema_type +# class PostprocessedSample(BaseModel): ... 
+ +# @json_schema_type +# class InferencePostprocessedSample(PredictionSample): +# data: Dict[str, Any] @json_schema_type @@ -70,16 +109,17 @@ class CustomDatasetDef(BaseModel): ] -class BaseDataset(ABC, Generic[TDatasetRow]): +class BaseDataset(ABC, Generic[TDatasetSample]): def __init__(self) -> None: self.type: str = self.__class__.__name__ + @property @abstractmethod - def __iter__(self) -> Iterator[TDatasetRow]: + def dataset_id(self) -> str: raise NotImplementedError() @abstractmethod - def load(self) -> None: + def __iter__(self) -> Iterator[TDatasetSample]: raise NotImplementedError() @abstractmethod @@ -90,6 +130,10 @@ def __str__(self) -> str: def __len__(self) -> int: raise NotImplementedError() + @abstractmethod + def load(self) -> None: + raise NotImplementedError() + class Datasets(Protocol): @webmethod(route="/datasets/create") diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 629e68d32b..53a2ff6df1 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -4,10 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Protocol +from abc import ABC, abstractmethod +from typing import Dict, Generic, List, Protocol from llama_models.schema_utils import webmethod - from pydantic import BaseModel from llama_models.llama3.api.datatypes import * # noqa: F403 @@ -22,19 +22,26 @@ class EvaluationJobLogStream(BaseModel): job_uuid: str -class EvaluateTaskConfig(BaseModel): - # num examples to evaluate, evaluate all if None - n_samples: Optional[int] = None - # model evaluation params - sampling_params: SamplingParams = SamplingParams() +@json_schema_type +class EvalResult(BaseModel): + """Evaluation result.""" + + metrics: Dict[str, str] + + +@json_schema_type +class SingleEvalResult(BaseModel): + """Single evaluation result.""" + + score_data: Dict[str, float] @json_schema_type class EvaluateResponse(BaseModel): """Scores for evaluation.""" - preprocess_output: GenerationOutput - metrics: Dict[str, str] + eval_result: EvalResult + formatted_report: Optional[str] = None @json_schema_type @@ -56,6 +63,75 @@ class EvaluationJobCreateResponse(BaseModel): job_uuid: str +@json_schema_type +class EvaluateTaskConfig(BaseModel): + # num examples to evaluate, evaluate all if None + n_samples: Optional[int] = None + # model evaluation params + sampling_params: SamplingParams = SamplingParams() + + +class BaseTask( + ABC, + Generic[ + TDatasetSample, + TPreprocessedSample, + TPredictionSample, + TPostprocessedSample, + TSingleEvalResult, + ], +): + """ + A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. + Base class for all evaluation tasks. 
Each task needs to implement the following methods: + - F1: preprocess_sample(self) + - F2: postprocess_sample(self) + - F3: score_sample(self) + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self._name = self.__class__.__name__ + + @abstractmethod + def preprocess_sample(self, sample: TDatasetSample) -> TPreprocessedSample: + raise NotImplementedError() + + @abstractmethod + def postprocess_sample(self, sample: TPredictionSample) -> TPostprocessedSample: + raise NotImplementedError() + + @abstractmethod + def score_sample( + self, sample: TPostprocessedSample, ground_truth: TPreprocessedSample + ): + raise NotImplementedError() + + @abstractmethod + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + raise NotImplementedError() + + def preprocess( + self, dataset: BaseDataset[TDatasetSample] + ) -> List[TPreprocessedSample]: + return [self.preprocess_sample(sample) for sample in self.dataset] + + def postprocess( + self, generation: List[TPredictionSample] + ) -> List[TPostprocessedSample]: + return [self.postprocess_sample(sample) for sample in generation] + + def score( + self, + postprocessed: List[TPostprocessedSample], + preprocessed_dataset: List[TPreprocessedSample], + ) -> List[TSingleEvalResult]: + return [ + self.score_sample(sample, ground_truth) + for sample, ground_truth in zip(postprocessed, self.preprocessed_dataset) + ] + + class Evals(Protocol): @webmethod(route="/evals/run") async def run_evals( diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py index 3a60d6a5e7..f0636212ae 100644 --- a/llama_stack/distribution/registry/datasets/__init__.py +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -5,19 +5,25 @@ # the root directory of this source tree. 
# TODO: make these import config based -# from .dataset import CustomDataset, HFDataset -# from .dataset_registry import DatasetRegistry +from llama_stack.apis.dataset import * # noqa: F403 +from .dataset import CustomDataset, HuggingfaceDataset +from .dataset_registry import DatasetRegistry -# DATASETS_REGISTRY = { -# "mmlu-simple-eval-en": CustomDataset( -# name="mmlu_eval", -# url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", -# ), -# "hellaswag": HFDataset( -# name="hellaswag", -# url="hf://hellaswag?split=validation&trust_remote_code=True", -# ), -# } +DATASETS_REGISTRY = [ + CustomDataset( + config=CustomDatasetDef( + identifier="mmlu-simple-eval-en", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ) + ), + HuggingfaceDataset( + config=HuggingfaceDatasetDef( + identifier="hellaswag", + dataset_name="hellaswag", + kwargs={"split": "validation", "trust_remote_code": True}, + ) + ), +] -# for k, v in DATASETS_REGISTRY.items(): -# DatasetRegistry.register(k, v) +for d in DATASETS_REGISTRY: + DatasetRegistry.register(d.dataset_id, d) diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py index e3a2de3994..87a01d311b 100644 --- a/llama_stack/distribution/registry/datasets/dataset.py +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -16,17 +16,14 @@ def __init__(self, config: CustomDatasetDef) -> None: self.dataset = None self.index = 0 - def __iter__(self) -> Iterator[DictSample]: - return self + @property + def dataset_id(self) -> str: + return self.config.identifier - def __next__(self) -> DictSample: + def __iter__(self) -> Iterator[DictSample]: if not self.dataset: self.load() - if self.index >= len(self.dataset): - raise StopIteration - sample = DictSample(data=self.dataset[self.index]) - self.index += 1 - return sample + return (DictSample(data=x) for x in self.dataset) def __str__(self): return f"CustomDataset({self.config})" @@ -53,19 +50,15 @@ def __init__(self, config: HuggingfaceDatasetDef): super().__init__() self.config = config self.dataset = None - self.index = 0 - def __iter__(self) -> Iterator[DictSample]: - return self + @property + def dataset_id(self) -> str: + return self.config.identifier - def __next__(self) -> DictSample: + def __iter__(self) -> Iterator[DictSample]: if not self.dataset: self.load() - if self.index >= len(self.dataset): - raise StopIteration - sample = DictSample(data=self.dataset[self.index]) - self.index += 1 - return sample + return (DictSample(data=x) for x in self.dataset) def __str__(self): return f"HuggingfaceDataset({self.config})" @@ -79,12 +72,3 @@ def load(self): if self.dataset: return self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs) - # parsed = urlparse(self.url) - - # if parsed.scheme != "hf": - # raise ValueError(f"Unknown HF dataset: {self.url}") - - # query = parse_qs(parsed.query) - # query = {k: v[0] for k, v in query.items()} - # path = parsed.netloc - # self.dataset = load_dataset(path, **query) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py index 673a953791..c5c9d97563 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py @@ -111,7 +111,14 @@ def normalize_extracted_answer(extracted_answer: str) -> str: ) -class MMLUTask(BaseTask): +class MMLUTask( + BaseTask[ + 
DictSample, + InferencePreprocessedSample, + InferencePredictionSample, + InferencePostprocessedSample, + ] +): """ MMLU Task. """ @@ -120,6 +127,7 @@ def __init__(self, dataset, *args, **kwargs): super().__init__(dataset, *args, **kwargs) def preprocess_sample(self, sample): + print(sample) content = QUERY_TEMPLATE_MULTICHOICE.format(**sample) return { "role": "user", From ad18dc94acd7ae1713a26432962fc12f8b168d6d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 10 Oct 2024 21:33:13 -0700 Subject: [PATCH 05/27] add data structure to tasks --- llama_stack/apis/dataset/dataset.py | 49 +++--------- llama_stack/apis/evals/evals.py | 40 +++------- .../distribution/registry/tasks/task.py | 49 ------------ .../registry/tasks/task_registry.py | 2 +- .../impls/meta_reference/evals/evals.py | 35 +++++---- .../meta_reference/evals/tasks/mmlu_task.py | 75 +++++++++++-------- tests/examples/local-run.yaml | 18 +++-- 7 files changed, 100 insertions(+), 168 deletions(-) delete mode 100644 llama_stack/distribution/registry/tasks/task.py diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py index 9a4f442e52..ed21c429fd 100644 --- a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/dataset/dataset.py @@ -9,11 +9,12 @@ from typing import Any, Dict, Generic, Iterator, Literal, Protocol, TypeVar, Union from llama_models.schema_utils import json_schema_type, webmethod +from llama_models.llama3.api.datatypes import * # noqa: F403 from pydantic import BaseModel, Field from typing_extensions import Annotated -# A sample (row) from raw dataset +# A sample (row) from dataset TDatasetSample = TypeVar("TDatasetSample") @@ -26,46 +27,20 @@ class DictSample(DatasetSample): data: Dict[str, Any] -@json_schema_type -class ProcessedDictSample(DatasetSample): - data: Dict[str, Any] - preprocessed: Dict[str, Any] - prediction: Dict[str, Any] - postprocessed: Dict[str, Any] - - -# # A sample (row) after preprocessing the raw dataset -# TPreprocessedSample = TypeVar("TPreprocessedSample") - -# @json_schema_type -# class PreprocessedSample(BaseModel): ... - -# @json_schema_type -# class InferencePreprocessedSample(PreprocessedSample): -# # TODO: either keep it generic or specific to inference API -# # messages: List[Message] -# data: Dict[str, Any] +# A sample (row) from evals intermediate dataset +TProcessedSample = TypeVar("TProcessedSample") -# # A sample (row) from model prediction output -# TPredictionSample = TypeVar("TPredictionSample") -# @json_schema_type -# class PredictionSample(BaseModel): ... - -# @json_schema_type -# class InferencePredictionSample(PredictionSample): -# data: Dict[str, Any] - - -# # A sample (row) from post-processed output -# TPostprocessedSample = TypeVar("TPostprocessedSample") +@json_schema_type +class PredictionSample(BaseModel): + completion_message: str -# @json_schema_type -# class PostprocessedSample(BaseModel): ... 
-# @json_schema_type -# class InferencePostprocessedSample(PredictionSample): -# data: Dict[str, Any] +@json_schema_type +class ProcessedDictSample(DictSample): + preprocessed: Optional[Dict[str, Any]] = None + prediction: Optional[PredictionSample] = None + postprocessed: Optional[Dict[str, Any]] = None @json_schema_type diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 53a2ff6df1..6fe85408e4 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -71,16 +71,7 @@ class EvaluateTaskConfig(BaseModel): sampling_params: SamplingParams = SamplingParams() -class BaseTask( - ABC, - Generic[ - TDatasetSample, - TPreprocessedSample, - TPredictionSample, - TPostprocessedSample, - TSingleEvalResult, - ], -): +class BaseTask(ABC, Generic[TDatasetSample, TProcessedSample]): """ A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. Base class for all evaluation tasks. Each task needs to implement the following methods: @@ -94,17 +85,15 @@ def __init__(self, *args, **kwargs) -> None: self._name = self.__class__.__name__ @abstractmethod - def preprocess_sample(self, sample: TDatasetSample) -> TPreprocessedSample: + def preprocess_sample(self, sample: TDatasetSample) -> TProcessedSample: raise NotImplementedError() @abstractmethod - def postprocess_sample(self, sample: TPredictionSample) -> TPostprocessedSample: + def postprocess_sample(self, sample: TProcessedSample) -> TProcessedSample: raise NotImplementedError() @abstractmethod - def score_sample( - self, sample: TPostprocessedSample, ground_truth: TPreprocessedSample - ): + def score_sample(self, sample: TProcessedSample) -> SingleEvalResult: raise NotImplementedError() @abstractmethod @@ -112,24 +101,15 @@ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: raise NotImplementedError() def preprocess( - self, dataset: BaseDataset[TDatasetSample] - ) -> List[TPreprocessedSample]: - return [self.preprocess_sample(sample) for sample in self.dataset] + self, dataset: BaseDataset[TProcessedSample] + ) -> List[TProcessedSample]: + return [self.preprocess_sample(sample) for sample in dataset] - def postprocess( - self, generation: List[TPredictionSample] - ) -> List[TPostprocessedSample]: + def postprocess(self, generation: List[TProcessedSample]) -> List[TProcessedSample]: return [self.postprocess_sample(sample) for sample in generation] - def score( - self, - postprocessed: List[TPostprocessedSample], - preprocessed_dataset: List[TPreprocessedSample], - ) -> List[TSingleEvalResult]: - return [ - self.score_sample(sample, ground_truth) - for sample, ground_truth in zip(postprocessed, self.preprocessed_dataset) - ] + def score(self, postprocessed: List[TProcessedSample]) -> List[SingleEvalResult]: + return [self.score_sample(sample) for sample in postprocessed] class Evals(Protocol): diff --git a/llama_stack/distribution/registry/tasks/task.py b/llama_stack/distribution/registry/tasks/task.py deleted file mode 100644 index a92e6241b6..0000000000 --- a/llama_stack/distribution/registry/tasks/task.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
-from abc import ABC, abstractmethod - - -class BaseTask(ABC): - """ - A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. - Base class for all evaluation tasks. Each task needs to implement the following methods: - - F1: preprocess_sample(self) - - F2: postprocess_sample(self) - - F3: score_sample(self) - """ - - def __init__(self, dataset, *args, **kwargs): - super().__init__(*args, **kwargs) - self._name = self.__class__.__name__ - self.dataset = dataset - - @abstractmethod - def preprocess_sample(self, sample): - raise NotImplementedError() - - @abstractmethod - def postprocess_sample(self, sample): - raise NotImplementedError() - - @abstractmethod - def score_sample(self, sample, ground_truth): - raise NotImplementedError() - - @abstractmethod - def aggregate_results(self, eval_results): - raise NotImplementedError() - - def preprocess(self): - return [self.preprocess_sample(sample) for sample in self.dataset] - - def postprocess(self, generation): - return [self.postprocess_sample(sample) for sample in generation] - - def score(self, postprocessed): - return [ - self.score_sample(sample, ground_truth) - for sample, ground_truth in zip(postprocessed, self.dataset) - ] diff --git a/llama_stack/distribution/registry/tasks/task_registry.py b/llama_stack/distribution/registry/tasks/task_registry.py index 063894e482..df25686ba6 100644 --- a/llama_stack/distribution/registry/tasks/task_registry.py +++ b/llama_stack/distribution/registry/tasks/task_registry.py @@ -5,7 +5,7 @@ # the root directory of this source tree. from typing import AbstractSet, Dict -from .task import BaseTask +from llama_stack.apis.evals import BaseTask class TaskRegistry: diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 5f475c5395..d7214663ef 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -6,6 +6,8 @@ from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.dataset import * # noqa: F403 + from termcolor import cprint from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry @@ -42,30 +44,37 @@ async def run_evals( dataset = DatasetRegistry.get_dataset(dataset) dataset.load() - task_impl = TaskRegistry.get_task(task)(dataset) - x1 = task_impl.preprocess() + task_impl = TaskRegistry.get_task(task)() + preprocessed = task_impl.preprocess(dataset) # TODO: replace w/ batch inference & async return eval job generation_outputs = [] if eval_task_config is None: - eval_task_config = EvaluateTaskConfig(n_samples=len(x1)) - if eval_task_config.n_samples is None or eval_task_config.n_samples > len(x1): - eval_task_config.n_samples = len(x1) + eval_task_config = EvaluateTaskConfig(n_samples=len(preprocessed)) + if eval_task_config.n_samples is None or eval_task_config.n_samples > len( + preprocessed + ): + eval_task_config.n_samples = len(preprocessed) print( f"Eval generation start, generate on {eval_task_config.n_samples} samples" ) - for msg in x1[: eval_task_config.n_samples]: - print("generation for msg: ", msg) + for sample in preprocessed[: eval_task_config.n_samples]: + print("generation: ", sample) response = await self.inference_api.chat_completion( model=model, - messages=[msg], + messages=sample.preprocessed["messages"], stream=False, ) - 
generation_outputs.append(response.completion_message.content) + sample.prediction = PredictionSample( + completion_message=response.completion_message.content + ) + generation_outputs.append(sample) - x2 = task_impl.postprocess(generation_outputs) - eval_results = task_impl.score(x2) - eval_response = task_impl.aggregate_results(eval_results) - return eval_response + postprocessed = task_impl.postprocess(generation_outputs) + eval_results = task_impl.score(postprocessed) + aggr_result = task_impl.aggregate_results(eval_results) + return EvaluateResponse( + eval_result=aggr_result, + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py index c5c9d97563..e3d9e4ef3c 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py @@ -6,7 +6,8 @@ import re from llama_stack.apis.evals import * # noqa: F403 -from llama_stack.distribution.registry.tasks.task import BaseTask + +# from llama_stack.distribution.registry.tasks.task import BaseTask QUERY_TEMPLATE_MULTICHOICE = """ Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. @@ -111,48 +112,60 @@ def normalize_extracted_answer(extracted_answer: str) -> str: ) -class MMLUTask( - BaseTask[ - DictSample, - InferencePreprocessedSample, - InferencePredictionSample, - InferencePostprocessedSample, - ] -): +class MMLUTask(BaseTask[DictSample, ProcessedDictSample]): """ MMLU Task. """ - def __init__(self, dataset, *args, **kwargs): - super().__init__(dataset, *args, **kwargs) - - def preprocess_sample(self, sample): - print(sample) - content = QUERY_TEMPLATE_MULTICHOICE.format(**sample) - return { - "role": "user", - "content": content, + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def preprocess_sample(self, sample: ProcessedDictSample) -> ProcessedDictSample: + content = QUERY_TEMPLATE_MULTICHOICE.format(**sample.data) + preprocessed = { + "messages": [ + { + "role": "user", + "content": content, + } + ], } + processed_sample = ProcessedDictSample( + data=sample.data, + preprocessed=preprocessed, + ) + return processed_sample + + def postprocess_sample(self, sample: ProcessedDictSample) -> ProcessedDictSample: + if not sample.postprocessed: + sample.postprocessed = {} + sample.postprocessed["postprocessed"] = normalize_response( + sample.prediction.completion_message + ) + return sample - def postprocess_sample(self, sample): - normalized = normalize_response(sample) - return normalized + def score_sample(self, sample: ProcessedDictSample) -> SingleEvalResult: + postprocessed_output = sample.postprocessed["postprocessed"] + expected_answer = sample.data["Answer"] - def score_sample(self, sample, expected): extracted_answer = None for answer_regex in MULTILINGUAL_ANSWER_REGEXES: regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) - match = re.search(regex, sample) + match = re.search(regex, postprocessed_output) if match: extracted_answer = normalize_extracted_answer(match.group(1)) break - score = ( - 1.0 if extracted_answer and extracted_answer == expected["Answer"] else 0.0 - ) - # TODO: generalize this into SingleEvalResult - return score - def aggregate_results(self, eval_results): - return EvaluateResponse( - metrics={"score": str(sum(eval_results) / 
len(eval_results))} + score = 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 + + return SingleEvalResult( + score_data={ + "score": score, + }, ) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + print("aggregate_results", eval_results) + sum_score = sum([result.score_data["score"] for result in eval_results]) + + return EvalResult(metrics={"score": str(sum_score / len(eval_results))}) diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml index 1422d6ee20..3c9f73e0b1 100644 --- a/tests/examples/local-run.yaml +++ b/tests/examples/local-run.yaml @@ -18,14 +18,18 @@ providers: provider_type: meta-reference config: {} inference: - - provider_id: meta-reference - provider_type: meta-reference + - provider_id: remote::tgi + provider_type: remote::tgi config: - model: Llama3.1-8B-Instruct - quantization: null - torch_seed: null - max_seq_len: 4096 - max_batch_size: 1 + url: http://127.0.0.1:5009 + # - provider_id: meta-reference + # provider_type: meta-reference + # config: + # model: Llama3.1-8B-Instruct + # quantization: null + # torch_seed: null + # max_seq_len: 4096 + # max_batch_size: 1 safety: - provider_id: meta-reference provider_type: meta-reference From fb565dfb066addeacad8b31386c81d0c24787b2c Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Fri, 11 Oct 2024 09:30:10 -0700 Subject: [PATCH 06/27] eleuther eval fix --- llama_stack/apis/evals/client.py | 32 ++++++++++--------- .../third_party/evals/eleuther/eleuther.py | 8 +++-- tests/examples/local-run.yaml | 4 +-- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index ad4a471455..bde78adc94 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -54,27 +54,29 @@ async def run_evals( async def run_main(host: str, port: int): client = EvaluationClient(f"http://{host}:{port}") - # CustomDataset + # Custom Eval Task + # response = await client.run_evals( + # model="Llama3.1-8B-Instruct", + # dataset="mmlu-simple-eval-en", + # task="mmlu", + # eval_task_config=EvaluateTaskConfig( + # n_samples=2, + # ), + # ) + + # Eleuther Eval Task response = await client.run_evals( model="Llama3.1-8B-Instruct", - dataset="mmlu-simple-eval-en", - task="mmlu", + # task="meta_mmlu_pro_instruct", + task="meta_ifeval", eval_task_config=EvaluateTaskConfig( n_samples=2, ), ) - cprint(f"evaluate response={response}", "green") - - # Eleuther Eval Task - # response = await client.run_evals( - # model="Llama3.1-8B-Instruct", - # task="meta_mmlu_pro_instruct", - # # task="meta_ifeval", - # eval_task_config=EvaluateTaskConfig( - # n_samples=2, - # ) - # ) - # cprint(response.metrics["metrics_table"], "red") + if response.formatted_report: + cprint(response.formatted_report, "green") + else: + cprint(f"evaluate response={response}", "green") def main(host: str, port: int): diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py index b9f9505e93..e4b32a45e0 100644 --- a/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py +++ b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py @@ -157,12 +157,14 @@ async def run_evals( limit=eval_task_config.n_samples, ) + eval_result = EvalResult( + metrics={}, + ) formatted_output = lm_eval.utils.make_table(output) cprint(formatted_output, "green") return EvaluateResponse( - metrics={ - "metrics_table": formatted_output, - 
}, + eval_result=eval_result, + formatted_report=formatted_output, ) diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml index 3c9f73e0b1..430ce61020 100644 --- a/tests/examples/local-run.yaml +++ b/tests/examples/local-run.yaml @@ -14,8 +14,8 @@ apis: - evals providers: evals: - - provider_id: meta-reference - provider_type: meta-reference + - provider_id: eleuther + provider_type: eleuther config: {} inference: - provider_id: remote::tgi From a25aff290ef5103c40566f3fb925f6efbe20bccf Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Sun, 13 Oct 2024 23:27:02 -0700 Subject: [PATCH 07/27] generator + scorer Api for MMLU --- llama_stack/apis/dataset/dataset.py | 47 +++- llama_stack/apis/evals/client.py | 25 +- llama_stack/apis/evals/evals.py | 226 +++++++++++++++--- .../distribution/registry/datasets/dataset.py | 10 +- .../distribution/registry/scorers/__init__.py | 6 + .../registry/scorers/scorer_registry.py | 32 +++ .../distribution/registry/tasks/__init__.py | 8 - .../impls/meta_reference/evals/evals.py | 166 ++++++++++--- .../evals/processor/__init__.py | 5 + .../mmlu_processor.py} | 94 +++++--- .../meta_reference/evals/scorer/__init__.py | 5 + .../evals/scorer/basic_scorers.py | 78 ++++++ .../evals/tasks/run_eval_task.py | 39 +++ tests/examples/local-run.yaml | 4 +- 14 files changed, 616 insertions(+), 129 deletions(-) create mode 100644 llama_stack/distribution/registry/scorers/__init__.py create mode 100644 llama_stack/distribution/registry/scorers/scorer_registry.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/processor/__init__.py rename llama_stack/providers/impls/meta_reference/evals/{tasks/mmlu_task.py => processor/mmlu_processor.py} (60%) create mode 100644 llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py index ed21c429fd..798f3aba99 100644 --- a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/dataset/dataset.py @@ -14,6 +14,25 @@ from pydantic import BaseModel, Field from typing_extensions import Annotated + +@json_schema_type +class GenerationInput(BaseModel): + messages: List[Message] + + +@json_schema_type +class GenerationOutput(BaseModel): + completion_message: str + logprobs: Optional[List[TokenLogProbs]] = None + + +@json_schema_type +class PostprocessedGeneration(BaseModel): + completion_message: str + # structured transformed output from raw_completion_message to compute scorer metrics + transformed_generation: Optional[Any] = None + + # A sample (row) from dataset TDatasetSample = TypeVar("TDatasetSample") @@ -27,20 +46,32 @@ class DictSample(DatasetSample): data: Dict[str, Any] -# A sample (row) from evals intermediate dataset -TProcessedSample = TypeVar("TProcessedSample") +# A sample (row) from evals intermediate dataset after preprocessing +TPreprocessedSample = TypeVar("TPreprocessedSample") @json_schema_type -class PredictionSample(BaseModel): - completion_message: str +class PreprocessedSample(DatasetSample): + generation_input: GenerationInput + + +# A sample (row) from evals intermediate dataset after inference +TGenerationResponseSample = TypeVar("TGenerationResponseSample") + + +@json_schema_type +class GenerationResponseSample(DatasetSample): + generation_output: GenerationOutput + + +# A sample (row) for prepared evals 
dataset ready for scoring +TScorerInputSample = TypeVar("TScorerInputSample") @json_schema_type -class ProcessedDictSample(DictSample): - preprocessed: Optional[Dict[str, Any]] = None - prediction: Optional[PredictionSample] = None - postprocessed: Optional[Dict[str, Any]] = None +class ScorerInputSample(DatasetSample): + generation_output: PostprocessedGeneration + expected_output: Union[str, List[str]] @json_schema_type diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index bde78adc94..b4d1c39fe7 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -33,7 +33,7 @@ async def run_evals( ) -> EvaluateResponse: async with httpx.AsyncClient() as client: response = await client.post( - f"{self.base_url}/evals/run", + f"{self.base_url}/evals/run_eval_task", json={ "model": model, "task": task, @@ -55,28 +55,25 @@ async def run_main(host: str, port: int): client = EvaluationClient(f"http://{host}:{port}") # Custom Eval Task + response = await client.run_evals( + model="Llama3.1-8B-Instruct", + dataset="mmlu-simple-eval-en", + task="mmlu", + ) + + # Eleuther Eval Task # response = await client.run_evals( # model="Llama3.1-8B-Instruct", - # dataset="mmlu-simple-eval-en", - # task="mmlu", + # # task="meta_mmlu_pro_instruct", + # task="meta_ifeval", # eval_task_config=EvaluateTaskConfig( # n_samples=2, # ), # ) - - # Eleuther Eval Task - response = await client.run_evals( - model="Llama3.1-8B-Instruct", - # task="meta_mmlu_pro_instruct", - task="meta_ifeval", - eval_task_config=EvaluateTaskConfig( - n_samples=2, - ), - ) if response.formatted_report: cprint(response.formatted_report, "green") else: - cprint(f"evaluate response={response}", "green") + cprint(f"Response: {response}", "green") def main(host: str, port: int): diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 6fe85408e4..92657f6b5c 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -5,7 +5,7 @@ # the root directory of this source tree. from abc import ABC, abstractmethod -from typing import Dict, Generic, List, Protocol +from typing import Dict, Generic, List, Optional, Protocol from llama_models.schema_utils import webmethod from pydantic import BaseModel @@ -24,14 +24,14 @@ class EvaluationJobLogStream(BaseModel): @json_schema_type class EvalResult(BaseModel): - """Evaluation result.""" + """Aggregated final evaluation result.""" - metrics: Dict[str, str] + metrics: Dict[str, float] @json_schema_type class SingleEvalResult(BaseModel): - """Single evaluation result.""" + """Single evaluation result. 
Contains a scorer name, and corresponding metrics from scorer.""" score_data: Dict[str, float] @@ -64,57 +64,222 @@ class EvaluationJobCreateResponse(BaseModel): @json_schema_type -class EvaluateTaskConfig(BaseModel): - # num examples to evaluate, evaluate all if None - n_samples: Optional[int] = None - # model evaluation params +class EvaluateDatasetConfig(BaseModel): + # identifier to previously registered dataset via DatasetDef + dataset_name: str + # limit number of rows to evaluate + row_limit: Optional[int] = None + kwargs: Optional[Dict[str, Any]] = None + + +@json_schema_type +class EvaluatePreprocessConfig(BaseModel): + kwargs: Optional[Dict[str, Any]] = None + + +@json_schema_type +class EvaluateModelGenerationConfig(BaseModel): + model: str sampling_params: SamplingParams = SamplingParams() + kwargs: Optional[Dict[str, Any]] = None + + +@json_schema_type +class EvaluatePostprocessConfig(BaseModel): + kwargs: Optional[Dict[str, Any]] = None + + +@json_schema_type +class EvaluateJudgeScoringConfig(BaseModel): ... + + +@json_schema_type +class LLMJudgeConfig(BaseModel): + judge_preprocess_config: EvaluatePreprocessConfig + judge_model_generation_config: EvaluateModelGenerationConfig + judge_postprocess_config: EvaluatePostprocessConfig + judge_scoring_config: EvaluateJudgeScoringConfig + + +@json_schema_type +class EvaluateSingleScorerConfig(BaseModel): + scorer_name: str + llm_judge_config: Optional[LLMJudgeConfig] = None + +@json_schema_type +class EvaluateScoringConfig(BaseModel): + # list of scorer (metrics) names to use + scorer_config_list: List[EvaluateSingleScorerConfig] -class BaseTask(ABC, Generic[TDatasetSample, TProcessedSample]): + +@json_schema_type +class EvaluateTaskConfig(BaseModel): + dataset_config: EvaluateDatasetConfig + preprocess_config: Optional[EvaluatePreprocessConfig] = None + generation_config: EvaluateModelGenerationConfig + postprocess_config: Optional[EvaluatePostprocessConfig] = None + scoring_config: EvaluateScoringConfig + + +class BaseGeneratorProcessor( + ABC, + Generic[ + TDatasetSample, + TPreprocessedSample, + TGenerationResponseSample, + TScorerInputSample, + ], +): """ - A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. - Base class for all evaluation tasks. Each task needs to implement the following methods: - - F1: preprocess_sample(self) + Base class for all generator processors. 
Each processor needs to implement the following methods: + - F1: preprocess_sample(self, dataset) - F2: postprocess_sample(self) - - F3: score_sample(self) """ def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self._name = self.__class__.__name__ + + def __str__(self) -> str: + return self.__class__.__name__ + + def preprocess( + self, dataset: BaseDataset[TDatasetSample] + ) -> List[TPreprocessedSample]: + return [self.preprocess_sample(sample) for sample in dataset] + + def postprocess( + self, + generation: List[TGenerationResponseSample], + dataset: BaseDataset[TDatasetSample], + ) -> List[TScorerInputSample]: + return [ + self.postprocess_sample(generation_sample, dataset_sample) + for generation_sample, dataset_sample in zip(generation, dataset) + ] @abstractmethod - def preprocess_sample(self, sample: TDatasetSample) -> TProcessedSample: + def preprocess_sample(self, sample: TDatasetSample) -> TPreprocessedSample: raise NotImplementedError() @abstractmethod - def postprocess_sample(self, sample: TProcessedSample) -> TProcessedSample: + def postprocess_sample( + self, + generation_sample: TGenerationResponseSample, + dataset_sample: TDatasetSample, + ) -> TScorerInputSample: raise NotImplementedError() + +class BaseGenerator(ABC, Generic[TGenerationResponseSample]): + """ + Base class for all generators. Each generator needs to implement the following methods: + - generate(self, preprocessed_dataset) + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + return self.__class__.__name__ + + @abstractmethod + def generate( + self, preprocessed_dataset: List[TPreprocessedSample] + ) -> List[TGenerationResponseSample]: + raise NotImplementedError() + + +class BaseScorer(ABC, Generic[TScorerInputSample]): + """ + Base class for all scorers. 
Each scorer needs to implement the following methods: + - score_sample(self, scorer_input_sample) + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + return self.__class__.__name__ + @abstractmethod - def score_sample(self, sample: TProcessedSample) -> SingleEvalResult: + def score_sample(self, scorer_input_sample: TScorerInputSample) -> SingleEvalResult: raise NotImplementedError() @abstractmethod def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: raise NotImplementedError() - def preprocess( - self, dataset: BaseDataset[TProcessedSample] - ) -> List[TProcessedSample]: - return [self.preprocess_sample(sample) for sample in dataset] + def score( + self, prepared_eval_dataset: List[TScorerInputSample] + ) -> List[SingleEvalResult]: + return [self.score_sample(sample) for sample in prepared_eval_dataset] - def postprocess(self, generation: List[TProcessedSample]) -> List[TProcessedSample]: - return [self.postprocess_sample(sample) for sample in generation] - def score(self, postprocessed: List[TProcessedSample]) -> List[SingleEvalResult]: - return [self.score_sample(sample) for sample in postprocessed] +class BaseTask(ABC): + def __init__( + self, + generator_processor: Optional[BaseGeneratorProcessor] = None, + generator: Optional[BaseGenerator] = None, + scorer: Optional[BaseScorer] = None, + *args, + **kwargs + ) -> None: + super().__init__(*args, **kwargs) + self.generator_processor = generator_processor + self.generator = generator + self.scorer = scorer + + @abstractmethod + def run(self, *args, **kwargs) -> EvalResult: + raise NotImplementedError() + + +# class BaseTask(ABC, Generic[TDatasetSample, TProcessedSample]): +# """ +# A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. +# Base class for all evaluation tasks. Each task needs to implement the following methods: +# - F1: preprocess_sample(self) +# - F2: postprocess_sample(self) +# - F3: score_sample(self) +# """ + +# def __init__(self, *args, **kwargs) -> None: +# super().__init__(*args, **kwargs) +# self._name = self.__class__.__name__ + +# @abstractmethod +# def preprocess_sample(self, sample: TDatasetSample) -> TProcessedSample: +# raise NotImplementedError() + +# @abstractmethod +# def postprocess_sample(self, sample: TProcessedSample) -> TProcessedSample: +# raise NotImplementedError() + +# @abstractmethod +# def score_sample(self, sample: TProcessedSample) -> SingleEvalResult: +# raise NotImplementedError() + +# @abstractmethod +# def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: +# raise NotImplementedError() + +# def preprocess( +# self, dataset: BaseDataset[TProcessedSample] +# ) -> List[TProcessedSample]: +# return [self.preprocess_sample(sample) for sample in dataset] + +# def postprocess(self, generation: List[TProcessedSample]) -> List[TProcessedSample]: +# return [self.postprocess_sample(sample) for sample in generation] + +# def score(self, postprocessed: List[TProcessedSample]) -> List[SingleEvalResult]: +# return [self.score_sample(sample) for sample in postprocessed] class Evals(Protocol): - @webmethod(route="/evals/run") - async def run_evals( + + @webmethod(route="/evals/run_eval_task") + async def run_eval_task( self, model: str, task: str, @@ -122,6 +287,13 @@ async def run_evals( eval_task_config: Optional[EvaluateTaskConfig] = None, ) -> EvaluateResponse: ... 
+ @webmethod(route="/evals/run_scorer") + async def run_scorer( + self, + dataset_config: EvaluateDatasetConfig, + eval_scoring_config: EvaluateScoringConfig, + ) -> EvaluateResponse: ... + # @webmethod(route="/evals/jobs") # def get_evaluation_jobs(self) -> List[EvaluationJob]: ... diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py index 87a01d311b..0bd86b8d49 100644 --- a/llama_stack/distribution/registry/datasets/dataset.py +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -25,23 +25,27 @@ def __iter__(self) -> Iterator[DictSample]: self.load() return (DictSample(data=x) for x in self.dataset) - def __str__(self): + def __str__(self) -> str: return f"CustomDataset({self.config})" - def __len__(self): + def __len__(self) -> int: if not self.dataset: self.load() return len(self.dataset) - def load(self): + def load(self, n_samples: Optional[int] = None) -> None: if self.dataset: return + # TODO: better support w/ data url if self.config.url.endswith(".csv"): df = pandas.read_csv(self.config.url) elif self.config.url.endswith(".xlsx"): df = pandas.read_excel(self.config.url) + if n_samples is not None: + df = df.sample(n=n_samples) + self.dataset = Dataset.from_pandas(df) diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py new file mode 100644 index 0000000000..76edd2ebd3 --- /dev/null +++ b/llama_stack/distribution/registry/scorers/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +# TODO: make these import config based diff --git a/llama_stack/distribution/registry/scorers/scorer_registry.py b/llama_stack/distribution/registry/scorers/scorer_registry.py new file mode 100644 index 0000000000..b6a382c531 --- /dev/null +++ b/llama_stack/distribution/registry/scorers/scorer_registry.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import AbstractSet, Dict + +from llama_stack.apis.evals import BaseScorer + + +class ScorerRegistry: + _REGISTRY: Dict[str, BaseScorer] = {} + + @staticmethod + def names() -> AbstractSet[str]: + return ScorerRegistry._REGISTRY.keys() + + @staticmethod + def register(name: str, scorer: BaseScorer) -> None: + if name in ScorerRegistry._REGISTRY: + raise ValueError(f"Task {name} already exists.") + ScorerRegistry._REGISTRY[name] = task + + @staticmethod + def get_scorer(name: str) -> BaseScorer: + if name not in ScorerRegistry._REGISTRY: + raise ValueError(f"Task {name} not found.") + return ScorerRegistry._REGISTRY[name] + + @staticmethod + def reset() -> None: + ScorerRegistry._REGISTRY = {} diff --git a/llama_stack/distribution/registry/tasks/__init__.py b/llama_stack/distribution/registry/tasks/__init__.py index 01ccb18aee..756f351d88 100644 --- a/llama_stack/distribution/registry/tasks/__init__.py +++ b/llama_stack/distribution/registry/tasks/__init__.py @@ -3,11 +3,3 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-# TODO: make these import config based -from llama_stack.providers.impls.meta_reference.evals.tasks.mmlu_task import MMLUTask -from .task_registry import TaskRegistry - -TaskRegistry.register( - "mmlu", - MMLUTask, -) diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index d7214663ef..0fbce823e8 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -3,16 +3,27 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import json + +from termcolor import cprint + +from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import ( + AggregateScorer, +) from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.evals import * # noqa: F403 from llama_stack.apis.dataset import * # noqa: F403 -from termcolor import cprint - from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry +from llama_stack.providers.impls.meta_reference.evals.processor.mmlu_processor import ( + MMLUProcessor, +) + +# from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry +# from .tasks.run_eval_task import RunEvalTask +from .scorer.basic_scorers import * # noqa: F403 -from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry from .config import MetaReferenceEvalsImplConfig @@ -27,7 +38,7 @@ async def initialize(self) -> None: async def shutdown(self) -> None: pass - async def run_evals( + async def run_eval_task( self, model: str, task: str, @@ -38,43 +49,142 @@ async def run_evals( f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}", "red", ) + if not dataset: raise ValueError("dataset must be specified for mete-reference evals") - dataset = DatasetRegistry.get_dataset(dataset) - dataset.load() + if not eval_task_config: + # construct eval task config from inputs + eval_task_config = EvaluateTaskConfig( + dataset_config=EvaluateDatasetConfig( + dataset_name=dataset, + row_limit=2, + ), + generation_config=EvaluateModelGenerationConfig( + model=model, + ), + scoring_config=EvaluateScoringConfig( + scorer_config_list=[ + EvaluateSingleScorerConfig(scorer_name="accuracy"), + ] + ), + ) - task_impl = TaskRegistry.get_task(task)() - preprocessed = task_impl.preprocess(dataset) + # TODO: wrap inside task + # run_task = RunEvalTask( + # eval_task_config=eval_task_config, + # ) + # eval_result = run_task.run() - # TODO: replace w/ batch inference & async return eval job - generation_outputs = [] - if eval_task_config is None: - eval_task_config = EvaluateTaskConfig(n_samples=len(preprocessed)) - if eval_task_config.n_samples is None or eval_task_config.n_samples > len( - preprocessed - ): - eval_task_config.n_samples = len(preprocessed) - - print( - f"Eval generation start, generate on {eval_task_config.n_samples} samples" + dataset = DatasetRegistry.get_dataset( + eval_task_config.dataset_config.dataset_name ) + dataset.load(n_samples=eval_task_config.dataset_config.row_limit) + print(f"Running on {len(dataset)} samples") + + # F1 + processor = MMLUProcessor() + preprocessed = processor.preprocess(dataset) - for sample in preprocessed[: eval_task_config.n_samples]: + # Generation + # TODO: wrap inside BaseGenerator + generation_outputs = [] + for sample in preprocessed: print("generation: ", sample) response = await self.inference_api.chat_completion( 
model=model, - messages=sample.preprocessed["messages"], + messages=sample.generation_input.messages, stream=False, ) - sample.prediction = PredictionSample( - completion_message=response.completion_message.content + cprint(f"response: {response}", "cyan") + + generation_outputs.append( + GenerationResponseSample( + generation_output=GenerationOutput( + completion_message=response.completion_message.content + ) + ) ) - generation_outputs.append(sample) + cprint(generation_outputs, "green") + + # F2 + postprocessed = processor.postprocess(generation_outputs, dataset) + cprint(postprocessed, "blue") + + # F3 - scorer + scorer = AggregateScorer( + scorers=[ + AccuracyScorer(), + RandomScorer(), + ] + ) + + scorer_results = scorer.score(postprocessed) + cprint(scorer_results, "magenta") + eval_result = scorer.aggregate_results(scorer_results) - postprocessed = task_impl.postprocess(generation_outputs) - eval_results = task_impl.score(postprocessed) - aggr_result = task_impl.aggregate_results(eval_results) return EvaluateResponse( - eval_result=aggr_result, + eval_result=eval_result, + formatted_report=json.dumps(eval_result.json(), indent=4), ) + + async def run_scorer( + self, + dataset_config: EvaluateDatasetConfig, + eval_scoring_config: EvaluateScoringConfig, + ) -> EvaluateResponse: + return EvaluateResponse( + eval_result={}, + ) + + # async def run_evals( + # self, + # model: str, + # task: str, + # dataset: Optional[str] = None, + # eval_task_config: Optional[EvaluateTaskConfig] = None, + # ) -> EvaluateResponse: + # cprint( + # f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}", + # "red", + # ) + # if not dataset: + # raise ValueError("dataset must be specified for mete-reference evals") + + # dataset = DatasetRegistry.get_dataset(dataset) + # dataset.load() + + # task_impl = TaskRegistry.get_task(task)() + # preprocessed = task_impl.preprocess(dataset) + + # # TODO: replace w/ batch inference & async return eval job + # generation_outputs = [] + # if eval_task_config is None: + # eval_task_config = EvaluateTaskConfig(n_samples=len(preprocessed)) + # if eval_task_config.n_samples is None or eval_task_config.n_samples > len( + # preprocessed + # ): + # eval_task_config.n_samples = len(preprocessed) + + # print( + # f"Eval generation start, generate on {eval_task_config.n_samples} samples" + # ) + + # for sample in preprocessed[: eval_task_config.n_samples]: + # print("generation: ", sample) + # response = await self.inference_api.chat_completion( + # model=model, + # messages=sample.preprocessed["messages"], + # stream=False, + # ) + # sample.prediction = PredictionSample( + # completion_message=response.completion_message.content + # ) + # generation_outputs.append(sample) + + # postprocessed = task_impl.postprocess(generation_outputs) + # eval_results = task_impl.score(postprocessed) + # aggr_result = task_impl.aggregate_results(eval_results) + # return EvaluateResponse( + # eval_result=aggr_result, + # ) diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
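As a rough usage sketch (not part of the patch itself): the new EvaluateTaskConfig introduced in llama_stack/apis/evals/evals.py can also be built explicitly on the client side, instead of relying on the defaults that run_eval_task() constructs when only model/dataset/task are supplied. Class and field names below come from the diffs above; the dataset and model identifiers are just the examples already used in this series.

from llama_stack.apis.evals.evals import (
    EvaluateDatasetConfig,
    EvaluateModelGenerationConfig,
    EvaluateScoringConfig,
    EvaluateSingleScorerConfig,
    EvaluateTaskConfig,
)

# Mirrors the server-side defaults in run_eval_task(): a registered dataset,
# a row limit, a generation model, and a list of scorer names.
eval_task_config = EvaluateTaskConfig(
    dataset_config=EvaluateDatasetConfig(
        dataset_name="mmlu-simple-eval-en",  # registered in DATASETS_REGISTRY
        row_limit=2,
    ),
    generation_config=EvaluateModelGenerationConfig(
        model="Llama3.1-8B-Instruct",
    ),
    scoring_config=EvaluateScoringConfig(
        scorer_config_list=[
            EvaluateSingleScorerConfig(scorer_name="accuracy"),
        ]
    ),
)

# Passed through EvaluationClient.run_evals(), this object is serialized into
# the "eval_task_config" field of the POST body sent to /evals/run_eval_task.
# response = await client.run_evals(
#     model="Llama3.1-8B-Instruct",
#     task="mmlu",
#     dataset="mmlu-simple-eval-en",
#     eval_task_config=eval_task_config,
# )
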
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py similarity index 60% rename from llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py rename to llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py index e3d9e4ef3c..83460bb0c5 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py @@ -7,8 +7,6 @@ from llama_stack.apis.evals import * # noqa: F403 -# from llama_stack.distribution.registry.tasks.task import BaseTask - QUERY_TEMPLATE_MULTICHOICE = """ Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. @@ -112,60 +110,78 @@ def normalize_extracted_answer(extracted_answer: str) -> str: ) -class MMLUTask(BaseTask[DictSample, ProcessedDictSample]): +class MMLUProcessor( + BaseGeneratorProcessor[ + DictSample, PreprocessedSample, GenerationResponseSample, ScorerInputSample + ] +): """ - MMLU Task. + Generator processor for MMLU """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def preprocess_sample(self, sample: ProcessedDictSample) -> ProcessedDictSample: + def preprocess_sample(self, sample: DictSample) -> PreprocessedSample: content = QUERY_TEMPLATE_MULTICHOICE.format(**sample.data) - preprocessed = { - "messages": [ - { - "role": "user", - "content": content, - } - ], - } - processed_sample = ProcessedDictSample( - data=sample.data, - preprocessed=preprocessed, + preprocessed_msgs = [ + { + "role": "user", + "content": content, + } + ] + processed_sample = PreprocessedSample( + generation_input=GenerationInput( + messages=preprocessed_msgs, + ) ) return processed_sample - def postprocess_sample(self, sample: ProcessedDictSample) -> ProcessedDictSample: - if not sample.postprocessed: - sample.postprocessed = {} - sample.postprocessed["postprocessed"] = normalize_response( - sample.prediction.completion_message - ) - return sample - - def score_sample(self, sample: ProcessedDictSample) -> SingleEvalResult: - postprocessed_output = sample.postprocessed["postprocessed"] - expected_answer = sample.data["Answer"] + def postprocess_sample( + self, generation_sample: GenerationResponseSample, dataset_sample: DictSample + ) -> ScorerInputSample: + response_text = generation_sample.generation_output.completion_message + normalized_response = normalize_response(response_text) - extracted_answer = None + # extract answer + extracted_answer = "" for answer_regex in MULTILINGUAL_ANSWER_REGEXES: regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) - match = re.search(regex, postprocessed_output) + match = re.search(regex, normalized_response) if match: extracted_answer = normalize_extracted_answer(match.group(1)) break - score = 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 - - return SingleEvalResult( - score_data={ - "score": score, - }, + return ScorerInputSample( + generation_output=PostprocessedGeneration( + completion_message=response_text, + transformed_generation=extracted_answer, + ), + expected_output=dataset_sample.data["Answer"], ) - def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: - print("aggregate_results", eval_results) - sum_score = sum([result.score_data["score"] for result in 
eval_results]) + # def score_sample(self, sample: ProcessedDictSample) -> SingleEvalResult: + # postprocessed_output = sample.postprocessed["postprocessed"] + # expected_answer = sample.data["Answer"] + + # extracted_answer = None + # for answer_regex in MULTILINGUAL_ANSWER_REGEXES: + # regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) + # match = re.search(regex, postprocessed_output) + # if match: + # extracted_answer = normalize_extracted_answer(match.group(1)) + # break + + # score = 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 + + # return SingleEvalResult( + # score_data={ + # "score": score, + # }, + # ) + + # def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + # print("aggregate_results", eval_results) + # sum_score = sum([result.score_data["score"] for result in eval_results]) - return EvalResult(metrics={"score": str(sum_score / len(eval_results))}) + # return EvalResult(metrics={"score": str(sum_score / len(eval_results))}) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py new file mode 100644 index 0000000000..ff9639ecd7 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py @@ -0,0 +1,78 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+import random + +from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult +from llama_stack.apis.dataset.dataset import * # noqa: F401 F403 + + +class AggregateScorer(BaseScorer[ScorerInputSample]): + def __init__(self, scorers: List[BaseScorer[ScorerInputSample]]): + self.scorers = scorers + + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + all_score_data = {} + for scorer in self.scorers: + score_data = scorer.score_sample(scorer_input_sample).score_data + for k, v in score_data.items(): + all_score_data[k] = v + + return SingleEvalResult( + score_data=all_score_data, + ) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + all_metrics = {} + + for scorer in self.scorers: + metrics = scorer.aggregate_results(eval_results).metrics + for k, v in metrics.items(): + all_metrics[f"{scorer.__class__.__name__}:{k}"] = v + + return EvalResult( + metrics=all_metrics, + ) + + +class RandomScorer(BaseScorer[ScorerInputSample]): + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + return SingleEvalResult(score_data={"random": random.random()}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + avg_random = sum( + [result.score_data["random"] for result in eval_results] + ) / len(eval_results) + max_random = max([result.score_data["random"] for result in eval_results]) + return EvalResult( + metrics={ + "avg_random": avg_random, + "max_random": max_random, + } + ) + + +class AccuracyScorer(BaseScorer[ScorerInputSample]): + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + extracted_answer = scorer_input_sample.generation_output.transformed_generation + expected_answer = scorer_input_sample.expected_output + + accuracy = ( + 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 + ) + + return SingleEvalResult(score_data={"accuracy": accuracy}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + num_correct = sum([result.score_data["accuracy"] for result in eval_results]) + num_total = len(eval_results) + + return EvalResult( + metrics={ + "avg_accuracy": num_correct / num_total, + "num_correct": num_correct, + "num_total": num_total, + } + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py new file mode 100644 index 0000000000..df164b4315 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -0,0 +1,39 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry + +from llama_stack.apis.evals import * # noqa: F403 + + +class RunEvalTask(BaseTask): + """ + RunEvalTask for LlamaStack + """ + + def __init__( + self, + eval_task_config, + generator_processor: Optional[BaseGeneratorProcessor] = None, + generator: Optional[BaseGenerator] = None, + scorer: Optional[BaseScorer] = None, + *args, + **kwargs, + ) -> None: + super().__init__( + generator_processor=generator_processor, + generator=generator, + scorer=scorer, + *args, + **kwargs, + ) + self.eval_task_config = eval_task_config + self.dataset = DatasetRegistry.get_dataset( + eval_task_config.dataset_config.dataset_name + ) + + def run(self, *args, **kwargs) -> EvalResult: + print(f"Running eval task on {self.dataset}") + return EvalResult() diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml index 430ce61020..3c9f73e0b1 100644 --- a/tests/examples/local-run.yaml +++ b/tests/examples/local-run.yaml @@ -14,8 +14,8 @@ apis: - evals providers: evals: - - provider_id: eleuther - provider_type: eleuther + - provider_id: meta-reference + provider_type: meta-reference config: {} inference: - provider_id: remote::tgi From 8890de732204f6906588ae250354a2469d749f54 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Sun, 13 Oct 2024 23:30:21 -0700 Subject: [PATCH 08/27] cleanup original BaseTask --- llama_stack/apis/evals/evals.py | 41 --------------- .../impls/meta_reference/evals/evals.py | 52 ------------------- 2 files changed, 93 deletions(-) diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 92657f6b5c..098fa5cc45 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -235,47 +235,6 @@ def run(self, *args, **kwargs) -> EvalResult: raise NotImplementedError() -# class BaseTask(ABC, Generic[TDatasetSample, TProcessedSample]): -# """ -# A task represents a single evaluation benchmark, including it's dataset, preprocessing, postprocessing and scoring methods. -# Base class for all evaluation tasks. 
Each task needs to implement the following methods: -# - F1: preprocess_sample(self) -# - F2: postprocess_sample(self) -# - F3: score_sample(self) -# """ - -# def __init__(self, *args, **kwargs) -> None: -# super().__init__(*args, **kwargs) -# self._name = self.__class__.__name__ - -# @abstractmethod -# def preprocess_sample(self, sample: TDatasetSample) -> TProcessedSample: -# raise NotImplementedError() - -# @abstractmethod -# def postprocess_sample(self, sample: TProcessedSample) -> TProcessedSample: -# raise NotImplementedError() - -# @abstractmethod -# def score_sample(self, sample: TProcessedSample) -> SingleEvalResult: -# raise NotImplementedError() - -# @abstractmethod -# def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: -# raise NotImplementedError() - -# def preprocess( -# self, dataset: BaseDataset[TProcessedSample] -# ) -> List[TProcessedSample]: -# return [self.preprocess_sample(sample) for sample in dataset] - -# def postprocess(self, generation: List[TProcessedSample]) -> List[TProcessedSample]: -# return [self.postprocess_sample(sample) for sample in generation] - -# def score(self, postprocessed: List[TProcessedSample]) -> List[SingleEvalResult]: -# return [self.score_sample(sample) for sample in postprocessed] - - class Evals(Protocol): @webmethod(route="/evals/run_eval_task") diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 0fbce823e8..411aa0bc2d 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -136,55 +136,3 @@ async def run_scorer( return EvaluateResponse( eval_result={}, ) - - # async def run_evals( - # self, - # model: str, - # task: str, - # dataset: Optional[str] = None, - # eval_task_config: Optional[EvaluateTaskConfig] = None, - # ) -> EvaluateResponse: - # cprint( - # f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}", - # "red", - # ) - # if not dataset: - # raise ValueError("dataset must be specified for mete-reference evals") - - # dataset = DatasetRegistry.get_dataset(dataset) - # dataset.load() - - # task_impl = TaskRegistry.get_task(task)() - # preprocessed = task_impl.preprocess(dataset) - - # # TODO: replace w/ batch inference & async return eval job - # generation_outputs = [] - # if eval_task_config is None: - # eval_task_config = EvaluateTaskConfig(n_samples=len(preprocessed)) - # if eval_task_config.n_samples is None or eval_task_config.n_samples > len( - # preprocessed - # ): - # eval_task_config.n_samples = len(preprocessed) - - # print( - # f"Eval generation start, generate on {eval_task_config.n_samples} samples" - # ) - - # for sample in preprocessed[: eval_task_config.n_samples]: - # print("generation: ", sample) - # response = await self.inference_api.chat_completion( - # model=model, - # messages=sample.preprocessed["messages"], - # stream=False, - # ) - # sample.prediction = PredictionSample( - # completion_message=response.completion_message.content - # ) - # generation_outputs.append(sample) - - # postprocessed = task_impl.postprocess(generation_outputs) - # eval_results = task_impl.score(postprocessed) - # aggr_result = task_impl.aggregate_results(eval_results) - # return EvaluateResponse( - # eval_result=aggr_result, - # ) From 78cb88c3c4b2593c4d03b33981cbbd40c07bab8a Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Sun, 13 Oct 2024 23:48:15 -0700 Subject: [PATCH 09/27] RunEvalTask / InferenceGenerator 
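Split the inline generation loop in MetaReferenceEvalsImpl into reusable pieces: an
InferenceGenerator that wraps the Inference API, and a RunEvalTask that composes the
preprocess -> generate -> postprocess -> score pipeline. Roughly, the flow (a simplified
sketch of run_eval_task.py in this patch; error handling omitted) is:

    dataset = DatasetRegistry.get_dataset(eval_task_config.dataset_config.dataset_name)
    dataset.load(n_samples=eval_task_config.dataset_config.row_limit)

    processor = MMLUProcessor()
    preprocessed = processor.preprocess(dataset)

    generator = InferenceGenerator(
        model=eval_task_config.generation_config.model,
        inference_api=inference_api,
    )
    generations = await generator.generate(preprocessed)

    postprocessed = processor.postprocess(generations, dataset)
    scorer = AggregateScorer(scorers=[AccuracyScorer(), RandomScorer()])
    eval_result = scorer.aggregate_results(scorer.score(postprocessed))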
--- llama_stack/apis/evals/evals.py | 6 +- .../impls/meta_reference/evals/evals.py | 69 ++----------------- .../evals/generator/__init__.py | 5 ++ .../evals/generator/inference_generator.py | 48 +++++++++++++ .../evals/tasks/run_eval_task.py | 67 +++++++++++++----- 5 files changed, 111 insertions(+), 84 deletions(-) create mode 100644 llama_stack/providers/impls/meta_reference/evals/generator/__init__.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 098fa5cc45..a62fa4418a 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -171,7 +171,7 @@ def postprocess_sample( raise NotImplementedError() -class BaseGenerator(ABC, Generic[TGenerationResponseSample]): +class BaseGenerator(ABC, Generic[TPreprocessedSample, TGenerationResponseSample]): """ Base class for all generators. Each generator needs to implement the following methods: - generate(self, preprocessed_dataset) @@ -184,7 +184,7 @@ def __str__(self) -> str: return self.__class__.__name__ @abstractmethod - def generate( + async def generate( self, preprocessed_dataset: List[TPreprocessedSample] ) -> List[TGenerationResponseSample]: raise NotImplementedError() @@ -231,7 +231,7 @@ def __init__( self.scorer = scorer @abstractmethod - def run(self, *args, **kwargs) -> EvalResult: + async def run(self, *args, **kwargs) -> EvalResult: raise NotImplementedError() diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 411aa0bc2d..f717fc9d8e 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -7,25 +7,14 @@ from termcolor import cprint -from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import ( - AggregateScorer, -) - from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.evals import * # noqa: F403 from llama_stack.apis.dataset import * # noqa: F403 -from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry -from llama_stack.providers.impls.meta_reference.evals.processor.mmlu_processor import ( - MMLUProcessor, -) +from .config import MetaReferenceEvalsImplConfig # from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry -# from .tasks.run_eval_task import RunEvalTask -from .scorer.basic_scorers import * # noqa: F403 - - -from .config import MetaReferenceEvalsImplConfig +from .tasks.run_eval_task import RunEvalTask class MetaReferenceEvalsImpl(Evals): @@ -70,58 +59,8 @@ async def run_eval_task( ), ) - # TODO: wrap inside task - # run_task = RunEvalTask( - # eval_task_config=eval_task_config, - # ) - # eval_result = run_task.run() - - dataset = DatasetRegistry.get_dataset( - eval_task_config.dataset_config.dataset_name - ) - dataset.load(n_samples=eval_task_config.dataset_config.row_limit) - print(f"Running on {len(dataset)} samples") - - # F1 - processor = MMLUProcessor() - preprocessed = processor.preprocess(dataset) - - # Generation - # TODO: wrap inside BaseGenerator - generation_outputs = [] - for sample in preprocessed: - print("generation: ", sample) - response = await self.inference_api.chat_completion( - model=model, - messages=sample.generation_input.messages, - stream=False, - ) - cprint(f"response: {response}", "cyan") - - generation_outputs.append( - GenerationResponseSample( - 
generation_output=GenerationOutput( - completion_message=response.completion_message.content - ) - ) - ) - cprint(generation_outputs, "green") - - # F2 - postprocessed = processor.postprocess(generation_outputs, dataset) - cprint(postprocessed, "blue") - - # F3 - scorer - scorer = AggregateScorer( - scorers=[ - AccuracyScorer(), - RandomScorer(), - ] - ) - - scorer_results = scorer.score(postprocessed) - cprint(scorer_results, "magenta") - eval_result = scorer.aggregate_results(scorer_results) + run_task = RunEvalTask() + eval_result = await run_task.run(eval_task_config, self.inference_api) return EvaluateResponse( eval_result=eval_result, diff --git a/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py b/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py new file mode 100644 index 0000000000..adc181e237 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from termcolor import cprint + +from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.inference import * # noqa: F403 + + +class InferenceGenerator(BaseGenerator[PreprocessedSample, GenerationResponseSample]): + """ + InferenceGenerator for LlamaStack + """ + + def __init__( + self, + model, + inference_api, + *args, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + self.model = model + self.inference_api = inference_api + + async def generate( + self, preprocessed_dataset: List[PreprocessedSample] + ) -> List[GenerationResponseSample]: + generation_outputs = [] + for sample in preprocessed_dataset: + print("generation: ", sample) + response = await self.inference_api.chat_completion( + model=self.model, + messages=sample.generation_input.messages, + stream=False, + ) + cprint(f"response: {response}", "cyan") + + generation_outputs.append( + GenerationResponseSample( + generation_output=GenerationOutput( + completion_message=response.completion_message.content + ) + ) + ) + return generation_outputs diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index df164b4315..f3a66e18b0 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -4,8 +4,17 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry +from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import ( + InferenceGenerator, +) +from llama_stack.providers.impls.meta_reference.evals.processor.mmlu_processor import ( + MMLUProcessor, +) from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.inference import * # noqa: F403 +from termcolor import cprint class RunEvalTask(BaseTask): @@ -15,25 +24,51 @@ class RunEvalTask(BaseTask): def __init__( self, - eval_task_config, - generator_processor: Optional[BaseGeneratorProcessor] = None, - generator: Optional[BaseGenerator] = None, - scorer: Optional[BaseScorer] = None, *args, **kwargs, ) -> None: - super().__init__( - generator_processor=generator_processor, - generator=generator, - scorer=scorer, - *args, - **kwargs, - ) - self.eval_task_config = eval_task_config - self.dataset = DatasetRegistry.get_dataset( + super().__init__(*args, **kwargs) + + async def run( + self, + eval_task_config: EvaluateTaskConfig, + inference_api: Inference, + *args, + **kwargs, + ) -> EvalResult: + print(f"Running eval task w/ {eval_task_config}") + + dataset = DatasetRegistry.get_dataset( eval_task_config.dataset_config.dataset_name ) + dataset.load(n_samples=eval_task_config.dataset_config.row_limit) + print(f"Running on {len(dataset)} samples") + + # F1 + processor = MMLUProcessor() + preprocessed = processor.preprocess(dataset) + + # Generation + generator = InferenceGenerator( + model=eval_task_config.generation_config.model, + inference_api=inference_api, + ) + generation_outputs = await generator.generate(preprocessed) + + # F2 + postprocessed = processor.postprocess(generation_outputs, dataset) + cprint(postprocessed, "blue") + + # F3 - scorer + scorer = AggregateScorer( + scorers=[ + AccuracyScorer(), + RandomScorer(), + ] + ) + + scorer_results = scorer.score(postprocessed) + cprint(scorer_results, "magenta") + eval_result = scorer.aggregate_results(scorer_results) - def run(self, *args, **kwargs) -> EvalResult: - print(f"Running eval task on {self.dataset}") - return EvalResult() + return eval_result From 18fe966e96297d797b4c86b343e037855c8af613 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 00:12:46 -0700 Subject: [PATCH 10/27] registry refactor --- .../registry/datasets/__init__.py | 7 +++- .../registry/datasets/dataset_registry.py | 32 ------------------- .../registry/generator_processors/__init__.py | 12 +++++++ llama_stack/distribution/registry/registry.py | 32 +++++++++++++++++++ .../distribution/registry/scorers/__init__.py | 7 ++++ .../registry/scorers/scorer_registry.py | 32 ------------------- .../distribution/registry/tasks/__init__.py | 5 --- .../registry/tasks/task_registry.py | 32 ------------------- .../evals/tasks/run_eval_task.py | 7 ++-- 9 files changed, 60 insertions(+), 106 deletions(-) delete mode 100644 llama_stack/distribution/registry/datasets/dataset_registry.py create mode 100644 llama_stack/distribution/registry/generator_processors/__init__.py create mode 100644 llama_stack/distribution/registry/registry.py delete mode 100644 llama_stack/distribution/registry/scorers/scorer_registry.py delete mode 100644 llama_stack/distribution/registry/tasks/__init__.py delete mode 100644 llama_stack/distribution/registry/tasks/task_registry.py diff --git a/llama_stack/distribution/registry/datasets/__init__.py 
b/llama_stack/distribution/registry/datasets/__init__.py index f0636212ae..68de3fa879 100644 --- a/llama_stack/distribution/registry/datasets/__init__.py +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -6,8 +6,13 @@ # TODO: make these import config based from llama_stack.apis.dataset import * # noqa: F403 +from ..registry import Registry from .dataset import CustomDataset, HuggingfaceDataset -from .dataset_registry import DatasetRegistry + + +class DatasetRegistry(Registry[BaseDataset]): + _REGISTRY: Dict[str, BaseDataset] = {} + DATASETS_REGISTRY = [ CustomDataset( diff --git a/llama_stack/distribution/registry/datasets/dataset_registry.py b/llama_stack/distribution/registry/datasets/dataset_registry.py deleted file mode 100644 index 8e9b22266a..0000000000 --- a/llama_stack/distribution/registry/datasets/dataset_registry.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. -from typing import AbstractSet, Dict - -from llama_stack.apis.dataset import BaseDataset - - -class DatasetRegistry: - _REGISTRY: Dict[str, BaseDataset] = {} - - @staticmethod - def names() -> AbstractSet[str]: - return DatasetRegistry._REGISTRY.keys() - - @staticmethod - def register(name: str, task: BaseDataset) -> None: - if name in DatasetRegistry._REGISTRY: - raise ValueError(f"Dataset {name} already exists.") - DatasetRegistry._REGISTRY[name] = task - - @staticmethod - def get_dataset(name: str) -> BaseDataset: - if name not in DatasetRegistry._REGISTRY: - raise ValueError(f"Dataset {name} not found.") - return DatasetRegistry._REGISTRY[name] - - @staticmethod - def reset() -> None: - DatasetRegistry._REGISTRY = {} diff --git a/llama_stack/distribution/registry/generator_processors/__init__.py b/llama_stack/distribution/registry/generator_processors/__init__.py new file mode 100644 index 0000000000..bb9d5c1824 --- /dev/null +++ b/llama_stack/distribution/registry/generator_processors/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from llama_stack.apis.evals import * # noqa: F403 + +from ..registry import Registry + + +class GeneratorProcessorRegistry(Registry[BaseGeneratorProcessor]): + _REGISTRY: Dict[str, BaseGeneratorProcessor] = {} diff --git a/llama_stack/distribution/registry/registry.py b/llama_stack/distribution/registry/registry.py new file mode 100644 index 0000000000..b4a5b626d9 --- /dev/null +++ b/llama_stack/distribution/registry/registry.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
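+# A minimal process-wide registry mapping a string name to a registered object
+# (subclassed by the dataset/scorer/generator-processor registries); register()
+# rejects duplicate names and get() raises ValueError for unknown names.
+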
+from typing import AbstractSet, Dict, Generic, TypeVar + +TRegistry = TypeVar("TRegistry") + + +class Registry(Generic[TRegistry]): + _REGISTRY: Dict[str, TRegistry] = {} + + @staticmethod + def names() -> AbstractSet[str]: + return Registry._REGISTRY.keys() + + @staticmethod + def register(name: str, task: TRegistry) -> None: + if name in Registry._REGISTRY: + raise ValueError(f"Dataset {name} already exists.") + Registry._REGISTRY[name] = task + + @staticmethod + def get(name: str) -> TRegistry: + if name not in Registry._REGISTRY: + raise ValueError(f"Dataset {name} not found.") + return Registry._REGISTRY[name] + + @staticmethod + def reset() -> None: + Registry._REGISTRY = {} diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py index 76edd2ebd3..3332b70527 100644 --- a/llama_stack/distribution/registry/scorers/__init__.py +++ b/llama_stack/distribution/registry/scorers/__init__.py @@ -4,3 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. # TODO: make these import config based +from llama_stack.apis.evals import * # noqa: F403 + +from ..registry import Registry + + +class ScorerRegistry(Registry[BaseScorer]): + _REGISTRY: Dict[str, BaseScorer] = {} diff --git a/llama_stack/distribution/registry/scorers/scorer_registry.py b/llama_stack/distribution/registry/scorers/scorer_registry.py deleted file mode 100644 index b6a382c531..0000000000 --- a/llama_stack/distribution/registry/scorers/scorer_registry.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. -from typing import AbstractSet, Dict - -from llama_stack.apis.evals import BaseScorer - - -class ScorerRegistry: - _REGISTRY: Dict[str, BaseScorer] = {} - - @staticmethod - def names() -> AbstractSet[str]: - return ScorerRegistry._REGISTRY.keys() - - @staticmethod - def register(name: str, scorer: BaseScorer) -> None: - if name in ScorerRegistry._REGISTRY: - raise ValueError(f"Task {name} already exists.") - ScorerRegistry._REGISTRY[name] = task - - @staticmethod - def get_scorer(name: str) -> BaseScorer: - if name not in ScorerRegistry._REGISTRY: - raise ValueError(f"Task {name} not found.") - return ScorerRegistry._REGISTRY[name] - - @staticmethod - def reset() -> None: - ScorerRegistry._REGISTRY = {} diff --git a/llama_stack/distribution/registry/tasks/__init__.py b/llama_stack/distribution/registry/tasks/__init__.py deleted file mode 100644 index 756f351d88..0000000000 --- a/llama_stack/distribution/registry/tasks/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. diff --git a/llama_stack/distribution/registry/tasks/task_registry.py b/llama_stack/distribution/registry/tasks/task_registry.py deleted file mode 100644 index df25686ba6..0000000000 --- a/llama_stack/distribution/registry/tasks/task_registry.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
-from typing import AbstractSet, Dict - -from llama_stack.apis.evals import BaseTask - - -class TaskRegistry: - _REGISTRY: Dict[str, BaseTask] = {} - - @staticmethod - def names() -> AbstractSet[str]: - return TaskRegistry._REGISTRY.keys() - - @staticmethod - def register(name: str, task: BaseTask) -> None: - if name in TaskRegistry._REGISTRY: - raise ValueError(f"Task {name} already exists.") - TaskRegistry._REGISTRY[name] = task - - @staticmethod - def get_task(name: str) -> BaseTask: - if name not in TaskRegistry._REGISTRY: - raise ValueError(f"Task {name} not found.") - return TaskRegistry._REGISTRY[name] - - @staticmethod - def reset() -> None: - TaskRegistry._REGISTRY = {} diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index f3a66e18b0..fde2efdb08 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -3,7 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from llama_stack.distribution.registry.datasets.dataset_registry import DatasetRegistry +from llama_stack.distribution.registry.datasets import DatasetRegistry from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import ( InferenceGenerator, @@ -38,9 +38,8 @@ async def run( ) -> EvalResult: print(f"Running eval task w/ {eval_task_config}") - dataset = DatasetRegistry.get_dataset( - eval_task_config.dataset_config.dataset_name - ) + print(DatasetRegistry.names()) + dataset = DatasetRegistry.get(eval_task_config.dataset_config.dataset_name) dataset.load(n_samples=eval_task_config.dataset_config.row_limit) print(f"Running on {len(dataset)} samples") From f046899a1cf4b35c1f1f4092196b98437cd3e2b2 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 13:16:39 -0700 Subject: [PATCH 11/27] datasets api --- .../apis/{dataset => datasets}/__init__.py | 2 +- llama_stack/apis/datasets/client.py | 92 +++++++++++++++++++ .../dataset.py => datasets/datasets.py} | 10 +- llama_stack/apis/evals/evals.py | 2 +- llama_stack/distribution/datatypes.py | 10 ++ llama_stack/distribution/distribution.py | 20 +++- llama_stack/distribution/registry/__init__.py | 17 ++++ .../registry/datasets/__init__.py | 4 +- .../distribution/registry/datasets/dataset.py | 90 ++++++------------ .../registry/datasets/dataset_wrappers.py | 78 ++++++++++++++++ llama_stack/distribution/resolver.py | 23 +++++ llama_stack/providers/datatypes.py | 4 +- .../impls/meta_reference/evals/evals.py | 6 +- .../evals/scorer/basic_scorers.py | 2 +- tests/examples/local-run.yaml | 1 + 15 files changed, 281 insertions(+), 80 deletions(-) rename llama_stack/apis/{dataset => datasets}/__init__.py (82%) create mode 100644 llama_stack/apis/datasets/client.py rename llama_stack/apis/{dataset/dataset.py => datasets/datasets.py} (96%) create mode 100644 llama_stack/distribution/registry/datasets/dataset_wrappers.py diff --git a/llama_stack/apis/dataset/__init__.py b/llama_stack/apis/datasets/__init__.py similarity index 82% rename from llama_stack/apis/dataset/__init__.py rename to llama_stack/apis/datasets/__init__.py index 33557a0ab1..102b9927f3 100644 --- a/llama_stack/apis/dataset/__init__.py +++ b/llama_stack/apis/datasets/__init__.py @@ -4,4 +4,4 @@ # This source 
code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from .dataset import * # noqa: F401 F403 +from .datasets import * # noqa: F401 F403 diff --git a/llama_stack/apis/datasets/client.py b/llama_stack/apis/datasets/client.py new file mode 100644 index 0000000000..241db65689 --- /dev/null +++ b/llama_stack/apis/datasets/client.py @@ -0,0 +1,92 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import asyncio +import json + +import fire +import httpx + +from .datasets import * # noqa: F403 + + +class DatasetClient(Datasets): + def __init__(self, base_url: str): + self.base_url = base_url + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def create_dataset( + self, + dataset_def: DatasetDef, + ) -> None: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/datasets/create", + json={ + "dataset_def": json.loads(dataset_def.json()), + }, + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + return None + + async def get_dataset( + self, + dataset_identifier: str, + ) -> DatasetDef: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/datasets/create", + json={ + "dataset_identifier": dataset_identifier, + }, + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + return DatasetDef(**response.json()) + + async def delete_dataset( + self, + dataset_identifier: str, + ) -> DatasetDef: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/datasets/delete", + json={ + "dataset_identifier": dataset_identifier, + }, + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + return None + + +async def run_main(host: str, port: int): + client = DatasetClient(f"http://{host}:{port}") + + # Custom Eval Task + response = await client.create_dataset( + dataset_def=CustomDatasetDef( + identifier="test-dataset", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ), + ) + + +def main(host: str, port: int): + asyncio.run(run_main(host, port)) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/datasets/datasets.py similarity index 96% rename from llama_stack/apis/dataset/dataset.py rename to llama_stack/apis/datasets/datasets.py index 798f3aba99..c79301557c 100644 --- a/llama_stack/apis/dataset/dataset.py +++ b/llama_stack/apis/datasets/datasets.py @@ -143,19 +143,19 @@ def load(self) -> None: class Datasets(Protocol): @webmethod(route="/datasets/create") - def create_dataset( + async def create_dataset( self, - dataset: DatasetDef, + dataset_def: DatasetDef, ) -> None: ... @webmethod(route="/datasets/get") - def get_dataset( + async def get_dataset( self, dataset_identifier: str, ) -> DatasetDef: ... @webmethod(route="/datasets/delete") - def delete_dataset( + async def delete_dataset( self, - dataset_uuid: str, + dataset_identifier: str, ) -> None: ... 
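The Datasets protocol above accepts either DatasetDef variant through the same
create_dataset call; a minimal sketch using the DatasetClient from this patch (the
helper name, host and port are illustrative placeholders, and the HuggingfaceDatasetDef
fields mirror their use elsewhere in this series):

    async def register_examples(host: str, port: int):
        client = DatasetClient(f"http://{host}:{port}")
        # CSV-backed custom dataset, fetched from a URL
        await client.create_dataset(
            dataset_def=CustomDatasetDef(
                identifier="mmlu-simple-eval-en",
                url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
            ),
        )
        # Huggingface-hosted dataset, loaded via datasets.load_dataset kwargs
        await client.create_dataset(
            dataset_def=HuggingfaceDatasetDef(
                identifier="hellaswag",
                dataset_name="hellaswag",
                kwargs={"split": "validation", "trust_remote_code": True},
            ),
        )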
diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index a62fa4418a..af0b291e87 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -11,7 +11,7 @@ from pydantic import BaseModel from llama_models.llama3.api.datatypes import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 class EvaluationJob(BaseModel): diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 0044de09ee..ce7f5a8e50 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -73,6 +73,16 @@ class RoutingTableProviderSpec(ProviderSpec): pip_packages: List[str] = Field(default_factory=list) +# Example: /datasets +class RegistryProviderSpec(ProviderSpec): + provider_type: str = "registry" + config_class: str = "" + docker_image: Optional[str] = None + + module: str + pip_packages: List[str] = Field(default_factory=list) + + class DistributionSpec(BaseModel): description: Optional[str] = Field( default="", diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index 999646cc06..d96db23b46 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -21,6 +21,19 @@ class AutoRoutedApiInfo(BaseModel): router_api: Api +class RegistryApiInfo(BaseModel): + registry_api: Api + # registry: Registry + + +def builtin_registry_apis() -> List[RegistryApiInfo]: + return [ + RegistryApiInfo( + registry_api=Api.datasets, + ) + ] + + def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: return [ AutoRoutedApiInfo( @@ -42,7 +55,12 @@ def providable_apis() -> List[Api]: routing_table_apis = set( x.routing_table_api for x in builtin_automatically_routed_apis() ) - return [api for api in Api if api not in routing_table_apis and api != Api.inspect] + registry_apis = set( + x.registry_api for x in builtin_registry_apis() if x.registry_api + ) + non_providable_apis = routing_table_apis | registry_apis | {Api.inspect} + + return [api for api in Api if api not in non_providable_apis] def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]: diff --git a/llama_stack/distribution/registry/__init__.py b/llama_stack/distribution/registry/__init__.py index 756f351d88..6e68333280 100644 --- a/llama_stack/distribution/registry/__init__.py +++ b/llama_stack/distribution/registry/__init__.py @@ -3,3 +3,20 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any + +from llama_stack.providers.datatypes import Api +from .datasets.dataset import DatasetRegistryImpl + + +async def get_registry_impl(api: Api, _deps) -> Any: + api_to_registry = { + "datasets": DatasetRegistryImpl, + } + + if api.value not in api_to_registry: + raise ValueError(f"API {api.value} not found in registry map") + + impl = api_to_registry[api.value]() + await impl.initialize() + return impl diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py index 68de3fa879..384028b9e3 100644 --- a/llama_stack/distribution/registry/datasets/__init__.py +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -5,9 +5,9 @@ # the root directory of this source tree. 
# TODO: make these import config based -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 from ..registry import Registry -from .dataset import CustomDataset, HuggingfaceDataset +from .dataset_wrappers import CustomDataset, HuggingfaceDataset class DatasetRegistry(Registry[BaseDataset]): diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py index 0bd86b8d49..936fd0713b 100644 --- a/llama_stack/distribution/registry/datasets/dataset.py +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -3,76 +3,38 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import pandas -from datasets import Dataset, load_dataset -from llama_stack.apis.dataset import * # noqa: F403 +# from llama_stack.apis.datasets import * +# from llama_stack.distribution.registry.datasets import DatasetRegistry # noqa: F403 +# from ..registry import Registry +# from .dataset_wrappers import CustomDataset, HuggingfaceDataset -class CustomDataset(BaseDataset[DictSample]): - def __init__(self, config: CustomDatasetDef) -> None: - super().__init__() - self.config = config - self.dataset = None - self.index = 0 +class DatasetRegistryImpl(Datasets): + """API Impl to interact with underlying dataset registry""" - @property - def dataset_id(self) -> str: - return self.config.identifier + def __init__( + self, + ) -> None: + pass - def __iter__(self) -> Iterator[DictSample]: - if not self.dataset: - self.load() - return (DictSample(data=x) for x in self.dataset) + async def initialize(self) -> None: + pass - def __str__(self) -> str: - return f"CustomDataset({self.config})" + async def shutdown(self) -> None: + pass - def __len__(self) -> int: - if not self.dataset: - self.load() - return len(self.dataset) + async def create_dataset( + self, + dataset_def: DatasetDef, + ) -> None: + print(f"Creating dataset {dataset.identifier}") - def load(self, n_samples: Optional[int] = None) -> None: - if self.dataset: - return + async def get_dataset( + self, + dataset_identifier: str, + ) -> DatasetDef: + pass - # TODO: better support w/ data url - if self.config.url.endswith(".csv"): - df = pandas.read_csv(self.config.url) - elif self.config.url.endswith(".xlsx"): - df = pandas.read_excel(self.config.url) - - if n_samples is not None: - df = df.sample(n=n_samples) - - self.dataset = Dataset.from_pandas(df) - - -class HuggingfaceDataset(BaseDataset[DictSample]): - def __init__(self, config: HuggingfaceDatasetDef): - super().__init__() - self.config = config - self.dataset = None - - @property - def dataset_id(self) -> str: - return self.config.identifier - - def __iter__(self) -> Iterator[DictSample]: - if not self.dataset: - self.load() - return (DictSample(data=x) for x in self.dataset) - - def __str__(self): - return f"HuggingfaceDataset({self.config})" - - def __len__(self): - if not self.dataset: - self.load() - return len(self.dataset) - - def load(self): - if self.dataset: - return - self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs) + async def delete_dataset(self, dataset_identifier: str) -> None: + pass diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py new file mode 100644 index 0000000000..e18165a110 --- /dev/null +++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py @@ -0,0 +1,78 @@ +# Copyright (c) 
Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import pandas +from datasets import Dataset, load_dataset + +from llama_stack.apis.datasets import * # noqa: F403 + + +class CustomDataset(BaseDataset[DictSample]): + def __init__(self, config: CustomDatasetDef) -> None: + super().__init__() + self.config = config + self.dataset = None + self.index = 0 + + @property + def dataset_id(self) -> str: + return self.config.identifier + + def __iter__(self) -> Iterator[DictSample]: + if not self.dataset: + self.load() + return (DictSample(data=x) for x in self.dataset) + + def __str__(self) -> str: + return f"CustomDataset({self.config})" + + def __len__(self) -> int: + if not self.dataset: + self.load() + return len(self.dataset) + + def load(self, n_samples: Optional[int] = None) -> None: + if self.dataset: + return + + # TODO: better support w/ data url + if self.config.url.endswith(".csv"): + df = pandas.read_csv(self.config.url) + elif self.config.url.endswith(".xlsx"): + df = pandas.read_excel(self.config.url) + + if n_samples is not None: + df = df.sample(n=n_samples) + + self.dataset = Dataset.from_pandas(df) + + +class HuggingfaceDataset(BaseDataset[DictSample]): + def __init__(self, config: HuggingfaceDatasetDef): + super().__init__() + self.config = config + self.dataset = None + + @property + def dataset_id(self) -> str: + return self.config.identifier + + def __iter__(self) -> Iterator[DictSample]: + if not self.dataset: + self.load() + return (DictSample(data=x) for x in self.dataset) + + def __str__(self): + return f"HuggingfaceDataset({self.config})" + + def __len__(self): + if not self.dataset: + self.load() + return len(self.dataset) + + def load(self): + if self.dataset: + return + self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs) diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index 672a4ea60f..e71c3fd8ce 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -12,6 +12,7 @@ from llama_stack.distribution.datatypes import * # noqa: F403 from llama_stack.apis.agents import Agents +from llama_stack.apis.datasets import Datasets from llama_stack.apis.evals import Evals from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect @@ -23,6 +24,7 @@ from llama_stack.apis.telemetry import Telemetry from llama_stack.distribution.distribution import ( builtin_automatically_routed_apis, + builtin_registry_apis, get_provider_registry, ) from llama_stack.distribution.utils.dynamic import instantiate_class_type @@ -40,6 +42,7 @@ def api_protocol_map() -> Dict[Api, Any]: Api.shields: Shields, Api.telemetry: Telemetry, Api.evals: Evals, + Api.datasets: Datasets, } @@ -139,6 +142,20 @@ async def resolve_impls_with_routing(run_config: StackRunConfig) -> Dict[Api, An ) } + for info in builtin_registry_apis(): + providers_with_specs[info.registry_api.value] = { + "__builtin__": ProviderWithSpec( + provider_id="__registry__", + provider_type="__registry__", + config={}, + spec=RegistryProviderSpec( + api=info.registry_api, + module="llama_stack.distribution.registry", + deps__=[], + ), + ) + } + sorted_providers = topological_sort( {k: v.values() for k, v in providers_with_specs.items()} ) @@ -259,6 +276,12 @@ async def instantiate_provider( config = None args = [provider_spec.api, inner_impls, deps] + elif 
isinstance(provider_spec, RegistryProviderSpec): + print("ROUTER PROVIDER SPEC") + method = "get_registry_impl" + + config = None + args = [provider_spec.api, deps] else: method = "get_provider_impl" diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index 50ab0691b9..1d397c9e73 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -28,11 +28,13 @@ class Api(Enum): models = "models" shields = "shields" memory_banks = "memory_banks" - evals = "evals" # built-in API inspect = "inspect" + evals = "evals" + datasets = "datasets" + class ModelsProtocolPrivate(Protocol): async def list_models(self) -> List[ModelDef]: ... diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index f717fc9d8e..3ae988cbdc 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -9,11 +9,9 @@ from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.evals import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 from .config import MetaReferenceEvalsImplConfig - -# from llama_stack.distribution.registry.tasks.task_registry import TaskRegistry from .tasks.run_eval_task import RunEvalTask @@ -47,7 +45,7 @@ async def run_eval_task( eval_task_config = EvaluateTaskConfig( dataset_config=EvaluateDatasetConfig( dataset_name=dataset, - row_limit=2, + row_limit=3, ), generation_config=EvaluateModelGenerationConfig( model=model, diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py index ff9639ecd7..47d41c6d61 100644 --- a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py @@ -6,7 +6,7 @@ import random from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult -from llama_stack.apis.dataset.dataset import * # noqa: F401 F403 +from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 class AggregateScorer(BaseScorer[ScorerInputSample]): diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml index 3c9f73e0b1..31fb726708 100644 --- a/tests/examples/local-run.yaml +++ b/tests/examples/local-run.yaml @@ -12,6 +12,7 @@ apis: - inference - safety - evals +- datasets providers: evals: - provider_id: meta-reference From a9210cd416ca81c74a0aa52ae22f18b615645e19 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 13:54:16 -0700 Subject: [PATCH 12/27] datasets api crud --- llama_stack/apis/datasets/client.py | 74 ++++++++++++++++--- llama_stack/apis/datasets/datasets.py | 32 +++++++- .../distribution/registry/datasets/dataset.py | 63 +++++++++++++--- llama_stack/distribution/registry/registry.py | 6 ++ 4 files changed, 151 insertions(+), 24 deletions(-) diff --git a/llama_stack/apis/datasets/client.py b/llama_stack/apis/datasets/client.py index 241db65689..476a5964a1 100644 --- a/llama_stack/apis/datasets/client.py +++ b/llama_stack/apis/datasets/client.py @@ -6,13 +6,26 @@ import asyncio import json +from typing import Optional import fire import httpx +from termcolor import cprint from .datasets import * # noqa: F403 +def deserialize_dataset_def(j: Optional[Dict[str, Any]]) -> Optional[DatasetDef]: + if not j: + return None + if j["type"] == "huggingface": + 
return HuggingfaceDatasetDef(**j) + elif j["type"] == "custom": + return CustomDatasetDef(**j) + else: + raise ValueError(f"Unknown dataset type: {j['type']}") + + class DatasetClient(Datasets): def __init__(self, base_url: str): self.base_url = base_url @@ -26,7 +39,7 @@ async def shutdown(self) -> None: async def create_dataset( self, dataset_def: DatasetDef, - ) -> None: + ) -> CreateDatasetResponse: async with httpx.AsyncClient() as client: response = await client.post( f"{self.base_url}/datasets/create", @@ -37,28 +50,31 @@ async def create_dataset( timeout=60, ) response.raise_for_status() - return None + return CreateDatasetResponse(**response.json()) async def get_dataset( self, dataset_identifier: str, - ) -> DatasetDef: + ) -> Optional[DatasetDef]: async with httpx.AsyncClient() as client: - response = await client.post( - f"{self.base_url}/datasets/create", - json={ + response = await client.get( + f"{self.base_url}/datasets/get", + params={ "dataset_identifier": dataset_identifier, }, headers={"Content-Type": "application/json"}, timeout=60, ) response.raise_for_status() - return DatasetDef(**response.json()) + if not response.json(): + return + + return deserialize_dataset_def(response.json()) async def delete_dataset( self, dataset_identifier: str, - ) -> DatasetDef: + ) -> DeleteDatasetResponse: async with httpx.AsyncClient() as client: response = await client.post( f"{self.base_url}/datasets/delete", @@ -69,19 +85,57 @@ async def delete_dataset( timeout=60, ) response.raise_for_status() - return None + return DeleteDatasetResponse(**response.json()) + + async def list_dataset( + self, + ) -> List[DatasetDef]: + async with httpx.AsyncClient() as client: + response = await client.get( + f"{self.base_url}/datasets/list", + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + if not response.json(): + return + + return [deserialize_dataset_def(x) for x in response.json()] async def run_main(host: str, port: int): client = DatasetClient(f"http://{host}:{port}") - # Custom Eval Task + # register dataset response = await client.create_dataset( dataset_def=CustomDatasetDef( identifier="test-dataset", url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", ), ) + cprint(response, "green") + + # get dataset + get_dataset = await client.get_dataset( + dataset_identifier="test-dataset", + ) + cprint(get_dataset, "cyan") + + # delete dataset + delete_dataset = await client.delete_dataset( + dataset_identifier="test-dataset", + ) + cprint(delete_dataset, "red") + + # get again after deletion + get_dataset = await client.get_dataset( + dataset_identifier="test-dataset", + ) + cprint(get_dataset, "yellow") + + # list datasets + list_dataset = await client.list_dataset() + cprint(list_dataset, "blue") def main(host: str, port: int): diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index c79301557c..11a3f60964 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -115,6 +115,27 @@ class CustomDatasetDef(BaseModel): ] +class DatasetsResponseStatus(Enum): + success = "success" + fail = "fail" + + +@json_schema_type +class CreateDatasetResponse(BaseModel): + status: DatasetsResponseStatus = Field( + description="Return status of the dataset creation", + ) + msg: Optional[str] = None + + +@json_schema_type +class DeleteDatasetResponse(BaseModel): + status: DatasetsResponseStatus = Field( + description="Return status of the dataset creation", + ) + msg: 
Optional[str] = None + + class BaseDataset(ABC, Generic[TDatasetSample]): def __init__(self) -> None: self.type: str = self.__class__.__name__ @@ -146,16 +167,19 @@ class Datasets(Protocol): async def create_dataset( self, dataset_def: DatasetDef, - ) -> None: ... + ) -> CreateDatasetResponse: ... - @webmethod(route="/datasets/get") + @webmethod(route="/datasets/get", method="GET") async def get_dataset( self, dataset_identifier: str, - ) -> DatasetDef: ... + ) -> Optional[DatasetDef]: ... @webmethod(route="/datasets/delete") async def delete_dataset( self, dataset_identifier: str, - ) -> None: ... + ) -> DeleteDatasetResponse: ... + + @webmethod(route="/datasets/list", method="GET") + async def list_datasets(self) -> List[DatasetDef]: ... diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py index 936fd0713b..838e8c65fa 100644 --- a/llama_stack/distribution/registry/datasets/dataset.py +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -4,10 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# from llama_stack.apis.datasets import * -# from llama_stack.distribution.registry.datasets import DatasetRegistry # noqa: F403 -# from ..registry import Registry -# from .dataset_wrappers import CustomDataset, HuggingfaceDataset +from llama_stack.apis.datasets import * # noqa: F403 +from llama_stack.distribution.registry.datasets import DatasetRegistry +from llama_stack.distribution.registry.datasets.dataset_wrappers import ( + CustomDataset, + HuggingfaceDataset, +) class DatasetRegistryImpl(Datasets): @@ -27,14 +29,55 @@ async def shutdown(self) -> None: async def create_dataset( self, dataset_def: DatasetDef, - ) -> None: - print(f"Creating dataset {dataset.identifier}") + ) -> CreateDatasetResponse: + if dataset_def.type == DatasetType.huggingface.value: + dataset_cls = HuggingfaceDataset(dataset_def) + else: + dataset_cls = CustomDataset(dataset_def) + + try: + DatasetRegistry.register( + dataset_def.identifier, + dataset_cls, + ) + except ValueError as e: + return CreateDatasetResponse( + status=DatasetsResponseStatus.fail, + msg=str(e), + ) + + return CreateDatasetResponse( + status=DatasetsResponseStatus.success, + msg=f"Dataset '{dataset_def.identifier}' registered", + ) async def get_dataset( self, dataset_identifier: str, - ) -> DatasetDef: - pass + ) -> Optional[DatasetDef]: + try: + dataset_ref = DatasetRegistry.get(dataset_identifier).config + except ValueError as e: + return None - async def delete_dataset(self, dataset_identifier: str) -> None: - pass + return dataset_ref + + async def delete_dataset(self, dataset_identifier: str) -> DeleteDatasetResponse: + try: + DatasetRegistry.delete(dataset_identifier) + except ValueError as e: + return DeleteDatasetResponse( + status=DatasetsResponseStatus.fail, + msg=str(e), + ) + + return DeleteDatasetResponse( + status=DatasetsResponseStatus.success, + msg=f"Dataset '{dataset_identifier}' deleted", + ) + + async def list_datasets(self) -> List[DatasetDef]: + return [ + DatasetRegistry.get(dataset_identifier).config + for dataset_identifier in DatasetRegistry.names() + ] diff --git a/llama_stack/distribution/registry/registry.py b/llama_stack/distribution/registry/registry.py index b4a5b626d9..313fb6d4e4 100644 --- a/llama_stack/distribution/registry/registry.py +++ b/llama_stack/distribution/registry/registry.py @@ -27,6 +27,12 @@ def get(name: str) -> TRegistry: raise 
ValueError(f"Dataset {name} not found.") return Registry._REGISTRY[name] + @staticmethod + def delete(name: str) -> None: + if name not in Registry._REGISTRY: + raise ValueError(f"Dataset {name} not found.") + del Registry._REGISTRY[name] + @staticmethod def reset() -> None: Registry._REGISTRY = {} From 9c501d042b0ca1f2a4bfd2848c1609af8bb46cb1 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 14:19:15 -0700 Subject: [PATCH 13/27] cleanup hardcoded dataset registry --- llama_stack/apis/datasets/client.py | 14 +++++++++-- llama_stack/apis/evals/client.py | 23 +++++++++++++++---- .../registry/datasets/__init__.py | 23 ------------------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/llama_stack/apis/datasets/client.py b/llama_stack/apis/datasets/client.py index 476a5964a1..e292b14d8c 100644 --- a/llama_stack/apis/datasets/client.py +++ b/llama_stack/apis/datasets/client.py @@ -26,7 +26,7 @@ def deserialize_dataset_def(j: Optional[Dict[str, Any]]) -> Optional[DatasetDef] raise ValueError(f"Unknown dataset type: {j['type']}") -class DatasetClient(Datasets): +class DatasetsClient(Datasets): def __init__(self, base_url: str): self.base_url = base_url @@ -104,7 +104,7 @@ async def list_dataset( async def run_main(host: str, port: int): - client = DatasetClient(f"http://{host}:{port}") + client = DatasetsClient(f"http://{host}:{port}") # register dataset response = await client.create_dataset( @@ -115,6 +115,16 @@ async def run_main(host: str, port: int): ) cprint(response, "green") + # register HF dataset + response = await client.create_dataset( + dataset_def=HuggingfaceDatasetDef( + identifier="hellaswag", + dataset_name="hellaswag", + kwargs={"split": "validation", "trust_remote_code": True}, + ) + ) + cprint(response, "green") + # get dataset get_dataset = await client.get_dataset( dataset_identifier="test-dataset", diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index b4d1c39fe7..d61de8c39b 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -12,6 +12,7 @@ from termcolor import cprint from .evals import * # noqa: F403 +from ..datasets.client import DatasetsClient class EvaluationClient(Evals): @@ -54,13 +55,31 @@ async def run_evals( async def run_main(host: str, port: int): client = EvaluationClient(f"http://{host}:{port}") + dataset_client = DatasetsClient(f"http://{host}:{port}") + # Custom Eval Task + + # 1. register custom dataset + response = await dataset_client.create_dataset( + dataset_def=CustomDatasetDef( + identifier="mmlu-simple-eval-en", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ), + ) + cprint(f"datasets/create: {response}", "cyan") + + # 2. 
run evals on the registered dataset response = await client.run_evals( model="Llama3.1-8B-Instruct", dataset="mmlu-simple-eval-en", task="mmlu", ) + if response.formatted_report: + cprint(response.formatted_report, "green") + else: + cprint(f"Response: {response}", "green") + # Eleuther Eval Task # response = await client.run_evals( # model="Llama3.1-8B-Instruct", @@ -70,10 +89,6 @@ async def run_main(host: str, port: int): # n_samples=2, # ), # ) - if response.formatted_report: - cprint(response.formatted_report, "green") - else: - cprint(f"Response: {response}", "green") def main(host: str, port: int): diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py index 384028b9e3..8164758120 100644 --- a/llama_stack/distribution/registry/datasets/__init__.py +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -3,32 +3,9 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. - -# TODO: make these import config based from llama_stack.apis.datasets import * # noqa: F403 from ..registry import Registry -from .dataset_wrappers import CustomDataset, HuggingfaceDataset class DatasetRegistry(Registry[BaseDataset]): _REGISTRY: Dict[str, BaseDataset] = {} - - -DATASETS_REGISTRY = [ - CustomDataset( - config=CustomDatasetDef( - identifier="mmlu-simple-eval-en", - url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", - ) - ), - HuggingfaceDataset( - config=HuggingfaceDatasetDef( - identifier="hellaswag", - dataset_name="hellaswag", - kwargs={"split": "validation", "trust_remote_code": True}, - ) - ), -] - -for d in DATASETS_REGISTRY: - DatasetRegistry.register(d.dataset_id, d) From c50686b6feadb1a15803a91b11650d9caf68514c Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 15:41:31 -0700 Subject: [PATCH 14/27] scorer registry --- .../distribution/registry/scorers/__init__.py | 10 ++++++ .../impls/meta_reference/evals/evals.py | 1 + .../evals/scorer/aggregate_scorer.py | 35 +++++++++++++++++++ .../evals/scorer/basic_scorers.py | 28 --------------- .../evals/tasks/run_eval_task.py | 13 ++++--- 5 files changed, 55 insertions(+), 32 deletions(-) create mode 100644 llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py index 3332b70527..084a620a74 100644 --- a/llama_stack/distribution/registry/scorers/__init__.py +++ b/llama_stack/distribution/registry/scorers/__init__.py @@ -5,9 +5,19 @@ # the root directory of this source tree. 
# TODO: make these import config based from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 from ..registry import Registry class ScorerRegistry(Registry[BaseScorer]): _REGISTRY: Dict[str, BaseScorer] = {} + + +SCORER_REGISTRY = { + "accuracy": AccuracyScorer, + "random": RandomScorer, +} + +for k, v in SCORER_REGISTRY.items(): + ScorerRegistry.register(k, v) diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 3ae988cbdc..1d703a27ce 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -53,6 +53,7 @@ async def run_eval_task( scoring_config=EvaluateScoringConfig( scorer_config_list=[ EvaluateSingleScorerConfig(scorer_name="accuracy"), + EvaluateSingleScorerConfig(scorer_name="random"), ] ), ) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py new file mode 100644 index 0000000000..1a0621960e --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult +from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 + + +class AggregateScorer(BaseScorer[ScorerInputSample]): + def __init__(self, scorers: List[BaseScorer[ScorerInputSample]]): + self.scorers = scorers + + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + all_score_data = {} + for scorer in self.scorers: + score_data = scorer.score_sample(scorer_input_sample).score_data + for k, v in score_data.items(): + all_score_data[k] = v + + return SingleEvalResult( + score_data=all_score_data, + ) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + all_metrics = {} + + for scorer in self.scorers: + metrics = scorer.aggregate_results(eval_results).metrics + for k, v in metrics.items(): + all_metrics[f"{scorer.__class__.__name__}:{k}"] = v + + return EvalResult( + metrics=all_metrics, + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py index 47d41c6d61..48d8caa3fa 100644 --- a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py @@ -9,34 +9,6 @@ from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 -class AggregateScorer(BaseScorer[ScorerInputSample]): - def __init__(self, scorers: List[BaseScorer[ScorerInputSample]]): - self.scorers = scorers - - def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: - all_score_data = {} - for scorer in self.scorers: - score_data = scorer.score_sample(scorer_input_sample).score_data - for k, v in score_data.items(): - all_score_data[k] = v - - return SingleEvalResult( - score_data=all_score_data, - ) - - def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: - all_metrics = {} - - for scorer in self.scorers: - metrics = 
scorer.aggregate_results(eval_results).metrics - for k, v in metrics.items(): - all_metrics[f"{scorer.__class__.__name__}:{k}"] = v - - return EvalResult( - metrics=all_metrics, - ) - - class RandomScorer(BaseScorer[ScorerInputSample]): def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: return SingleEvalResult(score_data={"random": random.random()}) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index fde2efdb08..48c4509141 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -4,6 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. from llama_stack.distribution.registry.datasets import DatasetRegistry +from llama_stack.distribution.registry.scorers import ScorerRegistry +from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import ( InferenceGenerator, @@ -59,11 +61,14 @@ async def run( cprint(postprocessed, "blue") # F3 - scorer + scorer_config_list = eval_task_config.scoring_config.scorer_config_list + scorer_list = [] + for s_conf in scorer_config_list: + scorer = ScorerRegistry.get(s_conf.scorer_name) + scorer_list.append(scorer()) + scorer = AggregateScorer( - scorers=[ - AccuracyScorer(), - RandomScorer(), - ] + scorers=scorer_list, ) scorer_results = scorer.score(postprocessed) From 95fd53d2921cec04e8c9e71fb49183ac29cf8071 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 16:09:55 -0700 Subject: [PATCH 15/27] registry refactor --- llama_stack/apis/datasets/datasets.py | 14 +++++-- llama_stack/apis/evals/evals.py | 12 +----- .../registry/datasets/__init__.py | 3 +- llama_stack/distribution/registry/registry.py | 38 +++++++++---------- .../distribution/registry/scorers/__init__.py | 5 +-- .../impls/meta_reference/evals/evals.py | 4 ++ .../evals/processor/mmlu_processor.py | 30 +-------------- .../evals/scorer/basic_scorers.py | 4 +- 8 files changed, 39 insertions(+), 71 deletions(-) diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 11a3f60964..0f4354c3fc 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -29,8 +29,7 @@ class GenerationOutput(BaseModel): @json_schema_type class PostprocessedGeneration(BaseModel): completion_message: str - # structured transformed output from raw_completion_message to compute scorer metrics - transformed_generation: Optional[Any] = None + logprobs: Optional[List[TokenLogProbs]] = None # A sample (row) from dataset @@ -70,8 +69,15 @@ class GenerationResponseSample(DatasetSample): @json_schema_type class ScorerInputSample(DatasetSample): - generation_output: PostprocessedGeneration - expected_output: Union[str, List[str]] + """ + A dataset is required to have the following columns to be used for scoring: + - generated_answer: str + - expected_answer: Union[str, List[str]] + """ + + generated_answer: str + expected_answer: Union[str, List[str]] + generation_output: Optional[PostprocessedGeneration] = None @json_schema_type diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py 
index af0b291e87..fb3aa6cd4d 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -217,18 +217,8 @@ def score( class BaseTask(ABC): - def __init__( - self, - generator_processor: Optional[BaseGeneratorProcessor] = None, - generator: Optional[BaseGenerator] = None, - scorer: Optional[BaseScorer] = None, - *args, - **kwargs - ) -> None: + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.generator_processor = generator_processor - self.generator = generator - self.scorer = scorer @abstractmethod async def run(self, *args, **kwargs) -> EvalResult: diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py index 8164758120..4474c8d7d8 100644 --- a/llama_stack/distribution/registry/datasets/__init__.py +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -7,5 +7,4 @@ from ..registry import Registry -class DatasetRegistry(Registry[BaseDataset]): - _REGISTRY: Dict[str, BaseDataset] = {} +DatasetRegistry = Registry[BaseDataset]() diff --git a/llama_stack/distribution/registry/registry.py b/llama_stack/distribution/registry/registry.py index 313fb6d4e4..702ed7d869 100644 --- a/llama_stack/distribution/registry/registry.py +++ b/llama_stack/distribution/registry/registry.py @@ -3,36 +3,34 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import AbstractSet, Dict, Generic, TypeVar +from typing import AbstractSet, Generic, TypeVar TRegistry = TypeVar("TRegistry") class Registry(Generic[TRegistry]): - _REGISTRY: Dict[str, TRegistry] = {} - @staticmethod - def names() -> AbstractSet[str]: - return Registry._REGISTRY.keys() + def __init__(self) -> None: + super().__init__() + self.registry = {} - @staticmethod - def register(name: str, task: TRegistry) -> None: - if name in Registry._REGISTRY: + def names(self) -> AbstractSet[str]: + return self.registry.keys() + + def register(self, name: str, task: TRegistry) -> None: + if name in self.registry: raise ValueError(f"Dataset {name} already exists.") - Registry._REGISTRY[name] = task + self.registry[name] = task - @staticmethod - def get(name: str) -> TRegistry: - if name not in Registry._REGISTRY: + def get(self, name: str) -> TRegistry: + if name not in self.registry: raise ValueError(f"Dataset {name} not found.") - return Registry._REGISTRY[name] + return self.registry[name] - @staticmethod - def delete(name: str) -> None: - if name not in Registry._REGISTRY: + def delete(self, name: str) -> None: + if name not in self.registry: raise ValueError(f"Dataset {name} not found.") - del Registry._REGISTRY[name] + del self.registry[name] - @staticmethod - def reset() -> None: - Registry._REGISTRY = {} + def reset(self) -> None: + self.registry = {} diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py index 084a620a74..dedf32ac3a 100644 --- a/llama_stack/distribution/registry/scorers/__init__.py +++ b/llama_stack/distribution/registry/scorers/__init__.py @@ -9,10 +9,7 @@ from ..registry import Registry - -class ScorerRegistry(Registry[BaseScorer]): - _REGISTRY: Dict[str, BaseScorer] = {} - +ScorerRegistry = Registry[BaseScorer]() SCORER_REGISTRY = { "accuracy": AccuracyScorer, diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 1d703a27ce..abd1938ada 100644 --- 
a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -71,6 +71,10 @@ async def run_scorer( dataset_config: EvaluateDatasetConfig, eval_scoring_config: EvaluateScoringConfig, ) -> EvaluateResponse: + cprint("run_scorer") + + # main logic, we need to convert the datset into List[ScorerInputSample] + return EvaluateResponse( eval_result={}, ) diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py index 83460bb0c5..fc2d9eb642 100644 --- a/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py +++ b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py @@ -153,35 +153,9 @@ def postprocess_sample( break return ScorerInputSample( + generated_answer=extracted_answer, + expected_answer=dataset_sample.data["Answer"], generation_output=PostprocessedGeneration( completion_message=response_text, - transformed_generation=extracted_answer, ), - expected_output=dataset_sample.data["Answer"], ) - - # def score_sample(self, sample: ProcessedDictSample) -> SingleEvalResult: - # postprocessed_output = sample.postprocessed["postprocessed"] - # expected_answer = sample.data["Answer"] - - # extracted_answer = None - # for answer_regex in MULTILINGUAL_ANSWER_REGEXES: - # regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) - # match = re.search(regex, postprocessed_output) - # if match: - # extracted_answer = normalize_extracted_answer(match.group(1)) - # break - - # score = 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 - - # return SingleEvalResult( - # score_data={ - # "score": score, - # }, - # ) - - # def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: - # print("aggregate_results", eval_results) - # sum_score = sum([result.score_data["score"] for result in eval_results]) - - # return EvalResult(metrics={"score": str(sum_score / len(eval_results))}) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py index 48d8caa3fa..6099353a87 100644 --- a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py @@ -28,8 +28,8 @@ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: class AccuracyScorer(BaseScorer[ScorerInputSample]): def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: - extracted_answer = scorer_input_sample.generation_output.transformed_generation - expected_answer = scorer_input_sample.expected_output + extracted_answer = scorer_input_sample.generated_answer + expected_answer = scorer_input_sample.expected_answer accuracy = ( 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 From a22c31b8a4329948d22154307e769cfb56fc870d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 16:25:06 -0700 Subject: [PATCH 16/27] processor registry --- llama_stack/apis/evals/evals.py | 13 +++++++++---- .../registry/generator_processors/__init__.py | 11 +++++++++-- .../providers/impls/meta_reference/evals/evals.py | 3 +++ .../meta_reference/evals/processor/__init__.py | 1 + .../impls/meta_reference/evals/scorer/__init__.py | 2 ++ .../meta_reference/evals/tasks/run_eval_task.py | 13 +++++++++---- 6 files changed, 33 insertions(+), 10 
deletions(-) diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index fb3aa6cd4d..ea985ad3b2 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -89,15 +89,21 @@ class EvaluatePostprocessConfig(BaseModel): kwargs: Optional[Dict[str, Any]] = None +@json_schema_type +class EvaluateProcessorConfig(BaseModel): + processor_identifier: str + preprocess_config: Optional[EvaluatePreprocessConfig] = None + postprocess_config: Optional[EvaluatePostprocessConfig] = None + + @json_schema_type class EvaluateJudgeScoringConfig(BaseModel): ... @json_schema_type class LLMJudgeConfig(BaseModel): - judge_preprocess_config: EvaluatePreprocessConfig + judge_processor_config: EvaluateProcessorConfig judge_model_generation_config: EvaluateModelGenerationConfig - judge_postprocess_config: EvaluatePostprocessConfig judge_scoring_config: EvaluateJudgeScoringConfig @@ -116,9 +122,8 @@ class EvaluateScoringConfig(BaseModel): @json_schema_type class EvaluateTaskConfig(BaseModel): dataset_config: EvaluateDatasetConfig - preprocess_config: Optional[EvaluatePreprocessConfig] = None + processor_config: EvaluateProcessorConfig generation_config: EvaluateModelGenerationConfig - postprocess_config: Optional[EvaluatePostprocessConfig] = None scoring_config: EvaluateScoringConfig diff --git a/llama_stack/distribution/registry/generator_processors/__init__.py b/llama_stack/distribution/registry/generator_processors/__init__.py index bb9d5c1824..44972cf03e 100644 --- a/llama_stack/distribution/registry/generator_processors/__init__.py +++ b/llama_stack/distribution/registry/generator_processors/__init__.py @@ -4,9 +4,16 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.processor import * # noqa: F403 from ..registry import Registry +# TODO: decide whether we should group dataset+processor together via Tasks +GeneratorProcessorRegistry = Registry[BaseGeneratorProcessor]() -class GeneratorProcessorRegistry(Registry[BaseGeneratorProcessor]): - _REGISTRY: Dict[str, BaseGeneratorProcessor] = {} +PROCESSOR_REGISTRY = { + "mmlu": MMLUProcessor, +} + +for k, v in PROCESSOR_REGISTRY.items(): + GeneratorProcessorRegistry.register(k, v) diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index abd1938ada..80bf2dd7ad 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -47,6 +47,9 @@ async def run_eval_task( dataset_name=dataset, row_limit=3, ), + processor_config=EvaluateProcessorConfig( + processor_identifier="mmlu", + ), generation_config=EvaluateModelGenerationConfig( model=model, ), diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py index 756f351d88..f782f9320a 100644 --- a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py +++ b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py @@ -3,3 +3,4 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
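# ---------------------------------------------------------------------------
# Editorial sketch (not part of this patch): how a processor registered under
# "mmlu" is resolved at eval time, mirroring run_eval_task.py in this commit.
# `dataset` is assumed to be a loaded dataset from DatasetRegistry; the
# postprocessing step is described only in comments since its exact call shape
# is an assumption here.
# ---------------------------------------------------------------------------
# processor_cls = GeneratorProcessorRegistry.get("mmlu")
# processor = processor_cls()
# preprocessed = processor.preprocess(dataset)     # dataset rows -> generation inputs
# ...generation then runs over `preprocessed` (e.g. via InferenceGenerator),
# after which the processor's postprocessing turns raw completions into a
# List[ScorerInputSample] for scoring (see run_eval_task.py below).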
+from .mmlu_processor import MMLUProcessor # noqa: F401 diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py index 756f351d88..6424963f87 100644 --- a/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py @@ -3,3 +3,5 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from .basic_scorers import * # noqa: F401 F403 +from .aggregate_scorer import * # noqa: F401 F403 diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index 48c4509141..83f6264c0d 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -4,15 +4,17 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. from llama_stack.distribution.registry.datasets import DatasetRegistry +from llama_stack.distribution.registry.generator_processors import ( + GeneratorProcessorRegistry, +) from llama_stack.distribution.registry.scorers import ScorerRegistry + from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import ( InferenceGenerator, ) -from llama_stack.providers.impls.meta_reference.evals.processor.mmlu_processor import ( - MMLUProcessor, -) + from llama_stack.apis.evals import * # noqa: F403 from llama_stack.apis.inference import * # noqa: F403 @@ -46,7 +48,10 @@ async def run( print(f"Running on {len(dataset)} samples") # F1 - processor = MMLUProcessor() + print(GeneratorProcessorRegistry.names()) + processor = GeneratorProcessorRegistry.get( + eval_task_config.processor_config.processor_identifier + )() preprocessed = processor.preprocess(dataset) # Generation From fcb8dea1ef8ba17b0f3df12e85f68d8f5a9e4b16 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 17:46:29 -0700 Subject: [PATCH 17/27] scorer only api --- llama_stack/apis/datasets/datasets.py | 10 ++- llama_stack/apis/evals/client.py | 80 +++++++++++++++---- llama_stack/apis/evals/evals.py | 2 +- .../registry/datasets/dataset_wrappers.py | 15 +++- .../impls/meta_reference/evals/evals.py | 9 ++- .../evals/scorer/basic_scorers.py | 11 ++- .../evals/tasks/run_eval_task.py | 4 +- .../evals/tasks/run_scoring_task.py | 80 +++++++++++++++++++ 8 files changed, 184 insertions(+), 27 deletions(-) create mode 100644 llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 0f4354c3fc..2b54ac8f66 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -92,8 +92,14 @@ class HuggingfaceDatasetDef(BaseModel): identifier: str = Field( description="A unique name for the dataset", ) - dataset_name: str = Field( - description="The name of the dataset into HF (e.g. hellawag)", + dataset_path: str = Field( + description="The name of the dataset into HF (e.g. 
meta-llama/Llama-3.1-8B-Instruct-evals)", + ) + dataset_name: Optional[str] = Field( + description="The name of the dataset into HF (e.g. Llama-3.1-8B-Instruct-evals__ifeval__strict__details)", + ) + rename_columns_map: Optional[Dict[str, str]] = Field( + description="A map of column names to rename to fit the schema of eval dataset for scoring", ) kwargs: Dict[str, Any] = Field( description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)", diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index d61de8c39b..e7c5a475df 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -51,34 +51,84 @@ async def run_evals( response.raise_for_status() return EvaluateResponse(**response.json()) + async def run_scorer( + self, + dataset_config: EvaluateDatasetConfig, + eval_scoring_config: EvaluateScoringConfig, + ) -> EvaluateResponse: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/evals/run_scorer", + json={ + "dataset_config": json.loads(dataset_config.json()), + "eval_scoring_config": json.loads(eval_scoring_config.json()), + }, + headers={"Content-Type": "application/json"}, + timeout=3600, + ) + response.raise_for_status() + return EvaluateResponse(**response.json()) + async def run_main(host: str, port: int): client = EvaluationClient(f"http://{host}:{port}") dataset_client = DatasetsClient(f"http://{host}:{port}") - # Custom Eval Task + # Full Eval Task - # 1. register custom dataset + # # 1. register custom dataset + # response = await dataset_client.create_dataset( + # dataset_def=CustomDatasetDef( + # identifier="mmlu-simple-eval-en", + # url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + # ), + # ) + # cprint(f"datasets/create: {response}", "cyan") + + # # 2. run evals on the registered dataset + # response = await client.run_evals( + # model="Llama3.1-8B-Instruct", + # dataset="mmlu-simple-eval-en", + # task="mmlu", + # ) + + # if response.formatted_report: + # cprint(response.formatted_report, "green") + # else: + # cprint(f"Response: {response}", "green") + + # Scoring Task + + # 1. register huggingface dataset response = await dataset_client.create_dataset( - dataset_def=CustomDatasetDef( - identifier="mmlu-simple-eval-en", - url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", - ), + dataset_def=HuggingfaceDatasetDef( + identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals", + dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + rename_columns_map={ + "output_parsed_answer": "generated_answer", + "input_correct_responses": "expected_answer", + }, + kwargs={"split": "latest"}, + ) ) - cprint(f"datasets/create: {response}", "cyan") + cprint(response, "cyan") # 2. 
run evals on the registered dataset - response = await client.run_evals( - model="Llama3.1-8B-Instruct", - dataset="mmlu-simple-eval-en", - task="mmlu", + response = await client.run_scorer( + dataset_config=EvaluateDatasetConfig( + dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + row_limit=10, + ), + eval_scoring_config=EvaluateScoringConfig( + scorer_config_list=[ + EvaluateSingleScorerConfig(scorer_name="accuracy"), + ] + ), ) - if response.formatted_report: - cprint(response.formatted_report, "green") - else: - cprint(f"Response: {response}", "green") + cprint(response, "green") # Eleuther Eval Task # response = await client.run_evals( diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index ea985ad3b2..6a3ed8ce28 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -66,7 +66,7 @@ class EvaluationJobCreateResponse(BaseModel): @json_schema_type class EvaluateDatasetConfig(BaseModel): # identifier to previously registered dataset via DatasetDef - dataset_name: str + dataset_identifier: str # limit number of rows to evaluate row_limit: Optional[int] = None kwargs: Optional[Dict[str, Any]] = None diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py index e18165a110..88a487d602 100644 --- a/llama_stack/distribution/registry/datasets/dataset_wrappers.py +++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py @@ -72,7 +72,18 @@ def __len__(self): self.load() return len(self.dataset) - def load(self): + def load(self, n_samples: Optional[int] = None): if self.dataset: return - self.dataset = load_dataset(self.config.dataset_name, **self.config.kwargs) + + if self.config.dataset_name: + self.config.kwargs["name"] = self.config.dataset_name + + self.dataset = load_dataset(self.config.dataset_path, **self.config.kwargs) + + if n_samples: + self.dataset = self.dataset.select(range(n_samples)) + + if self.config.rename_columns_map: + for k, v in self.config.rename_columns_map.items(): + self.dataset = self.dataset.rename_column(k, v) diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 80bf2dd7ad..916e40e3ac 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -13,6 +13,7 @@ from .config import MetaReferenceEvalsImplConfig from .tasks.run_eval_task import RunEvalTask +from .tasks.run_scoring_task import RunScoringTask class MetaReferenceEvalsImpl(Evals): @@ -44,7 +45,7 @@ async def run_eval_task( # construct eval task config from inputs eval_task_config = EvaluateTaskConfig( dataset_config=EvaluateDatasetConfig( - dataset_name=dataset, + dataset_identifier=dataset, row_limit=3, ), processor_config=EvaluateProcessorConfig( @@ -76,8 +77,10 @@ async def run_scorer( ) -> EvaluateResponse: cprint("run_scorer") - # main logic, we need to convert the datset into List[ScorerInputSample] + run_task = RunScoringTask() + eval_result = await run_task.run(dataset_config, eval_scoring_config) return EvaluateResponse( - eval_result={}, + eval_result=eval_result, + formatted_report=json.dumps(eval_result.json(), indent=4), ) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py index 6099353a87..748f9fc1f8 100644 --- 
a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py @@ -31,9 +31,14 @@ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResu extracted_answer = scorer_input_sample.generated_answer expected_answer = scorer_input_sample.expected_answer - accuracy = ( - 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 - ) + if isinstance(expected_answer, list): + accuracy = ( + 1.0 if extracted_answer and extracted_answer in expected_answer else 0.0 + ) + else: + accuracy = ( + 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 + ) return SingleEvalResult(score_data={"accuracy": accuracy}) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index 83f6264c0d..bcd842c420 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -43,7 +43,9 @@ async def run( print(f"Running eval task w/ {eval_task_config}") print(DatasetRegistry.names()) - dataset = DatasetRegistry.get(eval_task_config.dataset_config.dataset_name) + dataset = DatasetRegistry.get( + eval_task_config.dataset_config.dataset_identifier + ) dataset.load(n_samples=eval_task_config.dataset_config.row_limit) print(f"Running on {len(dataset)} samples") diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py new file mode 100644 index 0000000000..f856debe95 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py @@ -0,0 +1,80 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
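# ---------------------------------------------------------------------------
# Editorial note (illustration only, not part of this patch): scoring-only runs
# assume the registered dataset already contains model outputs. Per the
# ScorerInputSample schema in this series, a minimal row looks roughly like:
#
#   {
#       "input_query": "What is the capital of France?",   # optional
#       "generated_answer": "Paris",
#       "expected_answer": "Paris",
#   }
#
# The concrete values are invented; only the column names come from the series.
# ---------------------------------------------------------------------------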
+from llama_stack.distribution.registry.datasets import DatasetRegistry +from llama_stack.distribution.registry.scorers import ScorerRegistry + +from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 + +from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.inference import * # noqa: F403 +from termcolor import cprint + + +class RunScoringTask(BaseTask): + """ + RunScoringTask - only run scoring (F3) based on dataset and scoring config + """ + + def __init__( + self, + *args, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + + def transform_score_input_sample( + self, dataset: BaseDataset + ) -> List[ScorerInputSample]: + scorer_inputs = [] + for x in dataset: + expected_answer = x.data["expected_answer"] + generated_answer = x.data["generated_answer"] + + scorer_inputs.append( + ScorerInputSample( + expected_answer=expected_answer, + generated_answer=generated_answer, + ) + ) + + return scorer_inputs + + async def run( + self, + dataset_config: EvaluateDatasetConfig, + eval_scoring_config: EvaluateScoringConfig, + *args, + **kwargs, + ) -> EvalResult: + print( + f"Running scoring task w/ dataset={dataset_config} scoring={eval_scoring_config}" + ) + + dataset = DatasetRegistry.get(dataset_config.dataset_identifier) + dataset.load(n_samples=dataset_config.row_limit) + print(f"Running on {len(dataset)} samples") + + # transform dataset into + postprocessed = self.transform_score_input_sample(dataset) + cprint(postprocessed, "blue") + + # F3 - scorer + scorer_config_list = eval_scoring_config.scorer_config_list + scorer_list = [] + for s_conf in scorer_config_list: + scorer = ScorerRegistry.get(s_conf.scorer_name) + scorer_list.append(scorer()) + + scorer = AggregateScorer( + scorers=scorer_list, + ) + + scorer_results = scorer.score(postprocessed) + cprint(scorer_results, "magenta") + eval_result = scorer.aggregate_results(scorer_results) + + return eval_result From c8f6849291eb7db098930fc81e462e8956688a44 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 20:42:22 -0700 Subject: [PATCH 18/27] full accuracy --- llama_stack/apis/evals/client.py | 6 +++--- llama_stack/apis/evals/evals.py | 12 ------------ 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index e7c5a475df..1db7afac19 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -99,7 +99,6 @@ async def run_main(host: str, port: int): # cprint(f"Response: {response}", "green") # Scoring Task - # 1. 
register huggingface dataset response = await dataset_client.create_dataset( dataset_def=HuggingfaceDatasetDef( @@ -119,7 +118,7 @@ async def run_main(host: str, port: int): response = await client.run_scorer( dataset_config=EvaluateDatasetConfig( dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", - row_limit=10, + # row_limit=10, ), eval_scoring_config=EvaluateScoringConfig( scorer_config_list=[ @@ -128,7 +127,8 @@ async def run_main(host: str, port: int): ), ) - cprint(response, "green") + for k, v in response.eval_result.metrics.items(): + cprint(f"{k}: {v}", "green") # Eleuther Eval Task # response = await client.run_evals( diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 6a3ed8ce28..a02394ee40 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -49,13 +49,6 @@ class EvaluationJobStatusResponse(BaseModel): job_uuid: str -@json_schema_type -class EvaluationJobArtifactsResponse(BaseModel): - """Artifacts of a evaluation job.""" - - job_uuid: str - - @json_schema_type class EvaluationJobCreateResponse(BaseModel): """Response to create a evaluation job.""" @@ -267,8 +260,3 @@ async def run_scorer( # @webmethod(route="/evals/job/cancel") # def cancel_evaluation_job(self, job_uuid: str) -> None: ... - - # @webmethod(route="/evals/job/artifacts") - # def get_evaluation_job_artifacts( - # self, job_uuid: str - # ) -> EvaluationJobArtifactsResponse: ... From 7b5895003ab2c6feed29e3e960e400b9cc0ab15d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 21:09:59 -0700 Subject: [PATCH 19/27] braintrust scorer --- llama_stack/apis/datasets/datasets.py | 3 + .../distribution/registry/scorers/__init__.py | 4 ++ .../evals/scorer/braintrust_scorer.py | 57 +++++++++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 2b54ac8f66..ee270b2910 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -73,10 +73,13 @@ class ScorerInputSample(DatasetSample): A dataset is required to have the following columns to be used for scoring: - generated_answer: str - expected_answer: Union[str, List[str]] + - (optional) input_query: str + - (optional) generation_output: PostprocessedGeneration """ generated_answer: str expected_answer: Union[str, List[str]] + input_query: Optional[str] = None generation_output: Optional[PostprocessedGeneration] = None diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py index dedf32ac3a..60e03b2fef 100644 --- a/llama_stack/distribution/registry/scorers/__init__.py +++ b/llama_stack/distribution/registry/scorers/__init__.py @@ -6,14 +6,18 @@ # TODO: make these import config based from llama_stack.apis.evals import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.braintrust_scorer import * # noqa: F403 from ..registry import Registry +# TODO: make these import config based ScorerRegistry = Registry[BaseScorer]() SCORER_REGISTRY = { "accuracy": AccuracyScorer, "random": RandomScorer, + "braintrust::factuality": BrainTrustFactualityScorer, + "braintrust::answer-correctness": BraintrustAnswerCorrectnessScorer, } for k, v in SCORER_REGISTRY.items(): diff --git 
a/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py new file mode 100644 index 0000000000..5dd4eb383e --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py @@ -0,0 +1,57 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import numpy as np + +from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult +from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 +from autoevals.llm import * # noqa: F403 +from autoevals.ragas import * # noqa: F403 + + +class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]): + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + input_query = scorer_input_sample.input_query + extracted_answer = scorer_input_sample.generated_answer + expected_answer = scorer_input_sample.expected_answer + + evaluator = Factuality() + result = evaluator(output, expected, input=input_query) + factuality = result.score + return SingleEvalResult(score_data={"factuality": factuality}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + avg_score = np.average( + [result.score_data["factuality"] for result in eval_results] + ) + + return EvalResult( + metrics={ + "avg_factuality_score": avg_score, + } + ) + + +class BraintrustAnswerCorrectnessScorer(BaseScorer[ScorerInputSample]): + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + input_query = scorer_input_sample.input_query + extracted_answer = scorer_input_sample.generated_answer + expected_answer = scorer_input_sample.expected_answer + + evaluator = AnswerCorrectness() + result = evaluator(output, expected, input=input_query) + correctness = result.score + return SingleEvalResult(score_data={"answer_correctness": correctness}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + avg_score = np.average( + [result.score_data["answer_correctness"] for result in eval_results] + ) + + return EvalResult( + metrics={ + "avg_correctness_score": avg_score, + } + ) From 3c29108b6ed107b41e1c887f9276ebad95f267be Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 21:17:16 -0700 Subject: [PATCH 20/27] input query optional input for braintrust scorer --- llama_stack/apis/evals/client.py | 2 +- llama_stack/distribution/registry/scorers/__init__.py | 2 +- .../impls/meta_reference/evals/tasks/run_scoring_task.py | 5 ++++- llama_stack/providers/registry/evals.py | 2 ++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index 1db7afac19..b795477132 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -118,7 +118,7 @@ async def run_main(host: str, port: int): response = await client.run_scorer( dataset_config=EvaluateDatasetConfig( dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", - # row_limit=10, + row_limit=10, ), eval_scoring_config=EvaluateScoringConfig( scorer_config_list=[ diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py index 60e03b2fef..7cbe2a4262 100644 --- a/llama_stack/distribution/registry/scorers/__init__.py +++ 
b/llama_stack/distribution/registry/scorers/__init__.py @@ -16,7 +16,7 @@ SCORER_REGISTRY = { "accuracy": AccuracyScorer, "random": RandomScorer, - "braintrust::factuality": BrainTrustFactualityScorer, + "braintrust::factuality": BraintrustFactualityScorer, "braintrust::answer-correctness": BraintrustAnswerCorrectnessScorer, } diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py index f856debe95..9e4821a73b 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py @@ -33,11 +33,15 @@ def transform_score_input_sample( for x in dataset: expected_answer = x.data["expected_answer"] generated_answer = x.data["generated_answer"] + input_query = None + if "input_query" in x.data: + input_query = x.data["input_query"] scorer_inputs.append( ScorerInputSample( expected_answer=expected_answer, generated_answer=generated_answer, + input_query=input_query, ) ) @@ -74,7 +78,6 @@ async def run( ) scorer_results = scorer.score(postprocessed) - cprint(scorer_results, "magenta") eval_result = scorer.aggregate_results(scorer_results) return eval_result diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py index 8693ec603a..6ea4c16f56 100644 --- a/llama_stack/providers/registry/evals.py +++ b/llama_stack/providers/registry/evals.py @@ -20,6 +20,8 @@ def available_providers() -> List[ProviderSpec]: "pandas", "scikit-learn", "datasets", + "numpy", + "autoevals", ], module="llama_stack.providers.impls.meta_reference.evals", config_class="llama_stack.providers.impls.meta_reference.evals.MetaReferenceEvalsImplConfig", From ec6c63ba5713533991080026de917afa24bd1284 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 23:36:15 -0700 Subject: [PATCH 21/27] dataset accept file uploads --- llama_stack/apis/evals/client.py | 37 ++++++++++++++++--- .../registry/datasets/dataset_wrappers.py | 25 ++++++++++++- .../evals/tasks/run_scoring_task.py | 4 +- llama_stack/providers/registry/evals.py | 1 + 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index b795477132..7d812817b4 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -12,9 +12,28 @@ from termcolor import cprint from .evals import * # noqa: F403 +import base64 +import mimetypes +import os + from ..datasets.client import DatasetsClient +def data_url_from_file(file_path: str) -> str: + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + with open(file_path, "rb") as file: + file_content = file.read() + + base64_content = base64.b64encode(file_content).decode("utf-8") + mime_type, _ = mimetypes.guess_type(file_path) + + data_url = f"data:{mime_type};base64,{base64_content}" + + return data_url + + class EvaluationClient(Evals): def __init__(self, base_url: str): self.base_url = base_url @@ -70,9 +89,8 @@ async def run_scorer( return EvaluateResponse(**response.json()) -async def run_main(host: str, port: int): +async def run_main(host: str, port: int, eval_dataset_path: str = ""): client = EvaluationClient(f"http://{host}:{port}") - dataset_client = DatasetsClient(f"http://{host}:{port}") # Full Eval Task @@ -114,10 +132,19 @@ async def run_main(host: str, port: int): ) cprint(response, "cyan") + response = await dataset_client.create_dataset( 
+ dataset_def=CustomDatasetDef( + identifier="rag-evals", + url=data_url_from_file(eval_dataset_path), + ) + ) + cprint(response, "cyan") + # 2. run evals on the registered dataset response = await client.run_scorer( dataset_config=EvaluateDatasetConfig( - dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + dataset_identifier="rag-evals", + # dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", row_limit=10, ), eval_scoring_config=EvaluateScoringConfig( @@ -141,8 +168,8 @@ async def run_main(host: str, port: int): # ) -def main(host: str, port: int): - asyncio.run(run_main(host, port)) +def main(host: str, port: int, eval_dataset_path: str = ""): + asyncio.run(run_main(host, port, eval_dataset_path)) if __name__ == "__main__": diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py index 88a487d602..410ad394a3 100644 --- a/llama_stack/distribution/registry/datasets/dataset_wrappers.py +++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py @@ -3,10 +3,13 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import io + import pandas from datasets import Dataset, load_dataset from llama_stack.apis.datasets import * # noqa: F403 +from llama_stack.providers.utils.memory.vector_store import parse_data_url class CustomDataset(BaseDataset[DictSample]): @@ -37,11 +40,31 @@ def load(self, n_samples: Optional[int] = None) -> None: if self.dataset: return - # TODO: better support w/ data url + # TODO: more robust support w/ data url if self.config.url.endswith(".csv"): df = pandas.read_csv(self.config.url) elif self.config.url.endswith(".xlsx"): df = pandas.read_excel(self.config.url) + elif self.config.url.startswith("data:"): + parts = parse_data_url(self.config.url) + data = parts["data"] + if parts["is_base64"]: + data = base64.b64decode(data) + else: + data = unquote(data) + encoding = parts["encoding"] or "utf-8" + data = data.encode(encoding) + + mime_type = parts["mimetype"] + mime_category = mime_type.split("/")[0] + data_bytes = io.BytesIO(data) + + if mime_category == "text": + df = pandas.read_csv(data_bytes) + else: + df = pandas.read_excel(data_bytes) + else: + raise ValueError(f"Unsupported file type: {self.config.url}") if n_samples is not None: df = df.sample(n=n_samples) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py index 9e4821a73b..9ff6cde4d6 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py @@ -11,7 +11,6 @@ from llama_stack.apis.evals import * # noqa: F403 from llama_stack.apis.inference import * # noqa: F403 -from termcolor import cprint class RunScoringTask(BaseTask): @@ -62,9 +61,8 @@ async def run( dataset.load(n_samples=dataset_config.row_limit) print(f"Running on {len(dataset)} samples") - # transform dataset into + # transform dataset into List[ScorerInputSample] postprocessed = self.transform_score_input_sample(dataset) - cprint(postprocessed, "blue") # F3 - scorer scorer_config_list = eval_scoring_config.scorer_config_list diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py index 6ea4c16f56..a8a7e735ff 100644 --- a/llama_stack/providers/registry/evals.py +++ 
b/llama_stack/providers/registry/evals.py @@ -22,6 +22,7 @@ def available_providers() -> List[ProviderSpec]: "datasets", "numpy", "autoevals", + "openpyxl", ], module="llama_stack.providers.impls.meta_reference.evals", config_class="llama_stack.providers.impls.meta_reference.evals.MetaReferenceEvalsImplConfig", From 9cc0a54f0be8c31061ff0ed19e866bcb5fb7bdbc Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 15 Oct 2024 00:42:03 -0700 Subject: [PATCH 22/27] rag correctness scorer w/ custom dataset --- llama_stack/apis/datasets/datasets.py | 5 +++++ llama_stack/apis/evals/client.py | 6 ++++++ .../distribution/registry/datasets/dataset_wrappers.py | 3 +++ .../meta_reference/evals/scorer/braintrust_scorer.py | 8 ++++---- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index ee270b2910..c0aa4d161e 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -103,6 +103,7 @@ class HuggingfaceDatasetDef(BaseModel): ) rename_columns_map: Optional[Dict[str, str]] = Field( description="A map of column names to rename to fit the schema of eval dataset for scoring", + default=None, ) kwargs: Dict[str, Any] = Field( description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)", @@ -119,6 +120,10 @@ class CustomDatasetDef(BaseModel): url: str = Field( description="The URL to the dataset", ) + rename_columns_map: Optional[Dict[str, str]] = Field( + description="A map of column names to rename to fit the schema of eval dataset for scoring", + default=None, + ) DatasetDef = Annotated[ diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index 7d812817b4..07877c13e5 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -136,6 +136,9 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""): dataset_def=CustomDatasetDef( identifier="rag-evals", url=data_url_from_file(eval_dataset_path), + rename_columns_map={ + "query": "input_query", + }, ) ) cprint(response, "cyan") @@ -150,6 +153,9 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""): eval_scoring_config=EvaluateScoringConfig( scorer_config_list=[ EvaluateSingleScorerConfig(scorer_name="accuracy"), + EvaluateSingleScorerConfig( + scorer_name="braintrust::answer-correctness" + ), ] ), ) diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py index 410ad394a3..93cbd9ab21 100644 --- a/llama_stack/distribution/registry/datasets/dataset_wrappers.py +++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py @@ -70,6 +70,9 @@ def load(self, n_samples: Optional[int] = None) -> None: df = df.sample(n=n_samples) self.dataset = Dataset.from_pandas(df) + if self.config.rename_columns_map: + for k, v in self.config.rename_columns_map.items(): + self.dataset = self.dataset.rename_column(k, v) class HuggingfaceDataset(BaseDataset[DictSample]): diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py index 5dd4eb383e..c124aaad6a 100644 --- a/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py @@ -14,11 +14,11 @@ class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]): def score_sample(self, 
scorer_input_sample: ScorerInputSample) -> SingleEvalResult: input_query = scorer_input_sample.input_query - extracted_answer = scorer_input_sample.generated_answer + generated_answer = scorer_input_sample.generated_answer expected_answer = scorer_input_sample.expected_answer evaluator = Factuality() - result = evaluator(output, expected, input=input_query) + result = evaluator(generated_answer, expected_answer, input=input_query) factuality = result.score return SingleEvalResult(score_data={"factuality": factuality}) @@ -37,11 +37,11 @@ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: class BraintrustAnswerCorrectnessScorer(BaseScorer[ScorerInputSample]): def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: input_query = scorer_input_sample.input_query - extracted_answer = scorer_input_sample.generated_answer + generated_answer = scorer_input_sample.generated_answer expected_answer = scorer_input_sample.expected_answer evaluator = AnswerCorrectness() - result = evaluator(output, expected, input=input_query) + result = evaluator(generated_answer, expected_answer, input=input_query) correctness = result.score return SingleEvalResult(score_data={"answer_correctness": correctness}) From d2b62157a3ef7aa6461e5c1857924578890355d9 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 15 Oct 2024 00:44:54 -0700 Subject: [PATCH 23/27] openapi gen --- docs/openapi_generator/generate.py | 4 +- docs/resources/llama-stack-spec.html | 1771 +++++++++-------- docs/resources/llama-stack-spec.yaml | 594 +++--- llama_stack/apis/datasets/datasets.py | 20 + .../apis/post_training/post_training.py | 2 +- 5 files changed, 1370 insertions(+), 1021 deletions(-) diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py index 871c01a80f..994b06e583 100644 --- a/docs/openapi_generator/generate.py +++ b/docs/openapi_generator/generate.py @@ -33,7 +33,7 @@ from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.agents import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 from llama_stack.apis.evals import * # noqa: F403 from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.batch_inference import * # noqa: F403 @@ -61,7 +61,7 @@ class LlamaStack( Telemetry, PostTraining, Memory, - Evaluations, + Evals, Models, Shields, Inspect, diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index 96ef7e4bb0..ac75dbf049 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "0.0.1", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-09 21:10:09.073430" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. 
The specification is still in draft and subject to change.\n Generated at 2024-10-15 00:44:26.278642" }, "servers": [ { @@ -109,39 +109,6 @@ } } }, - "/evaluate/job/cancel": { - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Evaluations" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CancelEvaluationJobRequest" - } - } - }, - "required": true - } - } - }, "/post_training/job/cancel": { "post": { "responses": { @@ -355,7 +322,7 @@ "200": { "description": "OK", "content": { - "application/json": { + "text/event-stream": { "schema": { "$ref": "#/components/schemas/AgentTurnResponseStreamChunk" } @@ -393,7 +360,14 @@ "post": { "responses": { "200": { - "description": "OK" + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateDatasetResponse" + } + } + } } }, "tags": [ @@ -489,119 +463,6 @@ } }, "/datasets/delete": { - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Datasets" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/DeleteDatasetRequest" - } - } - }, - "required": true - } - } - }, - "/inference/embeddings": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EmbeddingsResponse" - } - } - } - } - }, - "tags": [ - "Inference" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EmbeddingsRequest" - } - } - }, - "required": true - } - } - }, - "/evaluate/question_answering/": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluationJob" - } - } - } - } - }, - "tags": [ - "Evaluations" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateQuestionAnsweringRequest" - } - } - }, - "required": true - } - } - }, - "/evaluate/summarization/": { "post": { "responses": { "200": { @@ -609,14 +470,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJob" + "$ref": "#/components/schemas/DeleteDatasetResponse" } } } } }, "tags": [ - "Evaluations" + "Datasets" ], "parameters": [ { @@ -633,7 +494,7 @@ "content": { "application/json": { "schema": { - "$ref": 
"#/components/schemas/EvaluateSummarizationRequest" + "$ref": "#/components/schemas/DeleteDatasetRequest" } } }, @@ -641,7 +502,7 @@ } } }, - "/evaluate/text_generation/": { + "/inference/embeddings": { "post": { "responses": { "200": { @@ -649,14 +510,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJob" + "$ref": "#/components/schemas/EmbeddingsResponse" } } } } }, "tags": [ - "Evaluations" + "Inference" ], "parameters": [ { @@ -673,7 +534,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluateTextGenerationRequest" + "$ref": "#/components/schemas/EmbeddingsRequest" } } }, @@ -845,7 +706,21 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/TrainEvalDataset" + "oneOf": [ + { + "oneOf": [ + { + "$ref": "#/components/schemas/HuggingfaceDatasetDef" + }, + { + "$ref": "#/components/schemas/CustomDatasetDef" + } + ] + }, + { + "type": "null" + } + ] } } } @@ -856,7 +731,7 @@ ], "parameters": [ { - "name": "dataset_uuid", + "name": "dataset_identifier", "in": "query", "required": true, "schema": { @@ -875,7 +750,7 @@ ] } }, - "/evaluate/job/artifacts": { + "/memory_banks/get": { "get": { "responses": { "200": { @@ -883,18 +758,38 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJobArtifactsResponse" + "oneOf": [ + { + "oneOf": [ + { + "$ref": "#/components/schemas/VectorMemoryBankDef" + }, + { + "$ref": "#/components/schemas/KeyValueMemoryBankDef" + }, + { + "$ref": "#/components/schemas/KeywordMemoryBankDef" + }, + { + "$ref": "#/components/schemas/GraphMemoryBankDef" + } + ] + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "Evaluations" + "MemoryBanks" ], "parameters": [ { - "name": "job_uuid", + "name": "identifier", "in": "query", "required": true, "schema": { @@ -913,7 +808,7 @@ ] } }, - "/evaluate/job/logs": { + "/models/get": { "get": { "responses": { "200": { @@ -921,18 +816,25 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJobLogStream" + "oneOf": [ + { + "$ref": "#/components/schemas/ModelDefWithProvider" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "Evaluations" + "Models" ], "parameters": [ { - "name": "job_uuid", + "name": "identifier", "in": "query", "required": true, "schema": { @@ -951,7 +853,7 @@ ] } }, - "/evaluate/job/status": { + "/shields/get": { "get": { "responses": { "200": { @@ -959,18 +861,25 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJobStatusResponse" + "oneOf": [ + { + "$ref": "#/components/schemas/ShieldDefWithProvider" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "Evaluations" + "Shields" ], "parameters": [ { - "name": "job_uuid", + "name": "shield_type", "in": "query", "required": true, "schema": { @@ -989,24 +898,32 @@ ] } }, - "/evaluate/jobs": { + "/telemetry/get_trace": { "get": { "responses": { "200": { "description": "OK", "content": { - "application/jsonl": { + "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJob" + "$ref": "#/components/schemas/Trace" } } } } }, "tags": [ - "Evaluations" + "Telemetry" ], "parameters": [ + { + "name": "trace_id", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + }, { "name": "X-LlamaStack-ProviderData", "in": "header", @@ -1019,7 +936,7 @@ ] } }, - "/memory_banks/get": { + "/post_training/job/artifacts": { "get": { "responses": { "200": { @@ -1027,200 +944,14 @@ "content": { 
"application/json": { "schema": { - "oneOf": [ - { - "oneOf": [ - { - "$ref": "#/components/schemas/VectorMemoryBankDef" - }, - { - "$ref": "#/components/schemas/KeyValueMemoryBankDef" - }, - { - "$ref": "#/components/schemas/KeywordMemoryBankDef" - }, - { - "$ref": "#/components/schemas/GraphMemoryBankDef" - } - ] - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse" } } } } }, "tags": [ - "MemoryBanks" - ], - "parameters": [ - { - "name": "identifier", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/models/get": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/ModelDefWithProvider" - }, - { - "type": "null" - } - ] - } - } - } - } - }, - "tags": [ - "Models" - ], - "parameters": [ - { - "name": "identifier", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/shields/get": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/ShieldDefWithProvider" - }, - { - "type": "null" - } - ] - } - } - } - } - }, - "tags": [ - "Shields" - ], - "parameters": [ - { - "name": "shield_type", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/telemetry/get_trace": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Trace" - } - } - } - } - }, - "tags": [ - "Telemetry" - ], - "parameters": [ - { - "name": "trace_id", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/post_training/job/artifacts": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse" - } - } - } - } - }, - "tags": [ - "PostTraining" + "PostTraining" ], "parameters": [ { @@ -1412,6 +1143,43 @@ } } }, + "/datasets/list": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/jsonl": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/HuggingfaceDatasetDef" + }, + { + "$ref": "#/components/schemas/CustomDatasetDef" + } + ] + } + } + } + } + }, + "tags": [ + "Datasets" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be 
made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ] + } + }, "/memory_banks/list": { "get": { "responses": { @@ -1836,7 +1604,7 @@ } } }, - "/safety/run_shield": { + "/evals/run_eval_task": { "post": { "responses": { "200": { @@ -1844,14 +1612,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RunShieldResponse" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "Safety" + "Evals" ], "parameters": [ { @@ -1868,7 +1636,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RunShieldRequest" + "$ref": "#/components/schemas/RunEvalTaskRequest" } } }, @@ -1876,7 +1644,7 @@ } } }, - "/post_training/supervised_fine_tune": { + "/evals/run_scorer": { "post": { "responses": { "200": { @@ -1884,14 +1652,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/PostTrainingJob" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "PostTraining" + "Evals" ], "parameters": [ { @@ -1908,7 +1676,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SupervisedFineTuneRequest" + "$ref": "#/components/schemas/RunScorerRequest" } } }, @@ -1916,7 +1684,7 @@ } } }, - "/synthetic_data_generation/generate": { + "/safety/run_shield": { "post": { "responses": { "200": { @@ -1924,14 +1692,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SyntheticDataGenerationResponse" + "$ref": "#/components/schemas/RunShieldResponse" } } } } }, "tags": [ - "SyntheticDataGeneration" + "Safety" ], "parameters": [ { @@ -1948,54 +1716,134 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SyntheticDataGenerateRequest" + "$ref": "#/components/schemas/RunShieldRequest" } } }, "required": true } } - } - }, - "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", - "components": { - "schemas": { - "BuiltinTool": { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - "CompletionMessage": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "assistant", - "default": "assistant" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/components/schemas/ImageMedia" - }, - { - "type": "array", - "items": { - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/components/schemas/ImageMedia" - } - ] + }, + "/post_training/supervised_fine_tune": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/PostTrainingJob" + } + } + } + } + }, + "tags": [ + "PostTraining" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupervisedFineTuneRequest" + } + } + }, + "required": true + } + } + }, + "/synthetic_data_generation/generate": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SyntheticDataGenerationResponse" + } + } + } + } + }, + "tags": [ + "SyntheticDataGeneration" + ], + "parameters": [ + { + "name": 
"X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SyntheticDataGenerateRequest" + } + } + }, + "required": true + } + } + } + }, + "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", + "components": { + "schemas": { + "BuiltinTool": { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + "CompletionMessage": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "assistant", + "default": "assistant" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/ImageMedia" + }, + { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/ImageMedia" + } + ] } } ] @@ -2571,18 +2419,6 @@ "completion_message_batch" ] }, - "CancelEvaluationJobRequest": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, "CancelTrainingJobRequest": { "type": "object", "properties": { @@ -4090,19 +3926,58 @@ "error" ] }, - "TrainEvalDataset": { + "CustomDatasetDef": { "type": "object", "properties": { - "columns": { + "type": { + "type": "string", + "const": "custom", + "default": "custom" + }, + "identifier": { + "type": "string" + }, + "url": { + "type": "string" + }, + "rename_columns_map": { "type": "object", "additionalProperties": { - "$ref": "#/components/schemas/TrainEvalDatasetColumnType" + "type": "string" } + } + }, + "additionalProperties": false, + "required": [ + "type", + "identifier", + "url" + ] + }, + "HuggingfaceDatasetDef": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "huggingface", + "default": "huggingface" }, - "content_url": { - "$ref": "#/components/schemas/URL" + "identifier": { + "type": "string" }, - "metadata": { + "dataset_path": { + "type": "string" + }, + "dataset_name": { + "type": "string" + }, + "rename_columns_map": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "kwargs": { "type": "object", "additionalProperties": { "oneOf": [ @@ -4130,35 +4005,48 @@ }, "additionalProperties": false, "required": [ - "columns", - "content_url" - ], - "title": "Dataset to be used for training or evaluating language models." 
- }, - "TrainEvalDatasetColumnType": { - "type": "string", - "enum": [ - "dialog", - "text", - "media", - "number", - "json" + "type", + "identifier", + "dataset_path", + "kwargs" ] }, "CreateDatasetRequest": { "type": "object", "properties": { - "uuid": { - "type": "string" + "dataset_def": { + "oneOf": [ + { + "$ref": "#/components/schemas/HuggingfaceDatasetDef" + }, + { + "$ref": "#/components/schemas/CustomDatasetDef" + } + ] + } + }, + "additionalProperties": false, + "required": [ + "dataset_def" + ] + }, + "CreateDatasetResponse": { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "success", + "fail" + ] }, - "dataset": { - "$ref": "#/components/schemas/TrainEvalDataset" + "msg": { + "type": "string" } }, "additionalProperties": false, "required": [ - "uuid", - "dataset" + "status" ] }, "DeleteAgentsRequest": { @@ -4192,13 +4080,32 @@ "DeleteDatasetRequest": { "type": "object", "properties": { - "dataset_uuid": { + "dataset_identifier": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "dataset_identifier" + ] + }, + "DeleteDatasetResponse": { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "success", + "fail" + ] + }, + "msg": { "type": "string" } }, "additionalProperties": false, "required": [ - "dataset_uuid" + "status" ] }, "EmbeddingsRequest": { @@ -4258,112 +4165,42 @@ "embeddings" ] }, - "EvaluateQuestionAnsweringRequest": { + "GetAgentsSessionRequest": { "type": "object", "properties": { - "metrics": { + "turn_ids": { "type": "array", "items": { - "type": "string", - "enum": [ - "em", - "f1" - ] + "type": "string" } } }, - "additionalProperties": false, - "required": [ - "metrics" - ] + "additionalProperties": false }, - "EvaluationJob": { + "GraphMemoryBankDef": { "type": "object", "properties": { - "job_uuid": { + "identifier": { "type": "string" + }, + "provider_id": { + "type": "string", + "default": "" + }, + "type": { + "type": "string", + "const": "graph", + "default": "graph" } }, "additionalProperties": false, "required": [ - "job_uuid" + "identifier", + "provider_id", + "type" ] }, - "EvaluateSummarizationRequest": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "rouge", - "bleu" - ] - } - } - }, - "additionalProperties": false, - "required": [ - "metrics" - ] - }, - "EvaluateTextGenerationRequest": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "perplexity", - "rouge", - "bleu" - ] - } - } - }, - "additionalProperties": false, - "required": [ - "metrics" - ] - }, - "GetAgentsSessionRequest": { - "type": "object", - "properties": { - "turn_ids": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false - }, - "GraphMemoryBankDef": { - "type": "object", - "properties": { - "identifier": { - "type": "string" - }, - "provider_id": { - "type": "string", - "default": "" - }, - "type": { - "type": "string", - "const": "graph", - "default": "graph" - } - }, - "additionalProperties": false, - "required": [ - "identifier", - "provider_id", - "type" - ] - }, - "KeyValueMemoryBankDef": { + "KeyValueMemoryBankDef": { "type": "object", "properties": { "identifier": { @@ -4513,43 +4350,6 @@ "step" ] }, - "EvaluationJobArtifactsResponse": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ], - 
"title": "Artifacts of a evaluation job." - }, - "EvaluationJobLogStream": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, - "EvaluationJobStatusResponse": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, "ModelDefWithProvider": { "type": "object", "properties": { @@ -5265,6 +5065,61 @@ "dpo" ] }, + "TrainEvalDataset": { + "type": "object", + "properties": { + "columns": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/TrainEvalDatasetColumnType" + } + }, + "content_url": { + "$ref": "#/components/schemas/URL" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "columns", + "content_url" + ], + "title": "Dataset to be used for training or evaluating language models." + }, + "TrainEvalDatasetColumnType": { + "type": "string", + "enum": [ + "dialog", + "text", + "media", + "number", + "json" + ] + }, "TrainingConfig": { "type": "object", "properties": { @@ -5491,222 +5346,530 @@ "document_id": { "type": "string" } - }, - "additionalProperties": false, - "required": [ - "content", - "token_count", - "document_id" + }, + "additionalProperties": false, + "required": [ + "content", + "token_count", + "document_id" + ] + } + }, + "scores": { + "type": "array", + "items": { + "type": "number" + } + } + }, + "additionalProperties": false, + "required": [ + "chunks", + "scores" + ] + }, + "RegisterMemoryBankRequest": { + "type": "object", + "properties": { + "memory_bank": { + "oneOf": [ + { + "$ref": "#/components/schemas/VectorMemoryBankDef" + }, + { + "$ref": "#/components/schemas/KeyValueMemoryBankDef" + }, + { + "$ref": "#/components/schemas/KeywordMemoryBankDef" + }, + { + "$ref": "#/components/schemas/GraphMemoryBankDef" + } + ] + } + }, + "additionalProperties": false, + "required": [ + "memory_bank" + ] + }, + "RegisterModelRequest": { + "type": "object", + "properties": { + "model": { + "$ref": "#/components/schemas/ModelDefWithProvider" + } + }, + "additionalProperties": false, + "required": [ + "model" + ] + }, + "RegisterShieldRequest": { + "type": "object", + "properties": { + "shield": { + "$ref": "#/components/schemas/ShieldDefWithProvider" + } + }, + "additionalProperties": false, + "required": [ + "shield" + ] + }, + "DialogGenerations": { + "type": "object", + "properties": { + "dialog": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ] + } + }, + "sampled_generations": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "dialog", + "sampled_generations" + ] + }, + "RewardScoreRequest": { + "type": "object", + "properties": { + "dialog_generations": { + "type": 
"array", + "items": { + "$ref": "#/components/schemas/DialogGenerations" + } + }, + "model": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "dialog_generations", + "model" + ] + }, + "RewardScoringResponse": { + "type": "object", + "properties": { + "scored_generations": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ScoredDialogGenerations" + } + } + }, + "additionalProperties": false, + "required": [ + "scored_generations" + ], + "title": "Response from the reward scoring. Batch of (prompt, response, score) tuples that pass the threshold." + }, + "ScoredDialogGenerations": { + "type": "object", + "properties": { + "dialog": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ] + } + }, + "scored_generations": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ScoredMessage" + } + } + }, + "additionalProperties": false, + "required": [ + "dialog", + "scored_generations" + ] + }, + "ScoredMessage": { + "type": "object", + "properties": { + "message": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ] + }, + "score": { + "type": "number" + } + }, + "additionalProperties": false, + "required": [ + "message", + "score" + ] + }, + "EvaluateDatasetConfig": { + "type": "object", + "properties": { + "dataset_identifier": { + "type": "string" + }, + "row_limit": { + "type": "integer" + }, + "kwargs": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "dataset_identifier" + ] + }, + "EvaluateJudgeScoringConfig": { + "type": "object" + }, + "EvaluateModelGenerationConfig": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "kwargs": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "model", + "sampling_params" + ] + }, + "EvaluatePostprocessConfig": { + "type": "object", + "properties": { + "kwargs": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false + }, + "EvaluatePreprocessConfig": { + "type": "object", + "properties": { + "kwargs": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } ] } + } + }, + "additionalProperties": false + }, + "EvaluateProcessorConfig": { + "type": "object", + "properties": { + "processor_identifier": { + 
"type": "string" }, - "scores": { - "type": "array", - "items": { - "type": "number" - } + "preprocess_config": { + "$ref": "#/components/schemas/EvaluatePreprocessConfig" + }, + "postprocess_config": { + "$ref": "#/components/schemas/EvaluatePostprocessConfig" } }, "additionalProperties": false, "required": [ - "chunks", - "scores" + "processor_identifier" ] }, - "RegisterMemoryBankRequest": { + "EvaluateScoringConfig": { "type": "object", "properties": { - "memory_bank": { - "oneOf": [ - { - "$ref": "#/components/schemas/VectorMemoryBankDef" - }, - { - "$ref": "#/components/schemas/KeyValueMemoryBankDef" - }, - { - "$ref": "#/components/schemas/KeywordMemoryBankDef" - }, - { - "$ref": "#/components/schemas/GraphMemoryBankDef" - } - ] + "scorer_config_list": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EvaluateSingleScorerConfig" + } } }, "additionalProperties": false, "required": [ - "memory_bank" + "scorer_config_list" ] }, - "RegisterModelRequest": { + "EvaluateSingleScorerConfig": { "type": "object", "properties": { - "model": { - "$ref": "#/components/schemas/ModelDefWithProvider" + "scorer_name": { + "type": "string" + }, + "llm_judge_config": { + "$ref": "#/components/schemas/LLMJudgeConfig" } }, "additionalProperties": false, "required": [ - "model" + "scorer_name" ] }, - "RegisterShieldRequest": { + "EvaluateTaskConfig": { "type": "object", "properties": { - "shield": { - "$ref": "#/components/schemas/ShieldDefWithProvider" + "dataset_config": { + "$ref": "#/components/schemas/EvaluateDatasetConfig" + }, + "processor_config": { + "$ref": "#/components/schemas/EvaluateProcessorConfig" + }, + "generation_config": { + "$ref": "#/components/schemas/EvaluateModelGenerationConfig" + }, + "scoring_config": { + "$ref": "#/components/schemas/EvaluateScoringConfig" } }, "additionalProperties": false, "required": [ - "shield" + "dataset_config", + "processor_config", + "generation_config", + "scoring_config" ] }, - "DialogGenerations": { + "LLMJudgeConfig": { "type": "object", "properties": { - "dialog": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - } + "judge_processor_config": { + "$ref": "#/components/schemas/EvaluateProcessorConfig" }, - "sampled_generations": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - } + "judge_model_generation_config": { + "$ref": "#/components/schemas/EvaluateModelGenerationConfig" + }, + "judge_scoring_config": { + "$ref": "#/components/schemas/EvaluateJudgeScoringConfig" } }, "additionalProperties": false, "required": [ - "dialog", - "sampled_generations" + "judge_processor_config", + "judge_model_generation_config", + "judge_scoring_config" ] }, - "RewardScoreRequest": { + "RunEvalTaskRequest": { "type": "object", "properties": { - "dialog_generations": { - "type": "array", - "items": { - "$ref": "#/components/schemas/DialogGenerations" - } - }, "model": { "type": "string" + }, + "task": { + "type": "string" + }, + "dataset": { + "type": "string" + }, + "eval_task_config": { + "$ref": "#/components/schemas/EvaluateTaskConfig" } }, "additionalProperties": false, "required": 
[ - "dialog_generations", - "model" + "model", + "task" ] }, - "RewardScoringResponse": { + "EvalResult": { "type": "object", "properties": { - "scored_generations": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ScoredDialogGenerations" + "metrics": { + "type": "object", + "additionalProperties": { + "type": "number" } } }, "additionalProperties": false, "required": [ - "scored_generations" + "metrics" ], - "title": "Response from the reward scoring. Batch of (prompt, response, score) tuples that pass the threshold." + "title": "Aggregated final evaluation result." }, - "ScoredDialogGenerations": { + "EvaluateResponse": { "type": "object", "properties": { - "dialog": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - } + "eval_result": { + "$ref": "#/components/schemas/EvalResult" }, - "scored_generations": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ScoredMessage" - } + "formatted_report": { + "type": "string" } }, "additionalProperties": false, "required": [ - "dialog", - "scored_generations" - ] + "eval_result" + ], + "title": "Scores for evaluation." }, - "ScoredMessage": { + "RunScorerRequest": { "type": "object", "properties": { - "message": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] + "dataset_config": { + "$ref": "#/components/schemas/EvaluateDatasetConfig" }, - "score": { - "type": "number" + "eval_scoring_config": { + "$ref": "#/components/schemas/EvaluateScoringConfig" } }, "additionalProperties": false, "required": [ - "message", - "score" + "dataset_config", + "eval_scoring_config" ] }, "RunShieldRequest": { @@ -6075,46 +6238,46 @@ ], "tags": [ { - "name": "RewardScoring" + "name": "Models" }, { - "name": "Memory" + "name": "BatchInference" }, { - "name": "SyntheticDataGeneration" + "name": "Inspect" }, { - "name": "Models" + "name": "Evals" }, { "name": "Safety" }, { - "name": "BatchInference" + "name": "Shields" }, { - "name": "Agents" + "name": "Telemetry" }, { - "name": "MemoryBanks" + "name": "Agents" }, { - "name": "Shields" + "name": "Memory" }, { - "name": "Datasets" + "name": "SyntheticDataGeneration" }, { - "name": "Evaluations" + "name": "PostTraining" }, { - "name": "Inspect" + "name": "Datasets" }, { - "name": "PostTraining" + "name": "MemoryBanks" }, { - "name": "Telemetry" + "name": "RewardScoring" }, { "name": "Inference" @@ -6195,10 +6358,6 @@ "name": "BatchCompletionResponse", "description": "" }, - { - "name": "CancelEvaluationJobRequest", - "description": "" - }, { "name": "CancelTrainingJobRequest", "description": "" @@ -6368,17 +6527,21 @@ "description": "" }, { - "name": "TrainEvalDataset", - "description": "Dataset to be used for training or evaluating language models.\n\n" + "name": "CustomDatasetDef", + "description": "" }, { - "name": "TrainEvalDatasetColumnType", - "description": "" + "name": "HuggingfaceDatasetDef", + "description": "" }, { "name": "CreateDatasetRequest", "description": "" }, + { + "name": "CreateDatasetResponse", + "description": "" + }, { "name": "DeleteAgentsRequest", "description": "" @@ -6391,6 +6554,10 @@ "name": "DeleteDatasetRequest", "description": 
"" }, + { + "name": "DeleteDatasetResponse", + "description": "" + }, { "name": "EmbeddingsRequest", "description": "" @@ -6399,22 +6566,6 @@ "name": "EmbeddingsResponse", "description": "" }, - { - "name": "EvaluateQuestionAnsweringRequest", - "description": "" - }, - { - "name": "EvaluationJob", - "description": "" - }, - { - "name": "EvaluateSummarizationRequest", - "description": "" - }, - { - "name": "EvaluateTextGenerationRequest", - "description": "" - }, { "name": "GetAgentsSessionRequest", "description": "" @@ -6443,18 +6594,6 @@ "name": "AgentStepResponse", "description": "" }, - { - "name": "EvaluationJobArtifactsResponse", - "description": "Artifacts of a evaluation job.\n\n" - }, - { - "name": "EvaluationJobLogStream", - "description": "" - }, - { - "name": "EvaluationJobStatusResponse", - "description": "" - }, { "name": "ModelDefWithProvider", "description": "" @@ -6555,6 +6694,14 @@ "name": "RLHFAlgorithm", "description": "" }, + { + "name": "TrainEvalDataset", + "description": "Dataset to be used for training or evaluating language models.\n\n" + }, + { + "name": "TrainEvalDatasetColumnType", + "description": "" + }, { "name": "TrainingConfig", "description": "" @@ -6603,6 +6750,62 @@ "name": "ScoredMessage", "description": "" }, + { + "name": "EvaluateDatasetConfig", + "description": "" + }, + { + "name": "EvaluateJudgeScoringConfig", + "description": "" + }, + { + "name": "EvaluateModelGenerationConfig", + "description": "" + }, + { + "name": "EvaluatePostprocessConfig", + "description": "" + }, + { + "name": "EvaluatePreprocessConfig", + "description": "" + }, + { + "name": "EvaluateProcessorConfig", + "description": "" + }, + { + "name": "EvaluateScoringConfig", + "description": "" + }, + { + "name": "EvaluateSingleScorerConfig", + "description": "" + }, + { + "name": "EvaluateTaskConfig", + "description": "" + }, + { + "name": "LLMJudgeConfig", + "description": "" + }, + { + "name": "RunEvalTaskRequest", + "description": "" + }, + { + "name": "EvalResult", + "description": "Aggregated final evaluation result.\n\n" + }, + { + "name": "EvaluateResponse", + "description": "Scores for evaluation.\n\n" + }, + { + "name": "RunScorerRequest", + "description": "" + }, { "name": "RunShieldRequest", "description": "" @@ -6647,7 +6850,7 @@ "Agents", "BatchInference", "Datasets", - "Evaluations", + "Evals", "Inference", "Inspect", "Memory", @@ -6681,7 +6884,6 @@ "BatchCompletionRequest", "BatchCompletionResponse", "BuiltinTool", - "CancelEvaluationJobRequest", "CancelTrainingJobRequest", "ChatCompletionRequest", "ChatCompletionResponse", @@ -6698,31 +6900,40 @@ "CreateAgentSessionRequest", "CreateAgentTurnRequest", "CreateDatasetRequest", + "CreateDatasetResponse", + "CustomDatasetDef", "DPOAlignmentConfig", "DeleteAgentsRequest", "DeleteAgentsSessionRequest", "DeleteDatasetRequest", + "DeleteDatasetResponse", "DialogGenerations", "DoraFinetuningConfig", "EmbeddingsRequest", "EmbeddingsResponse", - "EvaluateQuestionAnsweringRequest", - "EvaluateSummarizationRequest", - "EvaluateTextGenerationRequest", - "EvaluationJob", - "EvaluationJobArtifactsResponse", - "EvaluationJobLogStream", - "EvaluationJobStatusResponse", + "EvalResult", + "EvaluateDatasetConfig", + "EvaluateJudgeScoringConfig", + "EvaluateModelGenerationConfig", + "EvaluatePostprocessConfig", + "EvaluatePreprocessConfig", + "EvaluateProcessorConfig", + "EvaluateResponse", + "EvaluateScoringConfig", + "EvaluateSingleScorerConfig", + "EvaluateTaskConfig", "FinetuningAlgorithm", "FunctionCallToolDefinition", 
"GetAgentsSessionRequest", "GraphMemoryBankDef", "HealthInfo", + "HuggingfaceDatasetDef", "ImageMedia", "InferenceStep", "InsertDocumentsRequest", "KeyValueMemoryBankDef", "KeywordMemoryBankDef", + "LLMJudgeConfig", "LogEventRequest", "LogSeverity", "LoraFinetuningConfig", @@ -6752,6 +6963,8 @@ "RewardScoreRequest", "RewardScoringResponse", "RouteInfo", + "RunEvalTaskRequest", + "RunScorerRequest", "RunShieldRequest", "RunShieldResponse", "SafetyViolation", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 9307ee47b2..ab54c4c09e 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -315,14 +315,6 @@ components: - photogen - code_interpreter type: string - CancelEvaluationJobRequest: - additionalProperties: false - properties: - job_uuid: - type: string - required: - - job_uuid - type: object CancelTrainingJobRequest: additionalProperties: false properties: @@ -572,13 +564,45 @@ components: CreateDatasetRequest: additionalProperties: false properties: - dataset: - $ref: '#/components/schemas/TrainEvalDataset' - uuid: + dataset_def: + oneOf: + - $ref: '#/components/schemas/HuggingfaceDatasetDef' + - $ref: '#/components/schemas/CustomDatasetDef' + required: + - dataset_def + type: object + CreateDatasetResponse: + additionalProperties: false + properties: + msg: + type: string + status: + enum: + - success + - fail type: string required: - - uuid - - dataset + - status + type: object + CustomDatasetDef: + additionalProperties: false + properties: + identifier: + type: string + rename_columns_map: + additionalProperties: + type: string + type: object + type: + const: custom + default: custom + type: string + url: + type: string + required: + - type + - identifier + - url type: object DPOAlignmentConfig: additionalProperties: false @@ -619,10 +643,23 @@ components: DeleteDatasetRequest: additionalProperties: false properties: - dataset_uuid: + dataset_identifier: type: string required: - - dataset_uuid + - dataset_identifier + type: object + DeleteDatasetResponse: + additionalProperties: false + properties: + msg: + type: string + status: + enum: + - success + - fail + type: string + required: + - status type: object DialogGenerations: additionalProperties: false @@ -701,78 +738,147 @@ components: required: - embeddings type: object - EvaluateQuestionAnsweringRequest: + EvalResult: additionalProperties: false properties: metrics: - items: - enum: - - em - - f1 - type: string - type: array + additionalProperties: + type: number + type: object required: - metrics + title: Aggregated final evaluation result. 
type: object - EvaluateSummarizationRequest: + EvaluateDatasetConfig: additionalProperties: false properties: - metrics: - items: - enum: - - rouge - - bleu - type: string - type: array + dataset_identifier: + type: string + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + row_limit: + type: integer required: - - metrics + - dataset_identifier + type: object + EvaluateJudgeScoringConfig: type: object - EvaluateTextGenerationRequest: + EvaluateModelGenerationConfig: additionalProperties: false properties: - metrics: - items: - enum: - - perplexity - - rouge - - bleu - type: string - type: array + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + model: + type: string + sampling_params: + $ref: '#/components/schemas/SamplingParams' required: - - metrics + - model + - sampling_params type: object - EvaluationJob: + EvaluatePostprocessConfig: additionalProperties: false properties: - job_uuid: + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: object + EvaluatePreprocessConfig: + additionalProperties: false + properties: + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: object + EvaluateProcessorConfig: + additionalProperties: false + properties: + postprocess_config: + $ref: '#/components/schemas/EvaluatePostprocessConfig' + preprocess_config: + $ref: '#/components/schemas/EvaluatePreprocessConfig' + processor_identifier: type: string required: - - job_uuid + - processor_identifier type: object - EvaluationJobArtifactsResponse: + EvaluateResponse: additionalProperties: false properties: - job_uuid: + eval_result: + $ref: '#/components/schemas/EvalResult' + formatted_report: type: string required: - - job_uuid - title: Artifacts of a evaluation job. + - eval_result + title: Scores for evaluation. 
type: object - EvaluationJobLogStream: + EvaluateScoringConfig: additionalProperties: false properties: - job_uuid: - type: string + scorer_config_list: + items: + $ref: '#/components/schemas/EvaluateSingleScorerConfig' + type: array required: - - job_uuid + - scorer_config_list type: object - EvaluationJobStatusResponse: + EvaluateSingleScorerConfig: additionalProperties: false properties: - job_uuid: + llm_judge_config: + $ref: '#/components/schemas/LLMJudgeConfig' + scorer_name: type: string required: - - job_uuid + - scorer_name + type: object + EvaluateTaskConfig: + additionalProperties: false + properties: + dataset_config: + $ref: '#/components/schemas/EvaluateDatasetConfig' + generation_config: + $ref: '#/components/schemas/EvaluateModelGenerationConfig' + processor_config: + $ref: '#/components/schemas/EvaluateProcessorConfig' + scoring_config: + $ref: '#/components/schemas/EvaluateScoringConfig' + required: + - dataset_config + - processor_config + - generation_config + - scoring_config type: object FinetuningAlgorithm: enum: @@ -845,6 +951,39 @@ components: required: - status type: object + HuggingfaceDatasetDef: + additionalProperties: false + properties: + dataset_name: + type: string + dataset_path: + type: string + identifier: + type: string + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + rename_columns_map: + additionalProperties: + type: string + type: object + type: + const: huggingface + default: huggingface + type: string + required: + - type + - identifier + - dataset_path + - kwargs + type: object ImageMedia: additionalProperties: false properties: @@ -936,6 +1075,20 @@ components: - provider_id - type type: object + LLMJudgeConfig: + additionalProperties: false + properties: + judge_model_generation_config: + $ref: '#/components/schemas/EvaluateModelGenerationConfig' + judge_processor_config: + $ref: '#/components/schemas/EvaluateProcessorConfig' + judge_scoring_config: + $ref: '#/components/schemas/EvaluateJudgeScoringConfig' + required: + - judge_processor_config + - judge_model_generation_config + - judge_scoring_config + type: object LogEventRequest: additionalProperties: false properties: @@ -1629,6 +1782,32 @@ components: - method - provider_types type: object + RunEvalTaskRequest: + additionalProperties: false + properties: + dataset: + type: string + eval_task_config: + $ref: '#/components/schemas/EvaluateTaskConfig' + model: + type: string + task: + type: string + required: + - model + - task + type: object + RunScorerRequest: + additionalProperties: false + properties: + dataset_config: + $ref: '#/components/schemas/EvaluateDatasetConfig' + eval_scoring_config: + $ref: '#/components/schemas/EvaluateScoringConfig' + required: + - dataset_config + - eval_scoring_config + type: object RunShieldRequest: additionalProperties: false properties: @@ -2507,7 +2686,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. 
The specification is still in\ - \ draft and subject to change.\n Generated at 2024-10-09 21:10:09.073430" + \ draft and subject to change.\n Generated at 2024-10-15 00:44:26.278642" title: '[DRAFT] Llama Stack Specification' version: 0.0.1 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema @@ -2693,7 +2872,7 @@ paths: responses: '200': content: - application/json: + text/event-stream: schema: $ref: '#/components/schemas/AgentTurnResponseStreamChunk' description: OK @@ -2794,81 +2973,16 @@ paths: schema: $ref: '#/components/schemas/CreateDatasetRequest' required: true - responses: - '200': - description: OK - tags: - - Datasets - /datasets/delete: - post: - parameters: - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/DeleteDatasetRequest' - required: true - responses: - '200': - description: OK - tags: - - Datasets - /datasets/get: - get: - parameters: - - in: query - name: dataset_uuid - required: true - schema: - type: string - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string responses: '200': content: application/json: schema: - $ref: '#/components/schemas/TrainEvalDataset' + $ref: '#/components/schemas/CreateDatasetResponse' description: OK tags: - Datasets - /evaluate/job/artifacts: - get: - parameters: - - in: query - name: job_uuid - required: true - schema: - type: string - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - responses: - '200': - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluationJobArtifactsResponse' - description: OK - tags: - - Evaluations - /evaluate/job/cancel: + /datasets/delete: post: parameters: - description: JSON-encoded provider data which will be made available to the @@ -2882,42 +2996,22 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/CancelEvaluationJobRequest' - required: true - responses: - '200': - description: OK - tags: - - Evaluations - /evaluate/job/logs: - get: - parameters: - - in: query - name: job_uuid + $ref: '#/components/schemas/DeleteDatasetRequest' required: true - schema: - type: string - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string responses: '200': content: application/json: schema: - $ref: '#/components/schemas/EvaluationJobLogStream' + $ref: '#/components/schemas/DeleteDatasetResponse' description: OK tags: - - Evaluations - /evaluate/job/status: + - Datasets + /datasets/get: get: parameters: - in: query - name: job_uuid + name: dataset_identifier required: true schema: type: string @@ -2933,11 +3027,15 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluationJobStatusResponse' + oneOf: + - oneOf: + - $ref: '#/components/schemas/HuggingfaceDatasetDef' + - $ref: '#/components/schemas/CustomDatasetDef' + - type: 'null' description: OK tags: - - Evaluations - /evaluate/jobs: + - Datasets + /datasets/list: get: parameters: - description: JSON-encoded provider data which will 
be made available to the @@ -2952,36 +3050,13 @@ paths: content: application/jsonl: schema: - $ref: '#/components/schemas/EvaluationJob' - description: OK - tags: - - Evaluations - /evaluate/question_answering/: - post: - parameters: - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateQuestionAnsweringRequest' - required: true - responses: - '200': - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluationJob' + oneOf: + - $ref: '#/components/schemas/HuggingfaceDatasetDef' + - $ref: '#/components/schemas/CustomDatasetDef' description: OK tags: - - Evaluations - /evaluate/summarization/: + - Datasets + /evals/run_eval_task: post: parameters: - description: JSON-encoded provider data which will be made available to the @@ -2995,18 +3070,18 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluateSummarizationRequest' + $ref: '#/components/schemas/RunEvalTaskRequest' required: true responses: '200': content: application/json: schema: - $ref: '#/components/schemas/EvaluationJob' + $ref: '#/components/schemas/EvaluateResponse' description: OK tags: - - Evaluations - /evaluate/text_generation/: + - Evals + /evals/run_scorer: post: parameters: - description: JSON-encoded provider data which will be made available to the @@ -3020,17 +3095,17 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluateTextGenerationRequest' + $ref: '#/components/schemas/RunScorerRequest' required: true responses: '200': content: application/json: schema: - $ref: '#/components/schemas/EvaluationJob' + $ref: '#/components/schemas/EvaluateResponse' description: OK tags: - - Evaluations + - Evals /health: get: parameters: @@ -3712,20 +3787,20 @@ security: servers: - url: http://any-hosted-llama-stack.com tags: -- name: RewardScoring -- name: Memory -- name: SyntheticDataGeneration - name: Models -- name: Safety - name: BatchInference -- name: Agents -- name: MemoryBanks -- name: Shields -- name: Datasets -- name: Evaluations - name: Inspect -- name: PostTraining +- name: Evals +- name: Safety +- name: Shields - name: Telemetry +- name: Agents +- name: Memory +- name: SyntheticDataGeneration +- name: PostTraining +- name: Datasets +- name: MemoryBanks +- name: RewardScoring - name: Inference - description: name: BuiltinTool @@ -3782,9 +3857,6 @@ tags: - description: name: BatchCompletionResponse -- description: - name: CancelEvaluationJobRequest - description: name: CancelTrainingJobRequest @@ -3919,17 +3991,18 @@ tags: name: Turn - description: name: ViolationLevel -- description: 'Dataset to be used for training or evaluating language models. 
- - - ' - name: TrainEvalDataset -- description: - name: TrainEvalDatasetColumnType + name: CustomDatasetDef +- description: + name: HuggingfaceDatasetDef - description: name: CreateDatasetRequest +- description: + name: CreateDatasetResponse - description: name: DeleteAgentsRequest @@ -3939,23 +4012,15 @@ tags: - description: name: DeleteDatasetRequest +- description: + name: DeleteDatasetResponse - description: name: EmbeddingsRequest - description: name: EmbeddingsResponse -- description: - name: EvaluateQuestionAnsweringRequest -- description: - name: EvaluationJob -- description: - name: EvaluateSummarizationRequest -- description: - name: EvaluateTextGenerationRequest - description: name: GetAgentsSessionRequest @@ -3979,18 +4044,6 @@ tags: - description: name: AgentStepResponse -- description: 'Artifacts of a evaluation job. - - - ' - name: EvaluationJobArtifactsResponse -- description: - name: EvaluationJobLogStream -- description: - name: EvaluationJobStatusResponse - description: name: ModelDefWithProvider @@ -4067,6 +4120,14 @@ tags: name: OptimizerConfig - description: name: RLHFAlgorithm +- description: 'Dataset to be used for training or evaluating language models. + + + ' + name: TrainEvalDataset +- description: + name: TrainEvalDatasetColumnType - description: name: TrainingConfig - description: name: ScoredMessage +- description: + name: EvaluateDatasetConfig +- description: + name: EvaluateJudgeScoringConfig +- description: + name: EvaluateModelGenerationConfig +- description: + name: EvaluatePostprocessConfig +- description: + name: EvaluatePreprocessConfig +- description: + name: EvaluateProcessorConfig +- description: + name: EvaluateScoringConfig +- description: + name: EvaluateSingleScorerConfig +- description: + name: EvaluateTaskConfig +- description: + name: LLMJudgeConfig +- description: + name: RunEvalTaskRequest +- description: 'Aggregated final evaluation result. + + + ' + name: EvalResult +- description: 'Scores for evaluation. 
+ + + ' + name: EvaluateResponse +- description: + name: RunScorerRequest - description: name: RunShieldRequest @@ -4141,7 +4247,7 @@ x-tagGroups: - Agents - BatchInference - Datasets - - Evaluations + - Evals - Inference - Inspect - Memory @@ -4172,7 +4278,6 @@ x-tagGroups: - BatchCompletionRequest - BatchCompletionResponse - BuiltinTool - - CancelEvaluationJobRequest - CancelTrainingJobRequest - ChatCompletionRequest - ChatCompletionResponse @@ -4189,31 +4294,40 @@ x-tagGroups: - CreateAgentSessionRequest - CreateAgentTurnRequest - CreateDatasetRequest + - CreateDatasetResponse + - CustomDatasetDef - DPOAlignmentConfig - DeleteAgentsRequest - DeleteAgentsSessionRequest - DeleteDatasetRequest + - DeleteDatasetResponse - DialogGenerations - DoraFinetuningConfig - EmbeddingsRequest - EmbeddingsResponse - - EvaluateQuestionAnsweringRequest - - EvaluateSummarizationRequest - - EvaluateTextGenerationRequest - - EvaluationJob - - EvaluationJobArtifactsResponse - - EvaluationJobLogStream - - EvaluationJobStatusResponse + - EvalResult + - EvaluateDatasetConfig + - EvaluateJudgeScoringConfig + - EvaluateModelGenerationConfig + - EvaluatePostprocessConfig + - EvaluatePreprocessConfig + - EvaluateProcessorConfig + - EvaluateResponse + - EvaluateScoringConfig + - EvaluateSingleScorerConfig + - EvaluateTaskConfig - FinetuningAlgorithm - FunctionCallToolDefinition - GetAgentsSessionRequest - GraphMemoryBankDef - HealthInfo + - HuggingfaceDatasetDef - ImageMedia - InferenceStep - InsertDocumentsRequest - KeyValueMemoryBankDef - KeywordMemoryBankDef + - LLMJudgeConfig - LogEventRequest - LogSeverity - LoraFinetuningConfig @@ -4243,6 +4357,8 @@ x-tagGroups: - RewardScoreRequest - RewardScoringResponse - RouteInfo + - RunEvalTaskRequest + - RunScorerRequest - RunShieldRequest - RunShieldResponse - SafetyViolation diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index c0aa4d161e..f5991c52e1 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -15,6 +15,26 @@ from typing_extensions import Annotated +@json_schema_type +class TrainEvalDatasetColumnType(Enum): + dialog = "dialog" + text = "text" + media = "media" + number = "number" + json = "json" + + +@json_schema_type +class TrainEvalDataset(BaseModel): + """Dataset to be used for training or evaluating language models.""" + + # TODO(ashwin): figure out if we need to add an enum for a "dataset type" + + columns: Dict[str, TrainEvalDatasetColumnType] + content_url: URL + metadata: Optional[Dict[str, Any]] = None + + @json_schema_type class GenerationInput(BaseModel): messages: List[Message] diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py index d943f48b20..cdfe5c4673 100644 --- a/llama_stack/apis/post_training/post_training.py +++ b/llama_stack/apis/post_training/post_training.py @@ -14,7 +14,7 @@ from pydantic import BaseModel, Field from llama_models.llama3.api.datatypes import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 from llama_stack.apis.common.training_types import * # noqa: F403 From cccd5be090b36fadf68f1b355556a4820dac2397 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 15 Oct 2024 10:14:35 -0700 Subject: [PATCH 24/27] move eval_task_config to client --- llama_stack/apis/evals/client.py | 141 +++++++++--------- llama_stack/apis/evals/evals.py | 5 +- .../impls/meta_reference/evals/evals.py | 36 +---- 3 files changed, 71 
insertions(+), 111 deletions(-) diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index 07877c13e5..1e76812c6c 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -46,23 +46,13 @@ async def shutdown(self) -> None: async def run_evals( self, - model: str, - task: str, - dataset: Optional[str] = None, - eval_task_config: Optional[EvaluateTaskConfig] = None, + eval_task_config: EvaluateTaskConfig, ) -> EvaluateResponse: async with httpx.AsyncClient() as client: response = await client.post( f"{self.base_url}/evals/run_eval_task", json={ - "model": model, - "task": task, - "dataset": dataset, - "eval_task_config": ( - json.loads(eval_task_config.json()) - if eval_task_config - else None - ), + "eval_task_config": json.loads(eval_task_config.json()), }, headers={"Content-Type": "application/json"}, timeout=3600, @@ -94,85 +84,88 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""): dataset_client = DatasetsClient(f"http://{host}:{port}") # Full Eval Task - - # # 1. register custom dataset - # response = await dataset_client.create_dataset( - # dataset_def=CustomDatasetDef( - # identifier="mmlu-simple-eval-en", - # url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", - # ), - # ) - # cprint(f"datasets/create: {response}", "cyan") - - # # 2. run evals on the registered dataset - # response = await client.run_evals( - # model="Llama3.1-8B-Instruct", - # dataset="mmlu-simple-eval-en", - # task="mmlu", - # ) - - # if response.formatted_report: - # cprint(response.formatted_report, "green") - # else: - # cprint(f"Response: {response}", "green") - - # Scoring Task - # 1. register huggingface dataset - response = await dataset_client.create_dataset( - dataset_def=HuggingfaceDatasetDef( - identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", - dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals", - dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", - rename_columns_map={ - "output_parsed_answer": "generated_answer", - "input_correct_responses": "expected_answer", - }, - kwargs={"split": "latest"}, - ) - ) - cprint(response, "cyan") - + # 1. register custom dataset response = await dataset_client.create_dataset( dataset_def=CustomDatasetDef( - identifier="rag-evals", - url=data_url_from_file(eval_dataset_path), - rename_columns_map={ - "query": "input_query", - }, - ) + identifier="mmlu-simple-eval-en", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ), ) - cprint(response, "cyan") + cprint(f"datasets/create: {response}", "cyan") - # 2. run evals on the registered dataset - response = await client.run_scorer( + # # 2. 
run evals on the registered dataset + eval_task_config = EvaluateTaskConfig( dataset_config=EvaluateDatasetConfig( - dataset_identifier="rag-evals", - # dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", - row_limit=10, + dataset_identifier="mmlu-simple-eval-en", + row_limit=3, ), - eval_scoring_config=EvaluateScoringConfig( + processor_config=EvaluateProcessorConfig( + processor_identifier="mmlu", + ), + generation_config=EvaluateModelGenerationConfig( + model="Llama3.1-8B-Instruct", + ), + scoring_config=EvaluateScoringConfig( scorer_config_list=[ EvaluateSingleScorerConfig(scorer_name="accuracy"), - EvaluateSingleScorerConfig( - scorer_name="braintrust::answer-correctness" - ), + EvaluateSingleScorerConfig(scorer_name="random"), ] ), ) - + response = await client.run_evals( + eval_task_config=eval_task_config, + ) for k, v in response.eval_result.metrics.items(): cprint(f"{k}: {v}", "green") - # Eleuther Eval Task - # response = await client.run_evals( - # model="Llama3.1-8B-Instruct", - # # task="meta_mmlu_pro_instruct", - # task="meta_ifeval", - # eval_task_config=EvaluateTaskConfig( - # n_samples=2, + # Scoring Task + # # 1. register huggingface dataset + # response = await dataset_client.create_dataset( + # dataset_def=HuggingfaceDatasetDef( + # identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + # dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals", + # dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + # rename_columns_map={ + # "output_parsed_answer": "generated_answer", + # "input_correct_responses": "expected_answer", + # }, + # kwargs={"split": "latest"}, + # ) + # ) + # cprint(response, "cyan") + + # # register custom dataset from file path + # response = await dataset_client.create_dataset( + # dataset_def=CustomDatasetDef( + # identifier="rag-evals", + # url=data_url_from_file(eval_dataset_path), + # rename_columns_map={ + # "query": "input_query", + # }, + # ) + # ) + # cprint(response, "cyan") + + # # 2. run evals on the registered dataset + # response = await client.run_scorer( + # dataset_config=EvaluateDatasetConfig( + # dataset_identifier="rag-evals", + # # dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + # row_limit=10, + # ), + # eval_scoring_config=EvaluateScoringConfig( + # scorer_config_list=[ + # EvaluateSingleScorerConfig(scorer_name="accuracy"), + # EvaluateSingleScorerConfig( + # scorer_name="braintrust::answer-correctness" + # ), + # ] # ), # ) + # for k, v in response.eval_result.metrics.items(): + # cprint(f"{k}: {v}", "green") + def main(host: str, port: int, eval_dataset_path: str = ""): asyncio.run(run_main(host, port, eval_dataset_path)) diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index a02394ee40..c484db734f 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -228,10 +228,7 @@ class Evals(Protocol): @webmethod(route="/evals/run_eval_task") async def run_eval_task( self, - model: str, - task: str, - dataset: Optional[str] = None, - eval_task_config: Optional[EvaluateTaskConfig] = None, + eval_task_config: EvaluateTaskConfig, ) -> EvaluateResponse: ... 
@webmethod(route="/evals/run_scorer") diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index 916e40e3ac..a9e2c641f9 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -28,39 +28,9 @@ async def shutdown(self) -> None: async def run_eval_task( self, - model: str, - task: str, - dataset: Optional[str] = None, - eval_task_config: Optional[EvaluateTaskConfig] = None, + eval_task_config: EvaluateTaskConfig, ) -> EvaluateResponse: - cprint( - f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}", - "red", - ) - - if not dataset: - raise ValueError("dataset must be specified for mete-reference evals") - - if not eval_task_config: - # construct eval task config from inputs - eval_task_config = EvaluateTaskConfig( - dataset_config=EvaluateDatasetConfig( - dataset_identifier=dataset, - row_limit=3, - ), - processor_config=EvaluateProcessorConfig( - processor_identifier="mmlu", - ), - generation_config=EvaluateModelGenerationConfig( - model=model, - ), - scoring_config=EvaluateScoringConfig( - scorer_config_list=[ - EvaluateSingleScorerConfig(scorer_name="accuracy"), - EvaluateSingleScorerConfig(scorer_name="random"), - ] - ), - ) + cprint(f"run_eval_task: on {eval_task_config}", "green") run_task = RunEvalTask() eval_result = await run_task.run(eval_task_config, self.inference_api) @@ -75,7 +45,7 @@ async def run_scorer( dataset_config: EvaluateDatasetConfig, eval_scoring_config: EvaluateScoringConfig, ) -> EvaluateResponse: - cprint("run_scorer") + cprint(f"run_scorer: on {dataset_config} with {eval_scoring_config}", "green") run_task = RunScoringTask() eval_result = await run_task.run(dataset_config, eval_scoring_config) From be4f395032930f8ba9b7a21da6d8a9644396a631 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 15 Oct 2024 10:17:45 -0700 Subject: [PATCH 25/27] full evals / full scoring flow --- docs/resources/llama-stack-spec.html | 44 ++++------ docs/resources/llama-stack-spec.yaml | 31 +++---- llama_stack/apis/evals/client.py | 86 +++++++++---------- .../registry/datasets/dataset_wrappers.py | 2 +- 4 files changed, 71 insertions(+), 92 deletions(-) diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index ac75dbf049..7787001ffb 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "0.0.1", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-15 00:44:26.278642" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. 
The specification is still in draft and subject to change.\n Generated at 2024-10-15 10:15:15.195382" }, "servers": [ { @@ -5805,23 +5805,13 @@ "RunEvalTaskRequest": { "type": "object", "properties": { - "model": { - "type": "string" - }, - "task": { - "type": "string" - }, - "dataset": { - "type": "string" - }, "eval_task_config": { "$ref": "#/components/schemas/EvaluateTaskConfig" } }, "additionalProperties": false, "required": [ - "model", - "task" + "eval_task_config" ] }, "EvalResult": { @@ -6238,49 +6228,49 @@ ], "tags": [ { - "name": "Models" + "name": "Inference" }, { - "name": "BatchInference" + "name": "PostTraining" }, { - "name": "Inspect" + "name": "Agents" }, { - "name": "Evals" + "name": "MemoryBanks" }, { - "name": "Safety" + "name": "Inspect" }, { - "name": "Shields" + "name": "Models" }, { - "name": "Telemetry" + "name": "Safety" }, { - "name": "Agents" + "name": "Evals" }, { - "name": "Memory" + "name": "BatchInference" }, { - "name": "SyntheticDataGeneration" + "name": "Shields" }, { - "name": "PostTraining" + "name": "SyntheticDataGeneration" }, { - "name": "Datasets" + "name": "Telemetry" }, { - "name": "MemoryBanks" + "name": "RewardScoring" }, { - "name": "RewardScoring" + "name": "Datasets" }, { - "name": "Inference" + "name": "Memory" }, { "name": "BuiltinTool", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index ab54c4c09e..d601435d79 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -1785,17 +1785,10 @@ components: RunEvalTaskRequest: additionalProperties: false properties: - dataset: - type: string eval_task_config: $ref: '#/components/schemas/EvaluateTaskConfig' - model: - type: string - task: - type: string required: - - model - - task + - eval_task_config type: object RunScorerRequest: additionalProperties: false @@ -2686,7 +2679,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. 
The specification is still in\ - \ draft and subject to change.\n Generated at 2024-10-15 00:44:26.278642" + \ draft and subject to change.\n Generated at 2024-10-15 10:15:15.195382" title: '[DRAFT] Llama Stack Specification' version: 0.0.1 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema @@ -3787,21 +3780,21 @@ security: servers: - url: http://any-hosted-llama-stack.com tags: -- name: Models -- name: BatchInference +- name: Inference +- name: PostTraining +- name: Agents +- name: MemoryBanks - name: Inspect -- name: Evals +- name: Models - name: Safety +- name: Evals +- name: BatchInference - name: Shields -- name: Telemetry -- name: Agents -- name: Memory - name: SyntheticDataGeneration -- name: PostTraining -- name: Datasets -- name: MemoryBanks +- name: Telemetry - name: RewardScoring -- name: Inference +- name: Datasets +- name: Memory - description: name: BuiltinTool - description: None: raise ValueError(f"Unsupported file type: {self.config.url}") if n_samples is not None: - df = df.sample(n=n_samples) + df = df.sample(n=min(n_samples, len(df))) self.dataset = Dataset.from_pandas(df) if self.config.rename_columns_map: From 0c4ed66ecc512b9cf4a1b55315d4f602e3d0f9a8 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 15 Oct 2024 10:20:30 -0700 Subject: [PATCH 26/27] regen openapi --- docs/resources/llama-stack-spec.html | 46 ++++++++++++++++++++++++++-- docs/resources/llama-stack-spec.yaml | 19 ++++++------ 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index c845309964..7ce99db3a7 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "0.0.1", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-10 15:29:56.831109" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-15 10:20:19.984531" }, "servers": [ { @@ -6228,7 +6228,49 @@ ], "tags": [ { - + "name": "Agents" + }, + { + "name": "Telemetry" + }, + { + "name": "Safety" + }, + { + "name": "MemoryBanks" + }, + { + "name": "Datasets" + }, + { + "name": "Shields" + }, + { + "name": "RewardScoring" + }, + { + "name": "PostTraining" + }, + { + "name": "Models" + }, + { + "name": "Inspect" + }, + { + "name": "Evals" + }, + { + "name": "BatchInference" + }, + { + "name": "Inference" + }, + { + "name": "Memory" + }, + { + "name": "SyntheticDataGeneration" }, { "name": "BuiltinTool", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 8dab4f31d9..c116742243 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -2679,6 +2679,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. 
The specification is still in\ + \ draft and subject to change.\n Generated at 2024-10-15 10:20:19.984531" title: '[DRAFT] Llama Stack Specification' version: 0.0.1 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema @@ -3779,21 +3780,21 @@ security: servers: - url: http://any-hosted-llama-stack.com tags: -- name: Inference -- name: PostTraining - name: Agents +- name: Telemetry +- name: Safety - name: MemoryBanks -- name: Inspect +- name: Datasets +- name: Shields +- name: RewardScoring +- name: PostTraining - name: Models -- name: Safety +- name: Inspect - name: Evals - name: BatchInference -- name: Shields -- name: SyntheticDataGeneration -- name: Telemetry -- name: RewardScoring -- name: Datasets +- name: Inference - name: Memory +- name: SyntheticDataGeneration - description: name: BuiltinTool - description: Date: Tue, 15 Oct 2024 13:25:46 -0700 Subject: [PATCH 27/27] llm judge llamastack scorer --- llama_stack/apis/evals/client.py | 18 +++- .../registry/generator_processors/__init__.py | 1 + .../distribution/registry/scorers/__init__.py | 2 + .../impls/meta_reference/evals/evals.py | 4 +- .../evals/generator/inference_generator.py | 1 - .../evals/processor/__init__.py | 1 + .../evals/processor/judge_processor.py | 75 +++++++++++++++++ .../evals/scorer/llm_judge_scorer.py | 83 +++++++++++++++++++ .../evals/tasks/run_eval_task.py | 10 ++- .../evals/tasks/run_scoring_task.py | 11 ++- 10 files changed, 199 insertions(+), 7 deletions(-) create mode 100644 llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py create mode 100644 llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index 4756a570ae..fc4820232f 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -93,7 +93,7 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""): ) cprint(f"datasets/create: {response}", "cyan") - # # 2. run evals on the registered dataset + # 2. 
run evals on the registered dataset eval_task_config = EvaluateTaskConfig( dataset_config=EvaluateDatasetConfig( dataset_identifier="mmlu-simple-eval-en", @@ -151,9 +151,21 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""): ), eval_scoring_config=EvaluateScoringConfig( scorer_config_list=[ - EvaluateSingleScorerConfig(scorer_name="accuracy"), + # EvaluateSingleScorerConfig(scorer_name="accuracy"), + # EvaluateSingleScorerConfig( + # scorer_name="braintrust::answer-correctness" + # ), EvaluateSingleScorerConfig( - scorer_name="braintrust::answer-correctness" + scorer_name="llamastack-llm-judge", + llm_judge_config=LLMJudgeConfig( + judge_processor_config=EvaluateProcessorConfig( + processor_identifier="judge", + ), + judge_model_generation_config=EvaluateModelGenerationConfig( + model="Llama3.1-8B-Instruct", + ), + judge_scoring_config=EvaluateJudgeScoringConfig(), + ), ), ] ), diff --git a/llama_stack/distribution/registry/generator_processors/__init__.py b/llama_stack/distribution/registry/generator_processors/__init__.py index 44972cf03e..862984f548 100644 --- a/llama_stack/distribution/registry/generator_processors/__init__.py +++ b/llama_stack/distribution/registry/generator_processors/__init__.py @@ -13,6 +13,7 @@ PROCESSOR_REGISTRY = { "mmlu": MMLUProcessor, + "judge": JudgeProcessor, } for k, v in PROCESSOR_REGISTRY.items(): diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py index 7cbe2a4262..dda71d4e00 100644 --- a/llama_stack/distribution/registry/scorers/__init__.py +++ b/llama_stack/distribution/registry/scorers/__init__.py @@ -7,6 +7,7 @@ from llama_stack.apis.evals import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.scorer.braintrust_scorer import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.llm_judge_scorer import * # noqa: F403 from ..registry import Registry @@ -16,6 +17,7 @@ SCORER_REGISTRY = { "accuracy": AccuracyScorer, "random": RandomScorer, + "llamastack-llm-judge": LlamaStackLLMJudgeScorer, "braintrust::factuality": BraintrustFactualityScorer, "braintrust::answer-correctness": BraintrustAnswerCorrectnessScorer, } diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index a9e2c641f9..7d3eaa85d8 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -48,7 +48,9 @@ async def run_scorer( cprint(f"run_scorer: on {dataset_config} with {eval_scoring_config}", "green") run_task = RunScoringTask() - eval_result = await run_task.run(dataset_config, eval_scoring_config) + eval_result = await run_task.run( + dataset_config, eval_scoring_config, self.inference_api + ) return EvaluateResponse( eval_result=eval_result, diff --git a/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py index adc181e237..dafbb16f5b 100644 --- a/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py +++ b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py @@ -30,7 +30,6 @@ async def generate( ) -> List[GenerationResponseSample]: generation_outputs = [] for sample in preprocessed_dataset: - print("generation: ", sample) response = 
await self.inference_api.chat_completion( model=self.model, messages=sample.generation_input.messages, diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py index f782f9320a..5a7ca27958 100644 --- a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py +++ b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py @@ -3,4 +3,5 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from .judge_processor import JudgeProcessor # noqa: F401 from .mmlu_processor import MMLUProcessor # noqa: F401 diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py b/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py new file mode 100644 index 0000000000..d7d6ae3eb2 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import re + +from llama_stack.apis.evals import * # noqa: F403 + +JUDGE_PROMPT = """ +You will be given a question, an expected_answer, and a system_answer. +Your task is to provide a 'total rating' scoring how well the system_answer answers the question compared with the ground truth in expected_answer, in terms of factual correctness. +Give your answer as an integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question. + +Provide your feedback as follows: + +Feedback::: +Total rating: (your rating, as an int between 0 and 5) + +Now here are the question, expected_answer, and system_answer. 
+ +Question: {question} +Expected Answer: {expected_answer} +System Answer: {answer} + +Feedback::: +Total rating: +""" + + +class JudgeProcessor( + BaseGeneratorProcessor[ + DictSample, PreprocessedSample, GenerationResponseSample, ScorerInputSample + ] +): + """ + Generator processor for LLM Judge + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def preprocess_sample(self, sample: DictSample) -> PreprocessedSample: + content = JUDGE_PROMPT.format( + question=sample.data["input_query"], + expected_answer=sample.data["expected_answer"], + answer=sample.data["generated_answer"], + ) + preprocessed_msgs = [ + { + "role": "user", + "content": content, + } + ] + processed_sample = PreprocessedSample( + generation_input=GenerationInput( + messages=preprocessed_msgs, + ) + ) + return processed_sample + + def postprocess_sample( + self, generation_sample: GenerationResponseSample, dataset_sample: DictSample + ) -> ScorerInputSample: + response_text = generation_sample.generation_output.completion_message + match = re.search(r"Total rating: (\d+)", response_text) + judge_rating = int(match.group(1)) + + return ScorerInputSample( + generated_answer=str(judge_rating), + expected_answer=dataset_sample.data["expected_answer"], + generation_output=PostprocessedGeneration( + completion_message=response_text, + ), + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py new file mode 100644 index 0000000000..f5f56b435f --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+import asyncio +import threading + +import numpy as np + +from llama_stack.distribution.registry.generator_processors import ( + GeneratorProcessorRegistry, +) +from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import ( + InferenceGenerator, +) + +from llama_stack.apis.evals.evals import * # noqa: F401 F403 +from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 +from llama_stack.apis.inference import * # noqa: F403 + + +class LlamaStackLLMJudgeScorer(BaseScorer[ScorerInputSample]): + def __init__(self, llm_judge_config: LLMJudgeConfig, inference_api: Inference): + self.llm_judge_config = llm_judge_config + self.inference_api = inference_api + # https://stackoverflow.com/questions/74703727/how-to-call-async-function-from-sync-funcion-and-get-result-while-a-loop-is-alr + # We will use another thread with its own event loop to run the async api within the sync function + self._loop = asyncio.new_event_loop() + self._thr = threading.Thread( + target=self._loop.run_forever, name="Async Runner", daemon=True + ) + if not self._thr.is_alive(): + self._thr.start() + + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + input_query = scorer_input_sample.input_query + generated_answer = scorer_input_sample.generated_answer + expected_answer = scorer_input_sample.expected_answer + + # Judge F1: preprocess the sample into judge prompt messages + processor = GeneratorProcessorRegistry.get( + self.llm_judge_config.judge_processor_config.processor_identifier + )() + data_sample = DictSample( + data={ + "input_query": input_query, + "generated_answer": generated_answer, + "expected_answer": expected_answer, + } + ) + preprocessed_sample = processor.preprocess_sample(data_sample) + + # Judge Generation + generator = InferenceGenerator( + model=self.llm_judge_config.judge_model_generation_config.model, + inference_api=self.inference_api, + ) + + future = asyncio.run_coroutine_threadsafe( + generator.generate([preprocessed_sample]), self._loop + ) + generation_outputs = future.result() + # Judge F2: postprocess the judge generation to extract the rating + postprocessed_sample = processor.postprocess_sample( + generation_outputs[0], data_sample + ) + + # Judge F3: convert the extracted rating into a numeric score + score = float(postprocessed_sample.generated_answer) + + return SingleEvalResult(score_data={"judge_score": score}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + avg_score = np.average( + [result.score_data["judge_score"] for result in eval_results] + ) + + return EvalResult( + metrics={ + "avg_judge_score": avg_score, + } + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index bcd842c420..fbd98128f1 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -72,7 +72,15 @@ async def run( scorer_list = [] for s_conf in scorer_config_list: scorer = ScorerRegistry.get(s_conf.scorer_name) - scorer_list.append(scorer()) + if s_conf.llm_judge_config: + scorer_list.append( + scorer( + llm_judge_config=s_conf.llm_judge_config, + inference_api=inference_api, + ) + ) + else: + scorer_list.append(scorer()) scorer = AggregateScorer( scorers=scorer_list, diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py index 9ff6cde4d6..6b11191f1e 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py 
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py @@ -50,6 +50,7 @@ async def run( self, dataset_config: EvaluateDatasetConfig, eval_scoring_config: EvaluateScoringConfig, + inference_api: Inference, *args, **kwargs, ) -> EvalResult: @@ -69,7 +70,15 @@ async def run( scorer_list = [] for s_conf in scorer_config_list: scorer = ScorerRegistry.get(s_conf.scorer_name) - scorer_list.append(scorer()) + if s_conf.llm_judge_config: + scorer_list.append( + scorer( + llm_judge_config=s_conf.llm_judge_config, + inference_api=inference_api, + ) + ) + else: + scorer_list.append(scorer()) scorer = AggregateScorer( scorers=scorer_list,
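A note on the dataset_wrappers change in PATCH 25 above: pandas' DataFrame.sample raises a ValueError when asked for more rows than the frame contains (sampling without replacement), so clamping with min(n_samples, len(df)) avoids crashing on small datasets. A minimal standalone sketch of that behavior, not part of the patch:

import pandas as pd

df = pd.DataFrame({"input_query": ["q1", "q2"], "expected_answer": ["a1", "a2"]})

n_samples = 5
# df.sample(n=5) on a 2-row frame would raise ValueError, since sampling
# without replacement cannot exceed the population size.
subset = df.sample(n=min(n_samples, len(df)))
print(len(subset))  # 2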
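The JudgeProcessor.postprocess_sample added in PATCH 27 extracts the score with re.search(r"Total rating: (\d+)", ...) and calls int(match.group(1)) directly, so a judge completion that omits the "Total rating:" line would raise AttributeError. A hedged sketch of a more defensive parser; the helper name, the None fallback, and the clamping are illustrative assumptions, not part of the patch:

import re
from typing import Optional

def extract_judge_rating(response_text: str) -> Optional[int]:
    """Pull the 0-5 rating out of a judge completion formatted per JUDGE_PROMPT."""
    match = re.search(r"Total rating:\s*(\d+)", response_text)
    if match is None:
        # Caller decides how to score unparseable judge output (e.g. skip the sample).
        return None
    rating = int(match.group(1))
    # Clamp to the 0-5 scale the prompt asks for, in case the judge drifts.
    return max(0, min(5, rating))

print(extract_judge_rating("Feedback:::\nTotal rating: 4"))  # 4
print(extract_judge_rating("I cannot rate this answer."))    # None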
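LlamaStackLLMJudgeScorer calls the async inference API from the synchronous score_sample by running a dedicated event loop on a daemon thread and submitting coroutines with asyncio.run_coroutine_threadsafe. A self-contained sketch of that pattern; fake_judge_call is a stand-in for the real chat_completion call, not part of the patch:

import asyncio
import threading

# Dedicated event loop running forever on a background daemon thread.
loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, name="Async Runner", daemon=True).start()

async def fake_judge_call(prompt: str) -> str:
    # Stand-in for an async inference call such as inference_api.chat_completion.
    await asyncio.sleep(0.01)
    return "Feedback:::\nTotal rating: 5"

def score_sync(prompt: str) -> int:
    # Submit the coroutine to the background loop and block until it finishes.
    future = asyncio.run_coroutine_threadsafe(fake_judge_call(prompt), loop)
    response = future.result(timeout=30)
    return int(response.rsplit(":", 1)[1])

print(score_sync("Q: capital of France? A: Paris"))  # 5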