diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py
index 871c01a80f..994b06e583 100644
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@@ -33,7 +33,7 @@
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.agents import * # noqa: F403
-from llama_stack.apis.dataset import * # noqa: F403
+from llama_stack.apis.datasets import * # noqa: F403
from llama_stack.apis.evals import * # noqa: F403
from llama_stack.apis.inference import * # noqa: F403
from llama_stack.apis.batch_inference import * # noqa: F403
@@ -61,7 +61,7 @@ class LlamaStack(
Telemetry,
PostTraining,
Memory,
- Evaluations,
+ Evals,
Models,
Shields,
Inspect,
diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html
index a2f92b6e42..7ce99db3a7 100644
--- a/docs/resources/llama-stack-spec.html
+++ b/docs/resources/llama-stack-spec.html
@@ -21,7 +21,7 @@
"info": {
"title": "[DRAFT] Llama Stack Specification",
"version": "0.0.1",
- "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-10 15:29:56.831109"
+ "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-15 10:20:19.984531"
},
"servers": [
{
@@ -109,39 +109,6 @@
}
}
},
- "/evaluate/job/cancel": {
- "post": {
- "responses": {
- "200": {
- "description": "OK"
- }
- },
- "tags": [
- "Evaluations"
- ],
- "parameters": [
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/CancelEvaluationJobRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/post_training/job/cancel": {
"post": {
"responses": {
@@ -393,7 +360,14 @@
"post": {
"responses": {
"200": {
- "description": "OK"
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/CreateDatasetResponse"
+ }
+ }
+ }
}
},
"tags": [
@@ -489,119 +463,6 @@
}
},
"/datasets/delete": {
- "post": {
- "responses": {
- "200": {
- "description": "OK"
- }
- },
- "tags": [
- "Datasets"
- ],
- "parameters": [
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/DeleteDatasetRequest"
- }
- }
- },
- "required": true
- }
- }
- },
- "/inference/embeddings": {
- "post": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EmbeddingsResponse"
- }
- }
- }
- }
- },
- "tags": [
- "Inference"
- ],
- "parameters": [
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EmbeddingsRequest"
- }
- }
- },
- "required": true
- }
- }
- },
- "/evaluate/question_answering/": {
- "post": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EvaluationJob"
- }
- }
- }
- }
- },
- "tags": [
- "Evaluations"
- ],
- "parameters": [
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EvaluateQuestionAnsweringRequest"
- }
- }
- },
- "required": true
- }
- }
- },
- "/evaluate/summarization/": {
"post": {
"responses": {
"200": {
@@ -609,14 +470,14 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluationJob"
+ "$ref": "#/components/schemas/DeleteDatasetResponse"
}
}
}
}
},
"tags": [
- "Evaluations"
+ "Datasets"
],
"parameters": [
{
@@ -633,7 +494,7 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluateSummarizationRequest"
+ "$ref": "#/components/schemas/DeleteDatasetRequest"
}
}
},
@@ -641,7 +502,7 @@
}
}
},
- "/evaluate/text_generation/": {
+ "/inference/embeddings": {
"post": {
"responses": {
"200": {
@@ -649,14 +510,14 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluationJob"
+ "$ref": "#/components/schemas/EmbeddingsResponse"
}
}
}
}
},
"tags": [
- "Evaluations"
+ "Inference"
],
"parameters": [
{
@@ -673,7 +534,7 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluateTextGenerationRequest"
+ "$ref": "#/components/schemas/EmbeddingsRequest"
}
}
},
@@ -845,7 +706,21 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/TrainEvalDataset"
+ "oneOf": [
+ {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/HuggingfaceDatasetDef"
+ },
+ {
+ "$ref": "#/components/schemas/CustomDatasetDef"
+ }
+ ]
+ },
+ {
+ "type": "null"
+ }
+ ]
}
}
}
@@ -856,7 +731,7 @@
],
"parameters": [
{
- "name": "dataset_uuid",
+ "name": "dataset_identifier",
"in": "query",
"required": true,
"schema": {
@@ -875,7 +750,7 @@
]
}
},
- "/evaluate/job/artifacts": {
+ "/memory_banks/get": {
"get": {
"responses": {
"200": {
@@ -883,18 +758,38 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluationJobArtifactsResponse"
+ "oneOf": [
+ {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/VectorMemoryBankDef"
+ },
+ {
+ "$ref": "#/components/schemas/KeyValueMemoryBankDef"
+ },
+ {
+ "$ref": "#/components/schemas/KeywordMemoryBankDef"
+ },
+ {
+ "$ref": "#/components/schemas/GraphMemoryBankDef"
+ }
+ ]
+ },
+ {
+ "type": "null"
+ }
+ ]
}
}
}
}
},
"tags": [
- "Evaluations"
+ "MemoryBanks"
],
"parameters": [
{
- "name": "job_uuid",
+ "name": "identifier",
"in": "query",
"required": true,
"schema": {
@@ -913,7 +808,7 @@
]
}
},
- "/evaluate/job/logs": {
+ "/models/get": {
"get": {
"responses": {
"200": {
@@ -921,18 +816,25 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluationJobLogStream"
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/ModelDefWithProvider"
+ },
+ {
+ "type": "null"
+ }
+ ]
}
}
}
}
},
"tags": [
- "Evaluations"
+ "Models"
],
"parameters": [
{
- "name": "job_uuid",
+ "name": "identifier",
"in": "query",
"required": true,
"schema": {
@@ -951,7 +853,7 @@
]
}
},
- "/evaluate/job/status": {
+ "/shields/get": {
"get": {
"responses": {
"200": {
@@ -959,18 +861,25 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluationJobStatusResponse"
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/ShieldDefWithProvider"
+ },
+ {
+ "type": "null"
+ }
+ ]
}
}
}
}
},
"tags": [
- "Evaluations"
+ "Shields"
],
"parameters": [
{
- "name": "job_uuid",
+ "name": "shield_type",
"in": "query",
"required": true,
"schema": {
@@ -989,24 +898,32 @@
]
}
},
- "/evaluate/jobs": {
+ "/telemetry/get_trace": {
"get": {
"responses": {
"200": {
"description": "OK",
"content": {
- "application/jsonl": {
+ "application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluationJob"
+ "$ref": "#/components/schemas/Trace"
}
}
}
}
},
"tags": [
- "Evaluations"
+ "Telemetry"
],
"parameters": [
+ {
+ "name": "trace_id",
+ "in": "query",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
{
"name": "X-LlamaStack-ProviderData",
"in": "header",
@@ -1019,7 +936,7 @@
]
}
},
- "/memory_banks/get": {
+ "/post_training/job/artifacts": {
"get": {
"responses": {
"200": {
@@ -1027,204 +944,18 @@
"content": {
"application/json": {
"schema": {
- "oneOf": [
- {
- "oneOf": [
- {
- "$ref": "#/components/schemas/VectorMemoryBankDef"
- },
- {
- "$ref": "#/components/schemas/KeyValueMemoryBankDef"
- },
- {
- "$ref": "#/components/schemas/KeywordMemoryBankDef"
- },
- {
- "$ref": "#/components/schemas/GraphMemoryBankDef"
- }
- ]
- },
- {
- "type": "null"
- }
- ]
+ "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse"
}
}
}
}
},
"tags": [
- "MemoryBanks"
+ "PostTraining"
],
"parameters": [
{
- "name": "identifier",
- "in": "query",
- "required": true,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
- "/models/get": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ModelDefWithProvider"
- },
- {
- "type": "null"
- }
- ]
- }
- }
- }
- }
- },
- "tags": [
- "Models"
- ],
- "parameters": [
- {
- "name": "identifier",
- "in": "query",
- "required": true,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
- "/shields/get": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ShieldDefWithProvider"
- },
- {
- "type": "null"
- }
- ]
- }
- }
- }
- }
- },
- "tags": [
- "Shields"
- ],
- "parameters": [
- {
- "name": "shield_type",
- "in": "query",
- "required": true,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
- "/telemetry/get_trace": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/Trace"
- }
- }
- }
- }
- },
- "tags": [
- "Telemetry"
- ],
- "parameters": [
- {
- "name": "trace_id",
- "in": "query",
- "required": true,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
- "/post_training/job/artifacts": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse"
- }
- }
- }
- }
- },
- "tags": [
- "PostTraining"
- ],
- "parameters": [
- {
- "name": "job_uuid",
+ "name": "job_uuid",
"in": "query",
"required": true,
"schema": {
@@ -1412,6 +1143,43 @@
}
}
},
+ "/datasets/list": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/jsonl": {
+ "schema": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/HuggingfaceDatasetDef"
+ },
+ {
+ "$ref": "#/components/schemas/CustomDatasetDef"
+ }
+ ]
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Datasets"
+ ],
+ "parameters": [
+ {
+ "name": "X-LlamaStack-ProviderData",
+ "in": "header",
+ "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
+ "required": false,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ },
"/memory_banks/list": {
"get": {
"responses": {
@@ -1836,7 +1604,7 @@
}
}
},
- "/safety/run_shield": {
+ "/evals/run_eval_task": {
"post": {
"responses": {
"200": {
@@ -1844,14 +1612,14 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/RunShieldResponse"
+ "$ref": "#/components/schemas/EvaluateResponse"
}
}
}
}
},
"tags": [
- "Safety"
+ "Evals"
],
"parameters": [
{
@@ -1868,7 +1636,7 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/RunShieldRequest"
+ "$ref": "#/components/schemas/RunEvalTaskRequest"
}
}
},
@@ -1876,7 +1644,7 @@
}
}
},
- "/post_training/supervised_fine_tune": {
+ "/evals/run_scorer": {
"post": {
"responses": {
"200": {
@@ -1884,14 +1652,14 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/PostTrainingJob"
+ "$ref": "#/components/schemas/EvaluateResponse"
}
}
}
}
},
"tags": [
- "PostTraining"
+ "Evals"
],
"parameters": [
{
@@ -1908,7 +1676,7 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/SupervisedFineTuneRequest"
+ "$ref": "#/components/schemas/RunScorerRequest"
}
}
},
@@ -1916,7 +1684,7 @@
}
}
},
- "/synthetic_data_generation/generate": {
+ "/safety/run_shield": {
"post": {
"responses": {
"200": {
@@ -1924,14 +1692,14 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/SyntheticDataGenerationResponse"
+ "$ref": "#/components/schemas/RunShieldResponse"
}
}
}
}
},
"tags": [
- "SyntheticDataGeneration"
+ "Safety"
],
"parameters": [
{
@@ -1948,54 +1716,134 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/SyntheticDataGenerateRequest"
+ "$ref": "#/components/schemas/RunShieldRequest"
}
}
},
"required": true
}
}
- }
- },
- "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema",
- "components": {
- "schemas": {
- "BuiltinTool": {
- "type": "string",
- "enum": [
- "brave_search",
- "wolfram_alpha",
- "photogen",
- "code_interpreter"
- ]
- },
- "CompletionMessage": {
- "type": "object",
- "properties": {
- "role": {
- "type": "string",
- "const": "assistant",
- "default": "assistant"
- },
- "content": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "$ref": "#/components/schemas/ImageMedia"
- },
- {
- "type": "array",
- "items": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "$ref": "#/components/schemas/ImageMedia"
- }
- ]
+ },
+ "/post_training/supervised_fine_tune": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/PostTrainingJob"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "PostTraining"
+ ],
+ "parameters": [
+ {
+ "name": "X-LlamaStack-ProviderData",
+ "in": "header",
+ "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
+ "required": false,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/SupervisedFineTuneRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
+ "/synthetic_data_generation/generate": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/SyntheticDataGenerationResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "SyntheticDataGeneration"
+ ],
+ "parameters": [
+ {
+ "name": "X-LlamaStack-ProviderData",
+ "in": "header",
+ "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
+ "required": false,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/SyntheticDataGenerateRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ }
+ },
+ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema",
+ "components": {
+ "schemas": {
+ "BuiltinTool": {
+ "type": "string",
+ "enum": [
+ "brave_search",
+ "wolfram_alpha",
+ "photogen",
+ "code_interpreter"
+ ]
+ },
+ "CompletionMessage": {
+ "type": "object",
+ "properties": {
+ "role": {
+ "type": "string",
+ "const": "assistant",
+ "default": "assistant"
+ },
+ "content": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "$ref": "#/components/schemas/ImageMedia"
+ },
+ {
+ "type": "array",
+ "items": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "$ref": "#/components/schemas/ImageMedia"
+ }
+ ]
}
}
]
@@ -2571,18 +2419,6 @@
"completion_message_batch"
]
},
- "CancelEvaluationJobRequest": {
- "type": "object",
- "properties": {
- "job_uuid": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "job_uuid"
- ]
- },
"CancelTrainingJobRequest": {
"type": "object",
"properties": {
@@ -4090,19 +3926,58 @@
"error"
]
},
- "TrainEvalDataset": {
+ "CustomDatasetDef": {
"type": "object",
"properties": {
- "columns": {
+ "type": {
+ "type": "string",
+ "const": "custom",
+ "default": "custom"
+ },
+ "identifier": {
+ "type": "string"
+ },
+ "url": {
+ "type": "string"
+ },
+ "rename_columns_map": {
"type": "object",
"additionalProperties": {
- "$ref": "#/components/schemas/TrainEvalDatasetColumnType"
+ "type": "string"
}
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "identifier",
+ "url"
+ ]
+ },
+ "HuggingfaceDatasetDef": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "huggingface",
+ "default": "huggingface"
},
- "content_url": {
- "$ref": "#/components/schemas/URL"
+ "identifier": {
+ "type": "string"
},
- "metadata": {
+ "dataset_path": {
+ "type": "string"
+ },
+ "dataset_name": {
+ "type": "string"
+ },
+ "rename_columns_map": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "string"
+ }
+ },
+ "kwargs": {
"type": "object",
"additionalProperties": {
"oneOf": [
@@ -4130,35 +4005,48 @@
},
"additionalProperties": false,
"required": [
- "columns",
- "content_url"
- ],
- "title": "Dataset to be used for training or evaluating language models."
- },
- "TrainEvalDatasetColumnType": {
- "type": "string",
- "enum": [
- "dialog",
- "text",
- "media",
- "number",
- "json"
+ "type",
+ "identifier",
+ "dataset_path",
+ "kwargs"
]
},
"CreateDatasetRequest": {
"type": "object",
"properties": {
- "uuid": {
- "type": "string"
+ "dataset_def": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/HuggingfaceDatasetDef"
+ },
+ {
+ "$ref": "#/components/schemas/CustomDatasetDef"
+ }
+ ]
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dataset_def"
+ ]
+ },
+ "CreateDatasetResponse": {
+ "type": "object",
+ "properties": {
+ "status": {
+ "type": "string",
+ "enum": [
+ "success",
+ "fail"
+ ]
},
- "dataset": {
- "$ref": "#/components/schemas/TrainEvalDataset"
+ "msg": {
+ "type": "string"
}
},
"additionalProperties": false,
"required": [
- "uuid",
- "dataset"
+ "status"
]
},
"DeleteAgentsRequest": {
@@ -4192,13 +4080,32 @@
"DeleteDatasetRequest": {
"type": "object",
"properties": {
- "dataset_uuid": {
+ "dataset_identifier": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dataset_identifier"
+ ]
+ },
+ "DeleteDatasetResponse": {
+ "type": "object",
+ "properties": {
+ "status": {
+ "type": "string",
+ "enum": [
+ "success",
+ "fail"
+ ]
+ },
+ "msg": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
- "dataset_uuid"
+ "status"
]
},
"EmbeddingsRequest": {
@@ -4258,112 +4165,42 @@
"embeddings"
]
},
- "EvaluateQuestionAnsweringRequest": {
+ "GetAgentsSessionRequest": {
"type": "object",
"properties": {
- "metrics": {
+ "turn_ids": {
"type": "array",
"items": {
- "type": "string",
- "enum": [
- "em",
- "f1"
- ]
+ "type": "string"
}
}
},
- "additionalProperties": false,
- "required": [
- "metrics"
- ]
+ "additionalProperties": false
},
- "EvaluationJob": {
+ "GraphMemoryBankDef": {
"type": "object",
"properties": {
- "job_uuid": {
+ "identifier": {
"type": "string"
+ },
+ "provider_id": {
+ "type": "string",
+ "default": ""
+ },
+ "type": {
+ "type": "string",
+ "const": "graph",
+ "default": "graph"
}
},
"additionalProperties": false,
"required": [
- "job_uuid"
+ "identifier",
+ "provider_id",
+ "type"
]
},
- "EvaluateSummarizationRequest": {
- "type": "object",
- "properties": {
- "metrics": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "rouge",
- "bleu"
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "metrics"
- ]
- },
- "EvaluateTextGenerationRequest": {
- "type": "object",
- "properties": {
- "metrics": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "perplexity",
- "rouge",
- "bleu"
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "metrics"
- ]
- },
- "GetAgentsSessionRequest": {
- "type": "object",
- "properties": {
- "turn_ids": {
- "type": "array",
- "items": {
- "type": "string"
- }
- }
- },
- "additionalProperties": false
- },
- "GraphMemoryBankDef": {
- "type": "object",
- "properties": {
- "identifier": {
- "type": "string"
- },
- "provider_id": {
- "type": "string",
- "default": ""
- },
- "type": {
- "type": "string",
- "const": "graph",
- "default": "graph"
- }
- },
- "additionalProperties": false,
- "required": [
- "identifier",
- "provider_id",
- "type"
- ]
- },
- "KeyValueMemoryBankDef": {
+ "KeyValueMemoryBankDef": {
"type": "object",
"properties": {
"identifier": {
@@ -4513,43 +4350,6 @@
"step"
]
},
- "EvaluationJobArtifactsResponse": {
- "type": "object",
- "properties": {
- "job_uuid": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "job_uuid"
- ],
- "title": "Artifacts of a evaluation job."
- },
- "EvaluationJobLogStream": {
- "type": "object",
- "properties": {
- "job_uuid": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "job_uuid"
- ]
- },
- "EvaluationJobStatusResponse": {
- "type": "object",
- "properties": {
- "job_uuid": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "job_uuid"
- ]
- },
"ModelDefWithProvider": {
"type": "object",
"properties": {
@@ -5265,6 +5065,61 @@
"dpo"
]
},
+ "TrainEvalDataset": {
+ "type": "object",
+ "properties": {
+ "columns": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/TrainEvalDatasetColumnType"
+ }
+ },
+ "content_url": {
+ "$ref": "#/components/schemas/URL"
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "columns",
+ "content_url"
+ ],
+ "title": "Dataset to be used for training or evaluating language models."
+ },
+ "TrainEvalDatasetColumnType": {
+ "type": "string",
+ "enum": [
+ "dialog",
+ "text",
+ "media",
+ "number",
+ "json"
+ ]
+ },
"TrainingConfig": {
"type": "object",
"properties": {
@@ -5491,222 +5346,520 @@
"document_id": {
"type": "string"
}
- },
- "additionalProperties": false,
- "required": [
- "content",
- "token_count",
- "document_id"
+ },
+ "additionalProperties": false,
+ "required": [
+ "content",
+ "token_count",
+ "document_id"
+ ]
+ }
+ },
+ "scores": {
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "chunks",
+ "scores"
+ ]
+ },
+ "RegisterMemoryBankRequest": {
+ "type": "object",
+ "properties": {
+ "memory_bank": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/VectorMemoryBankDef"
+ },
+ {
+ "$ref": "#/components/schemas/KeyValueMemoryBankDef"
+ },
+ {
+ "$ref": "#/components/schemas/KeywordMemoryBankDef"
+ },
+ {
+ "$ref": "#/components/schemas/GraphMemoryBankDef"
+ }
+ ]
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "memory_bank"
+ ]
+ },
+ "RegisterModelRequest": {
+ "type": "object",
+ "properties": {
+ "model": {
+ "$ref": "#/components/schemas/ModelDefWithProvider"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "model"
+ ]
+ },
+ "RegisterShieldRequest": {
+ "type": "object",
+ "properties": {
+ "shield": {
+ "$ref": "#/components/schemas/ShieldDefWithProvider"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "shield"
+ ]
+ },
+ "DialogGenerations": {
+ "type": "object",
+ "properties": {
+ "dialog": {
+ "type": "array",
+ "items": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/UserMessage"
+ },
+ {
+ "$ref": "#/components/schemas/SystemMessage"
+ },
+ {
+ "$ref": "#/components/schemas/ToolResponseMessage"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionMessage"
+ }
+ ]
+ }
+ },
+ "sampled_generations": {
+ "type": "array",
+ "items": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/UserMessage"
+ },
+ {
+ "$ref": "#/components/schemas/SystemMessage"
+ },
+ {
+ "$ref": "#/components/schemas/ToolResponseMessage"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionMessage"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dialog",
+ "sampled_generations"
+ ]
+ },
+ "RewardScoreRequest": {
+ "type": "object",
+ "properties": {
+ "dialog_generations": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/DialogGenerations"
+ }
+ },
+ "model": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dialog_generations",
+ "model"
+ ]
+ },
+ "RewardScoringResponse": {
+ "type": "object",
+ "properties": {
+ "scored_generations": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ScoredDialogGenerations"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "scored_generations"
+ ],
+ "title": "Response from the reward scoring. Batch of (prompt, response, score) tuples that pass the threshold."
+ },
+ "ScoredDialogGenerations": {
+ "type": "object",
+ "properties": {
+ "dialog": {
+ "type": "array",
+ "items": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/UserMessage"
+ },
+ {
+ "$ref": "#/components/schemas/SystemMessage"
+ },
+ {
+ "$ref": "#/components/schemas/ToolResponseMessage"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionMessage"
+ }
+ ]
+ }
+ },
+ "scored_generations": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ScoredMessage"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dialog",
+ "scored_generations"
+ ]
+ },
+ "ScoredMessage": {
+ "type": "object",
+ "properties": {
+ "message": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/UserMessage"
+ },
+ {
+ "$ref": "#/components/schemas/SystemMessage"
+ },
+ {
+ "$ref": "#/components/schemas/ToolResponseMessage"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionMessage"
+ }
+ ]
+ },
+ "score": {
+ "type": "number"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "message",
+ "score"
+ ]
+ },
+ "EvaluateDatasetConfig": {
+ "type": "object",
+ "properties": {
+ "dataset_identifier": {
+ "type": "string"
+ },
+ "row_limit": {
+ "type": "integer"
+ },
+ "kwargs": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dataset_identifier"
+ ]
+ },
+ "EvaluateJudgeScoringConfig": {
+ "type": "object"
+ },
+ "EvaluateModelGenerationConfig": {
+ "type": "object",
+ "properties": {
+ "model": {
+ "type": "string"
+ },
+ "sampling_params": {
+ "$ref": "#/components/schemas/SamplingParams"
+ },
+ "kwargs": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "model",
+ "sampling_params"
+ ]
+ },
+ "EvaluatePostprocessConfig": {
+ "type": "object",
+ "properties": {
+ "kwargs": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false
+ },
+ "EvaluatePreprocessConfig": {
+ "type": "object",
+ "properties": {
+ "kwargs": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
]
}
+ }
+ },
+ "additionalProperties": false
+ },
+ "EvaluateProcessorConfig": {
+ "type": "object",
+ "properties": {
+ "processor_identifier": {
+ "type": "string"
},
- "scores": {
- "type": "array",
- "items": {
- "type": "number"
- }
+ "preprocess_config": {
+ "$ref": "#/components/schemas/EvaluatePreprocessConfig"
+ },
+ "postprocess_config": {
+ "$ref": "#/components/schemas/EvaluatePostprocessConfig"
}
},
"additionalProperties": false,
"required": [
- "chunks",
- "scores"
+ "processor_identifier"
]
},
- "RegisterMemoryBankRequest": {
+ "EvaluateScoringConfig": {
"type": "object",
"properties": {
- "memory_bank": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/VectorMemoryBankDef"
- },
- {
- "$ref": "#/components/schemas/KeyValueMemoryBankDef"
- },
- {
- "$ref": "#/components/schemas/KeywordMemoryBankDef"
- },
- {
- "$ref": "#/components/schemas/GraphMemoryBankDef"
- }
- ]
+ "scorer_config_list": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/EvaluateSingleScorerConfig"
+ }
}
},
"additionalProperties": false,
"required": [
- "memory_bank"
+ "scorer_config_list"
]
},
- "RegisterModelRequest": {
+ "EvaluateSingleScorerConfig": {
"type": "object",
"properties": {
- "model": {
- "$ref": "#/components/schemas/ModelDefWithProvider"
+ "scorer_name": {
+ "type": "string"
+ },
+ "llm_judge_config": {
+ "$ref": "#/components/schemas/LLMJudgeConfig"
}
},
"additionalProperties": false,
"required": [
- "model"
+ "scorer_name"
]
},
- "RegisterShieldRequest": {
+ "EvaluateTaskConfig": {
"type": "object",
"properties": {
- "shield": {
- "$ref": "#/components/schemas/ShieldDefWithProvider"
+ "dataset_config": {
+ "$ref": "#/components/schemas/EvaluateDatasetConfig"
+ },
+ "processor_config": {
+ "$ref": "#/components/schemas/EvaluateProcessorConfig"
+ },
+ "generation_config": {
+ "$ref": "#/components/schemas/EvaluateModelGenerationConfig"
+ },
+ "scoring_config": {
+ "$ref": "#/components/schemas/EvaluateScoringConfig"
}
},
"additionalProperties": false,
"required": [
- "shield"
+ "dataset_config",
+ "processor_config",
+ "generation_config",
+ "scoring_config"
]
},
- "DialogGenerations": {
+ "LLMJudgeConfig": {
"type": "object",
"properties": {
- "dialog": {
- "type": "array",
- "items": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/UserMessage"
- },
- {
- "$ref": "#/components/schemas/SystemMessage"
- },
- {
- "$ref": "#/components/schemas/ToolResponseMessage"
- },
- {
- "$ref": "#/components/schemas/CompletionMessage"
- }
- ]
- }
+ "judge_processor_config": {
+ "$ref": "#/components/schemas/EvaluateProcessorConfig"
},
- "sampled_generations": {
- "type": "array",
- "items": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/UserMessage"
- },
- {
- "$ref": "#/components/schemas/SystemMessage"
- },
- {
- "$ref": "#/components/schemas/ToolResponseMessage"
- },
- {
- "$ref": "#/components/schemas/CompletionMessage"
- }
- ]
- }
+ "judge_model_generation_config": {
+ "$ref": "#/components/schemas/EvaluateModelGenerationConfig"
+ },
+ "judge_scoring_config": {
+ "$ref": "#/components/schemas/EvaluateJudgeScoringConfig"
}
},
"additionalProperties": false,
"required": [
- "dialog",
- "sampled_generations"
+ "judge_processor_config",
+ "judge_model_generation_config",
+ "judge_scoring_config"
]
},
- "RewardScoreRequest": {
+ "RunEvalTaskRequest": {
"type": "object",
"properties": {
- "dialog_generations": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/DialogGenerations"
- }
- },
- "model": {
- "type": "string"
+ "eval_task_config": {
+ "$ref": "#/components/schemas/EvaluateTaskConfig"
}
},
"additionalProperties": false,
"required": [
- "dialog_generations",
- "model"
+ "eval_task_config"
]
},
- "RewardScoringResponse": {
+ "EvalResult": {
"type": "object",
"properties": {
- "scored_generations": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ScoredDialogGenerations"
+ "metrics": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "number"
}
}
},
"additionalProperties": false,
"required": [
- "scored_generations"
+ "metrics"
],
- "title": "Response from the reward scoring. Batch of (prompt, response, score) tuples that pass the threshold."
+ "title": "Aggregated final evaluation result."
},
- "ScoredDialogGenerations": {
+ "EvaluateResponse": {
"type": "object",
"properties": {
- "dialog": {
- "type": "array",
- "items": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/UserMessage"
- },
- {
- "$ref": "#/components/schemas/SystemMessage"
- },
- {
- "$ref": "#/components/schemas/ToolResponseMessage"
- },
- {
- "$ref": "#/components/schemas/CompletionMessage"
- }
- ]
- }
+ "eval_result": {
+ "$ref": "#/components/schemas/EvalResult"
},
- "scored_generations": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ScoredMessage"
- }
+ "formatted_report": {
+ "type": "string"
}
},
"additionalProperties": false,
"required": [
- "dialog",
- "scored_generations"
- ]
+ "eval_result"
+ ],
+ "title": "Scores for evaluation."
},
- "ScoredMessage": {
+ "RunScorerRequest": {
"type": "object",
"properties": {
- "message": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/UserMessage"
- },
- {
- "$ref": "#/components/schemas/SystemMessage"
- },
- {
- "$ref": "#/components/schemas/ToolResponseMessage"
- },
- {
- "$ref": "#/components/schemas/CompletionMessage"
- }
- ]
+ "dataset_config": {
+ "$ref": "#/components/schemas/EvaluateDatasetConfig"
},
- "score": {
- "type": "number"
+ "eval_scoring_config": {
+ "$ref": "#/components/schemas/EvaluateScoringConfig"
}
},
"additionalProperties": false,
"required": [
- "message",
- "score"
+ "dataset_config",
+ "eval_scoring_config"
]
},
"RunShieldRequest": {
@@ -6075,49 +6228,49 @@
],
"tags": [
{
- "name": "Evaluations"
+ "name": "Agents"
},
{
- "name": "Inspect"
+ "name": "Telemetry"
},
{
- "name": "RewardScoring"
+ "name": "Safety"
},
{
- "name": "Datasets"
+ "name": "MemoryBanks"
},
{
- "name": "Models"
+ "name": "Datasets"
},
{
- "name": "Telemetry"
+ "name": "Shields"
},
{
- "name": "PostTraining"
+ "name": "RewardScoring"
},
{
- "name": "SyntheticDataGeneration"
+ "name": "PostTraining"
},
{
- "name": "BatchInference"
+ "name": "Models"
},
{
- "name": "Inference"
+ "name": "Inspect"
},
{
- "name": "Agents"
+ "name": "Evals"
},
{
- "name": "Memory"
+ "name": "BatchInference"
},
{
- "name": "Safety"
+ "name": "Inference"
},
{
- "name": "Shields"
+ "name": "Memory"
},
{
- "name": "MemoryBanks"
+ "name": "SyntheticDataGeneration"
},
{
"name": "BuiltinTool",
@@ -6195,10 +6348,6 @@
"name": "BatchCompletionResponse",
"description": ""
},
- {
- "name": "CancelEvaluationJobRequest",
- "description": ""
- },
{
"name": "CancelTrainingJobRequest",
"description": ""
@@ -6368,17 +6517,21 @@
"description": ""
},
{
- "name": "TrainEvalDataset",
- "description": "Dataset to be used for training or evaluating language models.\n\n"
+ "name": "CustomDatasetDef",
+ "description": ""
},
{
- "name": "TrainEvalDatasetColumnType",
- "description": ""
+ "name": "HuggingfaceDatasetDef",
+ "description": ""
},
{
"name": "CreateDatasetRequest",
"description": ""
},
+ {
+ "name": "CreateDatasetResponse",
+ "description": ""
+ },
{
"name": "DeleteAgentsRequest",
"description": ""
@@ -6391,6 +6544,10 @@
"name": "DeleteDatasetRequest",
"description": ""
},
+ {
+ "name": "DeleteDatasetResponse",
+ "description": ""
+ },
{
"name": "EmbeddingsRequest",
"description": ""
@@ -6399,22 +6556,6 @@
"name": "EmbeddingsResponse",
"description": ""
},
- {
- "name": "EvaluateQuestionAnsweringRequest",
- "description": ""
- },
- {
- "name": "EvaluationJob",
- "description": ""
- },
- {
- "name": "EvaluateSummarizationRequest",
- "description": ""
- },
- {
- "name": "EvaluateTextGenerationRequest",
- "description": ""
- },
{
"name": "GetAgentsSessionRequest",
"description": ""
@@ -6443,18 +6584,6 @@
"name": "AgentStepResponse",
"description": ""
},
- {
- "name": "EvaluationJobArtifactsResponse",
- "description": "Artifacts of a evaluation job.\n\n"
- },
- {
- "name": "EvaluationJobLogStream",
- "description": ""
- },
- {
- "name": "EvaluationJobStatusResponse",
- "description": ""
- },
{
"name": "ModelDefWithProvider",
"description": ""
@@ -6555,6 +6684,14 @@
"name": "RLHFAlgorithm",
"description": ""
},
+ {
+ "name": "TrainEvalDataset",
+ "description": "Dataset to be used for training or evaluating language models.\n\n"
+ },
+ {
+ "name": "TrainEvalDatasetColumnType",
+ "description": ""
+ },
{
"name": "TrainingConfig",
"description": ""
@@ -6603,6 +6740,62 @@
"name": "ScoredMessage",
"description": ""
},
+ {
+ "name": "EvaluateDatasetConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluateJudgeScoringConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluateModelGenerationConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluatePostprocessConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluatePreprocessConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluateProcessorConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluateScoringConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluateSingleScorerConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluateTaskConfig",
+ "description": ""
+ },
+ {
+ "name": "LLMJudgeConfig",
+ "description": ""
+ },
+ {
+ "name": "RunEvalTaskRequest",
+ "description": ""
+ },
+ {
+ "name": "EvalResult",
+ "description": "Aggregated final evaluation result.\n\n"
+ },
+ {
+ "name": "EvaluateResponse",
+ "description": "Scores for evaluation.\n\n"
+ },
+ {
+ "name": "RunScorerRequest",
+ "description": ""
+ },
{
"name": "RunShieldRequest",
"description": ""
@@ -6647,7 +6840,7 @@
"Agents",
"BatchInference",
"Datasets",
- "Evaluations",
+ "Evals",
"Inference",
"Inspect",
"Memory",
@@ -6681,7 +6874,6 @@
"BatchCompletionRequest",
"BatchCompletionResponse",
"BuiltinTool",
- "CancelEvaluationJobRequest",
"CancelTrainingJobRequest",
"ChatCompletionRequest",
"ChatCompletionResponse",
@@ -6698,31 +6890,40 @@
"CreateAgentSessionRequest",
"CreateAgentTurnRequest",
"CreateDatasetRequest",
+ "CreateDatasetResponse",
+ "CustomDatasetDef",
"DPOAlignmentConfig",
"DeleteAgentsRequest",
"DeleteAgentsSessionRequest",
"DeleteDatasetRequest",
+ "DeleteDatasetResponse",
"DialogGenerations",
"DoraFinetuningConfig",
"EmbeddingsRequest",
"EmbeddingsResponse",
- "EvaluateQuestionAnsweringRequest",
- "EvaluateSummarizationRequest",
- "EvaluateTextGenerationRequest",
- "EvaluationJob",
- "EvaluationJobArtifactsResponse",
- "EvaluationJobLogStream",
- "EvaluationJobStatusResponse",
+ "EvalResult",
+ "EvaluateDatasetConfig",
+ "EvaluateJudgeScoringConfig",
+ "EvaluateModelGenerationConfig",
+ "EvaluatePostprocessConfig",
+ "EvaluatePreprocessConfig",
+ "EvaluateProcessorConfig",
+ "EvaluateResponse",
+ "EvaluateScoringConfig",
+ "EvaluateSingleScorerConfig",
+ "EvaluateTaskConfig",
"FinetuningAlgorithm",
"FunctionCallToolDefinition",
"GetAgentsSessionRequest",
"GraphMemoryBankDef",
"HealthInfo",
+ "HuggingfaceDatasetDef",
"ImageMedia",
"InferenceStep",
"InsertDocumentsRequest",
"KeyValueMemoryBankDef",
"KeywordMemoryBankDef",
+ "LLMJudgeConfig",
"LogEventRequest",
"LogSeverity",
"LoraFinetuningConfig",
@@ -6752,6 +6953,8 @@
"RewardScoreRequest",
"RewardScoringResponse",
"RouteInfo",
+ "RunEvalTaskRequest",
+ "RunScorerRequest",
"RunShieldRequest",
"RunShieldResponse",
"SafetyViolation",
diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml
index c9822d6ca9..c116742243 100644
--- a/docs/resources/llama-stack-spec.yaml
+++ b/docs/resources/llama-stack-spec.yaml
@@ -315,14 +315,6 @@ components:
- photogen
- code_interpreter
type: string
- CancelEvaluationJobRequest:
- additionalProperties: false
- properties:
- job_uuid:
- type: string
- required:
- - job_uuid
- type: object
CancelTrainingJobRequest:
additionalProperties: false
properties:
@@ -572,13 +564,45 @@ components:
CreateDatasetRequest:
additionalProperties: false
properties:
- dataset:
- $ref: '#/components/schemas/TrainEvalDataset'
- uuid:
+ dataset_def:
+ oneOf:
+ - $ref: '#/components/schemas/HuggingfaceDatasetDef'
+ - $ref: '#/components/schemas/CustomDatasetDef'
+ required:
+ - dataset_def
+ type: object
+ CreateDatasetResponse:
+ additionalProperties: false
+ properties:
+ msg:
+ type: string
+ status:
+ enum:
+ - success
+ - fail
type: string
required:
- - uuid
- - dataset
+ - status
+ type: object
+ CustomDatasetDef:
+ additionalProperties: false
+ properties:
+ identifier:
+ type: string
+ rename_columns_map:
+ additionalProperties:
+ type: string
+ type: object
+ type:
+ const: custom
+ default: custom
+ type: string
+ url:
+ type: string
+ required:
+ - type
+ - identifier
+ - url
type: object
DPOAlignmentConfig:
additionalProperties: false
@@ -619,10 +643,23 @@ components:
DeleteDatasetRequest:
additionalProperties: false
properties:
- dataset_uuid:
+ dataset_identifier:
type: string
required:
- - dataset_uuid
+ - dataset_identifier
+ type: object
+ DeleteDatasetResponse:
+ additionalProperties: false
+ properties:
+ msg:
+ type: string
+ status:
+ enum:
+ - success
+ - fail
+ type: string
+ required:
+ - status
type: object
DialogGenerations:
additionalProperties: false
@@ -701,78 +738,147 @@ components:
required:
- embeddings
type: object
- EvaluateQuestionAnsweringRequest:
+ EvalResult:
additionalProperties: false
properties:
metrics:
- items:
- enum:
- - em
- - f1
- type: string
- type: array
+ additionalProperties:
+ type: number
+ type: object
required:
- metrics
+ title: Aggregated final evaluation result.
type: object
- EvaluateSummarizationRequest:
+ EvaluateDatasetConfig:
additionalProperties: false
properties:
- metrics:
- items:
- enum:
- - rouge
- - bleu
- type: string
- type: array
+ dataset_identifier:
+ type: string
+ kwargs:
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ type: object
+ row_limit:
+ type: integer
required:
- - metrics
+ - dataset_identifier
type: object
- EvaluateTextGenerationRequest:
+ EvaluateJudgeScoringConfig:
+ type: object
+ EvaluateModelGenerationConfig:
additionalProperties: false
properties:
- metrics:
- items:
- enum:
- - perplexity
- - rouge
- - bleu
- type: string
- type: array
+ kwargs:
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ type: object
+ model:
+ type: string
+ sampling_params:
+ $ref: '#/components/schemas/SamplingParams'
required:
- - metrics
+ - model
+ - sampling_params
type: object
- EvaluationJob:
+ EvaluatePostprocessConfig:
additionalProperties: false
properties:
- job_uuid:
+ kwargs:
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ type: object
+ type: object
+ EvaluatePreprocessConfig:
+ additionalProperties: false
+ properties:
+ kwargs:
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ type: object
+ type: object
+ EvaluateProcessorConfig:
+ additionalProperties: false
+ properties:
+ postprocess_config:
+ $ref: '#/components/schemas/EvaluatePostprocessConfig'
+ preprocess_config:
+ $ref: '#/components/schemas/EvaluatePreprocessConfig'
+ processor_identifier:
type: string
required:
- - job_uuid
+ - processor_identifier
type: object
- EvaluationJobArtifactsResponse:
+ EvaluateResponse:
additionalProperties: false
properties:
- job_uuid:
+ eval_result:
+ $ref: '#/components/schemas/EvalResult'
+ formatted_report:
type: string
required:
- - job_uuid
- title: Artifacts of a evaluation job.
+ - eval_result
+ title: Scores for evaluation.
type: object
- EvaluationJobLogStream:
+ EvaluateScoringConfig:
additionalProperties: false
properties:
- job_uuid:
- type: string
+ scorer_config_list:
+ items:
+ $ref: '#/components/schemas/EvaluateSingleScorerConfig'
+ type: array
required:
- - job_uuid
+ - scorer_config_list
type: object
- EvaluationJobStatusResponse:
+ EvaluateSingleScorerConfig:
additionalProperties: false
properties:
- job_uuid:
+ llm_judge_config:
+ $ref: '#/components/schemas/LLMJudgeConfig'
+ scorer_name:
type: string
required:
- - job_uuid
+ - scorer_name
+ type: object
+ EvaluateTaskConfig:
+ additionalProperties: false
+ properties:
+ dataset_config:
+ $ref: '#/components/schemas/EvaluateDatasetConfig'
+ generation_config:
+ $ref: '#/components/schemas/EvaluateModelGenerationConfig'
+ processor_config:
+ $ref: '#/components/schemas/EvaluateProcessorConfig'
+ scoring_config:
+ $ref: '#/components/schemas/EvaluateScoringConfig'
+ required:
+ - dataset_config
+ - processor_config
+ - generation_config
+ - scoring_config
type: object
FinetuningAlgorithm:
enum:
@@ -845,6 +951,39 @@ components:
required:
- status
type: object
+ HuggingfaceDatasetDef:
+ additionalProperties: false
+ properties:
+ dataset_name:
+ type: string
+ dataset_path:
+ type: string
+ identifier:
+ type: string
+ kwargs:
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ type: object
+ rename_columns_map:
+ additionalProperties:
+ type: string
+ type: object
+ type:
+ const: huggingface
+ default: huggingface
+ type: string
+ required:
+ - type
+ - identifier
+ - dataset_path
+ - kwargs
+ type: object
ImageMedia:
additionalProperties: false
properties:
@@ -936,6 +1075,20 @@ components:
- provider_id
- type
type: object
+ LLMJudgeConfig:
+ additionalProperties: false
+ properties:
+ judge_model_generation_config:
+ $ref: '#/components/schemas/EvaluateModelGenerationConfig'
+ judge_processor_config:
+ $ref: '#/components/schemas/EvaluateProcessorConfig'
+ judge_scoring_config:
+ $ref: '#/components/schemas/EvaluateJudgeScoringConfig'
+ required:
+ - judge_processor_config
+ - judge_model_generation_config
+ - judge_scoring_config
+ type: object
LogEventRequest:
additionalProperties: false
properties:
@@ -1629,6 +1782,25 @@ components:
- method
- provider_types
type: object
+ RunEvalTaskRequest:
+ additionalProperties: false
+ properties:
+ eval_task_config:
+ $ref: '#/components/schemas/EvaluateTaskConfig'
+ required:
+ - eval_task_config
+ type: object
+ RunScorerRequest:
+ additionalProperties: false
+ properties:
+ dataset_config:
+ $ref: '#/components/schemas/EvaluateDatasetConfig'
+ eval_scoring_config:
+ $ref: '#/components/schemas/EvaluateScoringConfig'
+ required:
+ - dataset_config
+ - eval_scoring_config
+ type: object
RunShieldRequest:
additionalProperties: false
properties:
@@ -2507,7 +2679,7 @@ info:
description: "This is the specification of the llama stack that provides\n \
\ a set of endpoints and their corresponding interfaces that are tailored\
\ to\n best leverage Llama Models. The specification is still in\
- \ draft and subject to change.\n Generated at 2024-10-10 15:29:56.831109"
+ \ draft and subject to change.\n Generated at 2024-10-15 10:20:19.984531"
title: '[DRAFT] Llama Stack Specification'
version: 0.0.1
jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
@@ -2794,81 +2966,16 @@ paths:
schema:
$ref: '#/components/schemas/CreateDatasetRequest'
required: true
- responses:
- '200':
- description: OK
- tags:
- - Datasets
- /datasets/delete:
- post:
- parameters:
- - description: JSON-encoded provider data which will be made available to the
- adapter servicing the API
- in: header
- name: X-LlamaStack-ProviderData
- required: false
- schema:
- type: string
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/DeleteDatasetRequest'
- required: true
- responses:
- '200':
- description: OK
- tags:
- - Datasets
- /datasets/get:
- get:
- parameters:
- - in: query
- name: dataset_uuid
- required: true
- schema:
- type: string
- - description: JSON-encoded provider data which will be made available to the
- adapter servicing the API
- in: header
- name: X-LlamaStack-ProviderData
- required: false
- schema:
- type: string
responses:
'200':
content:
application/json:
schema:
- $ref: '#/components/schemas/TrainEvalDataset'
+ $ref: '#/components/schemas/CreateDatasetResponse'
description: OK
tags:
- Datasets
- /evaluate/job/artifacts:
- get:
- parameters:
- - in: query
- name: job_uuid
- required: true
- schema:
- type: string
- - description: JSON-encoded provider data which will be made available to the
- adapter servicing the API
- in: header
- name: X-LlamaStack-ProviderData
- required: false
- schema:
- type: string
- responses:
- '200':
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EvaluationJobArtifactsResponse'
- description: OK
- tags:
- - Evaluations
- /evaluate/job/cancel:
+ /datasets/delete:
post:
parameters:
- description: JSON-encoded provider data which will be made available to the
@@ -2882,42 +2989,22 @@ paths:
content:
application/json:
schema:
- $ref: '#/components/schemas/CancelEvaluationJobRequest'
- required: true
- responses:
- '200':
- description: OK
- tags:
- - Evaluations
- /evaluate/job/logs:
- get:
- parameters:
- - in: query
- name: job_uuid
+ $ref: '#/components/schemas/DeleteDatasetRequest'
required: true
- schema:
- type: string
- - description: JSON-encoded provider data which will be made available to the
- adapter servicing the API
- in: header
- name: X-LlamaStack-ProviderData
- required: false
- schema:
- type: string
responses:
'200':
content:
application/json:
schema:
- $ref: '#/components/schemas/EvaluationJobLogStream'
+ $ref: '#/components/schemas/DeleteDatasetResponse'
description: OK
tags:
- - Evaluations
- /evaluate/job/status:
+ - Datasets
+ /datasets/get:
get:
parameters:
- in: query
- name: job_uuid
+ name: dataset_identifier
required: true
schema:
type: string
@@ -2933,11 +3020,15 @@ paths:
content:
application/json:
schema:
- $ref: '#/components/schemas/EvaluationJobStatusResponse'
+ oneOf:
+ - oneOf:
+ - $ref: '#/components/schemas/HuggingfaceDatasetDef'
+ - $ref: '#/components/schemas/CustomDatasetDef'
+ - type: 'null'
description: OK
tags:
- - Evaluations
- /evaluate/jobs:
+ - Datasets
+ /datasets/list:
get:
parameters:
- description: JSON-encoded provider data which will be made available to the
@@ -2952,36 +3043,13 @@ paths:
content:
application/jsonl:
schema:
- $ref: '#/components/schemas/EvaluationJob'
- description: OK
- tags:
- - Evaluations
- /evaluate/question_answering/:
- post:
- parameters:
- - description: JSON-encoded provider data which will be made available to the
- adapter servicing the API
- in: header
- name: X-LlamaStack-ProviderData
- required: false
- schema:
- type: string
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EvaluateQuestionAnsweringRequest'
- required: true
- responses:
- '200':
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EvaluationJob'
+ oneOf:
+ - $ref: '#/components/schemas/HuggingfaceDatasetDef'
+ - $ref: '#/components/schemas/CustomDatasetDef'
description: OK
tags:
- - Evaluations
- /evaluate/summarization/:
+ - Datasets
+ /evals/run_eval_task:
post:
parameters:
- description: JSON-encoded provider data which will be made available to the
@@ -2995,18 +3063,18 @@ paths:
content:
application/json:
schema:
- $ref: '#/components/schemas/EvaluateSummarizationRequest'
+ $ref: '#/components/schemas/RunEvalTaskRequest'
required: true
responses:
'200':
content:
application/json:
schema:
- $ref: '#/components/schemas/EvaluationJob'
+ $ref: '#/components/schemas/EvaluateResponse'
description: OK
tags:
- - Evaluations
- /evaluate/text_generation/:
+ - Evals
+ /evals/run_scorer:
post:
parameters:
- description: JSON-encoded provider data which will be made available to the
@@ -3020,17 +3088,17 @@ paths:
content:
application/json:
schema:
- $ref: '#/components/schemas/EvaluateTextGenerationRequest'
+ $ref: '#/components/schemas/RunScorerRequest'
required: true
responses:
'200':
content:
application/json:
schema:
- $ref: '#/components/schemas/EvaluationJob'
+ $ref: '#/components/schemas/EvaluateResponse'
description: OK
tags:
- - Evaluations
+ - Evals
/health:
get:
parameters:
@@ -3712,21 +3780,21 @@ security:
servers:
- url: http://any-hosted-llama-stack.com
tags:
-- name: Evaluations
-- name: Inspect
-- name: RewardScoring
-- name: Datasets
-- name: Models
+- name: Agents
- name: Telemetry
+- name: Safety
+- name: MemoryBanks
+- name: Datasets
+- name: Shields
+- name: RewardScoring
- name: PostTraining
-- name: SyntheticDataGeneration
+- name: Models
+- name: Inspect
+- name: Evals
- name: BatchInference
- name: Inference
-- name: Agents
- name: Memory
-- name: Safety
-- name: Shields
-- name: MemoryBanks
+- name: SyntheticDataGeneration
- description:
name: BuiltinTool
- description:
name: BatchCompletionResponse
-- description:
- name: CancelEvaluationJobRequest
- description:
name: CancelTrainingJobRequest
@@ -3919,17 +3984,18 @@ tags:
name: Turn
- description:
name: ViolationLevel
-- description: 'Dataset to be used for training or evaluating language models.
-
-
- '
- name: TrainEvalDataset
-- description:
- name: TrainEvalDatasetColumnType
+ name: CustomDatasetDef
+- description:
+ name: HuggingfaceDatasetDef
- description:
name: CreateDatasetRequest
+- description:
+ name: CreateDatasetResponse
- description:
name: DeleteAgentsRequest
@@ -3939,23 +4005,15 @@ tags:
- description:
name: DeleteDatasetRequest
+- description:
+ name: DeleteDatasetResponse
- description:
name: EmbeddingsRequest
- description:
name: EmbeddingsResponse
-- description:
- name: EvaluateQuestionAnsweringRequest
-- description:
- name: EvaluationJob
-- description:
- name: EvaluateSummarizationRequest
-- description:
- name: EvaluateTextGenerationRequest
- description:
name: GetAgentsSessionRequest
@@ -3979,18 +4037,6 @@ tags:
- description:
name: AgentStepResponse
-- description: 'Artifacts of a evaluation job.
-
-
- '
- name: EvaluationJobArtifactsResponse
-- description:
- name: EvaluationJobLogStream
-- description:
- name: EvaluationJobStatusResponse
- description:
name: ModelDefWithProvider
@@ -4067,6 +4113,14 @@ tags:
name: OptimizerConfig
- description:
name: RLHFAlgorithm
+- description: 'Dataset to be used for training or evaluating language models.
+
+
+ '
+ name: TrainEvalDataset
+- description:
+ name: TrainEvalDatasetColumnType
- description:
name: TrainingConfig
- description:
name: ScoredMessage
+- description:
+ name: EvaluateDatasetConfig
+- description:
+ name: EvaluateJudgeScoringConfig
+- description:
+ name: EvaluateModelGenerationConfig
+- description:
+ name: EvaluatePostprocessConfig
+- description:
+ name: EvaluatePreprocessConfig
+- description:
+ name: EvaluateProcessorConfig
+- description:
+ name: EvaluateScoringConfig
+- description:
+ name: EvaluateSingleScorerConfig
+- description:
+ name: EvaluateTaskConfig
+- description:
+ name: LLMJudgeConfig
+- description:
+ name: RunEvalTaskRequest
+- description: 'Aggregated final evaluation result.
+
+
+ '
+ name: EvalResult
+- description: 'Scores for evaluation.
+
+
+ '
+ name: EvaluateResponse
+- description:
+ name: RunScorerRequest
- description:
name: RunShieldRequest
@@ -4141,7 +4240,7 @@ x-tagGroups:
- Agents
- BatchInference
- Datasets
- - Evaluations
+ - Evals
- Inference
- Inspect
- Memory
@@ -4172,7 +4271,6 @@ x-tagGroups:
- BatchCompletionRequest
- BatchCompletionResponse
- BuiltinTool
- - CancelEvaluationJobRequest
- CancelTrainingJobRequest
- ChatCompletionRequest
- ChatCompletionResponse
@@ -4189,31 +4287,40 @@ x-tagGroups:
- CreateAgentSessionRequest
- CreateAgentTurnRequest
- CreateDatasetRequest
+ - CreateDatasetResponse
+ - CustomDatasetDef
- DPOAlignmentConfig
- DeleteAgentsRequest
- DeleteAgentsSessionRequest
- DeleteDatasetRequest
+ - DeleteDatasetResponse
- DialogGenerations
- DoraFinetuningConfig
- EmbeddingsRequest
- EmbeddingsResponse
- - EvaluateQuestionAnsweringRequest
- - EvaluateSummarizationRequest
- - EvaluateTextGenerationRequest
- - EvaluationJob
- - EvaluationJobArtifactsResponse
- - EvaluationJobLogStream
- - EvaluationJobStatusResponse
+ - EvalResult
+ - EvaluateDatasetConfig
+ - EvaluateJudgeScoringConfig
+ - EvaluateModelGenerationConfig
+ - EvaluatePostprocessConfig
+ - EvaluatePreprocessConfig
+ - EvaluateProcessorConfig
+ - EvaluateResponse
+ - EvaluateScoringConfig
+ - EvaluateSingleScorerConfig
+ - EvaluateTaskConfig
- FinetuningAlgorithm
- FunctionCallToolDefinition
- GetAgentsSessionRequest
- GraphMemoryBankDef
- HealthInfo
+ - HuggingfaceDatasetDef
- ImageMedia
- InferenceStep
- InsertDocumentsRequest
- KeyValueMemoryBankDef
- KeywordMemoryBankDef
+ - LLMJudgeConfig
- LogEventRequest
- LogSeverity
- LoraFinetuningConfig
@@ -4243,6 +4350,8 @@ x-tagGroups:
- RewardScoreRequest
- RewardScoringResponse
- RouteInfo
+ - RunEvalTaskRequest
+ - RunScorerRequest
- RunShieldRequest
- RunShieldResponse
- SafetyViolation
diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py
deleted file mode 100644
index 2fa8bb4e5e..0000000000
--- a/llama_stack/apis/dataset/dataset.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Any, Dict, Optional, Protocol
-
-from llama_models.llama3.api.datatypes import URL
-
-from llama_models.schema_utils import json_schema_type, webmethod
-
-from pydantic import BaseModel
-
-
-@json_schema_type
-class TrainEvalDatasetColumnType(Enum):
- dialog = "dialog"
- text = "text"
- media = "media"
- number = "number"
- json = "json"
-
-
-@json_schema_type
-class TrainEvalDataset(BaseModel):
- """Dataset to be used for training or evaluating language models."""
-
- # TODO(ashwin): figure out if we need to add an enum for a "dataset type"
-
- columns: Dict[str, TrainEvalDatasetColumnType]
- content_url: URL
- metadata: Optional[Dict[str, Any]] = None
-
-
-@json_schema_type
-class CreateDatasetRequest(BaseModel):
- """Request to create a dataset."""
-
- uuid: str
- dataset: TrainEvalDataset
-
-
-class Datasets(Protocol):
- @webmethod(route="/datasets/create")
- def create_dataset(
- self,
- uuid: str,
- dataset: TrainEvalDataset,
- ) -> None: ...
-
- @webmethod(route="/datasets/get")
- def get_dataset(
- self,
- dataset_uuid: str,
- ) -> TrainEvalDataset: ...
-
- @webmethod(route="/datasets/delete")
- def delete_dataset(
- self,
- dataset_uuid: str,
- ) -> None: ...
diff --git a/llama_stack/apis/dataset/__init__.py b/llama_stack/apis/datasets/__init__.py
similarity index 82%
rename from llama_stack/apis/dataset/__init__.py
rename to llama_stack/apis/datasets/__init__.py
index 33557a0ab1..102b9927f3 100644
--- a/llama_stack/apis/dataset/__init__.py
+++ b/llama_stack/apis/datasets/__init__.py
@@ -4,4 +4,4 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from .dataset import * # noqa: F401 F403
+from .datasets import * # noqa: F401 F403
diff --git a/llama_stack/apis/datasets/client.py b/llama_stack/apis/datasets/client.py
new file mode 100644
index 0000000000..e292b14d8c
--- /dev/null
+++ b/llama_stack/apis/datasets/client.py
@@ -0,0 +1,156 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import asyncio
+import json
+from typing import Optional
+
+import fire
+import httpx
+from termcolor import cprint
+
+from .datasets import * # noqa: F403
+
+
+def deserialize_dataset_def(j: Optional[Dict[str, Any]]) -> Optional[DatasetDef]:
+ if not j:
+ return None
+ if j["type"] == "huggingface":
+ return HuggingfaceDatasetDef(**j)
+ elif j["type"] == "custom":
+ return CustomDatasetDef(**j)
+ else:
+ raise ValueError(f"Unknown dataset type: {j['type']}")
+
+
+class DatasetsClient(Datasets):
+ def __init__(self, base_url: str):
+ self.base_url = base_url
+
+ async def initialize(self) -> None:
+ pass
+
+ async def shutdown(self) -> None:
+ pass
+
+ async def create_dataset(
+ self,
+ dataset_def: DatasetDef,
+ ) -> CreateDatasetResponse:
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ f"{self.base_url}/datasets/create",
+ json={
+ "dataset_def": json.loads(dataset_def.json()),
+ },
+ headers={"Content-Type": "application/json"},
+ timeout=60,
+ )
+ response.raise_for_status()
+ return CreateDatasetResponse(**response.json())
+
+ async def get_dataset(
+ self,
+ dataset_identifier: str,
+ ) -> Optional[DatasetDef]:
+ async with httpx.AsyncClient() as client:
+ response = await client.get(
+ f"{self.base_url}/datasets/get",
+ params={
+ "dataset_identifier": dataset_identifier,
+ },
+ headers={"Content-Type": "application/json"},
+ timeout=60,
+ )
+ response.raise_for_status()
+ if not response.json():
+ return
+
+ return deserialize_dataset_def(response.json())
+
+ async def delete_dataset(
+ self,
+ dataset_identifier: str,
+ ) -> DeleteDatasetResponse:
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ f"{self.base_url}/datasets/delete",
+ json={
+ "dataset_identifier": dataset_identifier,
+ },
+ headers={"Content-Type": "application/json"},
+ timeout=60,
+ )
+ response.raise_for_status()
+ return DeleteDatasetResponse(**response.json())
+
+ async def list_dataset(
+ self,
+ ) -> List[DatasetDef]:
+ async with httpx.AsyncClient() as client:
+ response = await client.get(
+ f"{self.base_url}/datasets/list",
+ headers={"Content-Type": "application/json"},
+ timeout=60,
+ )
+ response.raise_for_status()
+ if not response.json():
+ return
+
+ return [deserialize_dataset_def(x) for x in response.json()]
+
+
+async def run_main(host: str, port: int):
+ client = DatasetsClient(f"http://{host}:{port}")
+
+ # register dataset
+ response = await client.create_dataset(
+ dataset_def=CustomDatasetDef(
+ identifier="test-dataset",
+ url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
+ ),
+ )
+ cprint(response, "green")
+
+ # register HF dataset
+ response = await client.create_dataset(
+ dataset_def=HuggingfaceDatasetDef(
+ identifier="hellaswag",
+ dataset_name="hellaswag",
+ kwargs={"split": "validation", "trust_remote_code": True},
+ )
+ )
+ cprint(response, "green")
+
+ # get dataset
+ get_dataset = await client.get_dataset(
+ dataset_identifier="test-dataset",
+ )
+ cprint(get_dataset, "cyan")
+
+ # delete dataset
+ delete_dataset = await client.delete_dataset(
+ dataset_identifier="test-dataset",
+ )
+ cprint(delete_dataset, "red")
+
+ # get again after deletion
+ get_dataset = await client.get_dataset(
+ dataset_identifier="test-dataset",
+ )
+ cprint(get_dataset, "yellow")
+
+ # list datasets
+ list_dataset = await client.list_dataset()
+ cprint(list_dataset, "blue")
+
+
+def main(host: str, port: int):
+ asyncio.run(run_main(host, port))
+
+
+if __name__ == "__main__":
+ fire.Fire(main)
diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py
new file mode 100644
index 0000000000..f5991c52e1
--- /dev/null
+++ b/llama_stack/apis/datasets/datasets.py
@@ -0,0 +1,225 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Any, Dict, Generic, Iterator, Literal, Protocol, TypeVar, Union
+
+from llama_models.schema_utils import json_schema_type, webmethod
+from llama_models.llama3.api.datatypes import * # noqa: F403
+
+from pydantic import BaseModel, Field
+from typing_extensions import Annotated
+
+
+@json_schema_type
+class TrainEvalDatasetColumnType(Enum):
+ dialog = "dialog"
+ text = "text"
+ media = "media"
+ number = "number"
+ json = "json"
+
+
+@json_schema_type
+class TrainEvalDataset(BaseModel):
+ """Dataset to be used for training or evaluating language models."""
+
+ # TODO(ashwin): figure out if we need to add an enum for a "dataset type"
+
+ columns: Dict[str, TrainEvalDatasetColumnType]
+ content_url: URL
+ metadata: Optional[Dict[str, Any]] = None
+
+
+@json_schema_type
+class GenerationInput(BaseModel):
+ messages: List[Message]
+
+
+@json_schema_type
+class GenerationOutput(BaseModel):
+ completion_message: str
+ logprobs: Optional[List[TokenLogProbs]] = None
+
+
+@json_schema_type
+class PostprocessedGeneration(BaseModel):
+ completion_message: str
+ logprobs: Optional[List[TokenLogProbs]] = None
+
+
+# A sample (row) from dataset
+TDatasetSample = TypeVar("TDatasetSample")
+
+
+@json_schema_type
+class DatasetSample(BaseModel): ...
+
+
+@json_schema_type
+class DictSample(DatasetSample):
+ data: Dict[str, Any]
+
+
+# A sample (row) from evals intermediate dataset after preprocessing
+TPreprocessedSample = TypeVar("TPreprocessedSample")
+
+
+@json_schema_type
+class PreprocessedSample(DatasetSample):
+ generation_input: GenerationInput
+
+
+# A sample (row) from evals intermediate dataset after inference
+TGenerationResponseSample = TypeVar("TGenerationResponseSample")
+
+
+@json_schema_type
+class GenerationResponseSample(DatasetSample):
+ generation_output: GenerationOutput
+
+
+# A sample (row) for prepared evals dataset ready for scoring
+TScorerInputSample = TypeVar("TScorerInputSample")
+
+
+@json_schema_type
+class ScorerInputSample(DatasetSample):
+ """
+ A dataset is required to have the following columns to be used for scoring:
+ - generated_answer: str
+ - expected_answer: Union[str, List[str]]
+ - (optional) input_query: str
+ - (optional) generation_output: PostprocessedGeneration
+ """
+
+ generated_answer: str
+ expected_answer: Union[str, List[str]]
+ input_query: Optional[str] = None
+ generation_output: Optional[PostprocessedGeneration] = None
+
+
+@json_schema_type
+class DatasetType(Enum):
+ custom = "custom"
+ huggingface = "huggingface"
+
+
+@json_schema_type
+class HuggingfaceDatasetDef(BaseModel):
+ type: Literal[DatasetType.huggingface.value] = DatasetType.huggingface.value
+ identifier: str = Field(
+ description="A unique name for the dataset",
+ )
+ dataset_path: str = Field(
+ description="The name of the dataset into HF (e.g. meta-llama/Llama-3.1-8B-Instruct-evals)",
+ )
+ dataset_name: Optional[str] = Field(
+ description="The name of the dataset config within the HF dataset path (e.g. Llama-3.1-8B-Instruct-evals__ifeval__strict__details)",
+ default=None,
+ )
+ rename_columns_map: Optional[Dict[str, str]] = Field(
+ description="A map of column names to rename to fit the schema of eval dataset for scoring",
+ default=None,
+ )
+ kwargs: Dict[str, Any] = Field(
+ description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)",
+ default_factory=dict,
+ )
+
+
+@json_schema_type
+class CustomDatasetDef(BaseModel):
+ type: Literal[DatasetType.custom.value] = DatasetType.custom.value
+ identifier: str = Field(
+ description="A unique name for the dataset",
+ )
+ url: str = Field(
+ description="The URL to the dataset",
+ )
+ rename_columns_map: Optional[Dict[str, str]] = Field(
+ description="A map of column names to rename to fit the schema of eval dataset for scoring",
+ default=None,
+ )
+
+
+DatasetDef = Annotated[
+ Union[
+ HuggingfaceDatasetDef,
+ CustomDatasetDef,
+ ],
+ Field(discriminator="type"),
+]
+
+
+class DatasetsResponseStatus(Enum):
+ success = "success"
+ fail = "fail"
+
+
+@json_schema_type
+class CreateDatasetResponse(BaseModel):
+ status: DatasetsResponseStatus = Field(
+ description="Return status of the dataset creation",
+ )
+ msg: Optional[str] = None
+
+
+@json_schema_type
+class DeleteDatasetResponse(BaseModel):
+ status: DatasetsResponseStatus = Field(
+ description="Return status of the dataset creation",
+ )
+ msg: Optional[str] = None
+
+
+class BaseDataset(ABC, Generic[TDatasetSample]):
+ def __init__(self) -> None:
+ self.type: str = self.__class__.__name__
+
+ @property
+ @abstractmethod
+ def dataset_id(self) -> str:
+ raise NotImplementedError()
+
+ @abstractmethod
+ def __iter__(self) -> Iterator[TDatasetSample]:
+ raise NotImplementedError()
+
+ @abstractmethod
+ def __str__(self) -> str:
+ raise NotImplementedError()
+
+ @abstractmethod
+ def __len__(self) -> int:
+ raise NotImplementedError()
+
+ @abstractmethod
+ def load(self) -> None:
+ raise NotImplementedError()
+
+
+class Datasets(Protocol):
+ @webmethod(route="/datasets/create")
+ async def create_dataset(
+ self,
+ dataset_def: DatasetDef,
+ ) -> CreateDatasetResponse: ...
+
+ @webmethod(route="/datasets/get", method="GET")
+ async def get_dataset(
+ self,
+ dataset_identifier: str,
+ ) -> Optional[DatasetDef]: ...
+
+ @webmethod(route="/datasets/delete")
+ async def delete_dataset(
+ self,
+ dataset_identifier: str,
+ ) -> DeleteDatasetResponse: ...
+
+ @webmethod(route="/datasets/list", method="GET")
+ async def list_datasets(self) -> List[DatasetDef]: ...
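Note: BaseDataset above is abstract; concrete wrappers (CustomDataset, HuggingfaceDataset) are added later in this patch. As a reading aid, a minimal in-memory implementation of the same contract might look like the sketch below — InMemoryDataset and its constructor are illustrative only, not part of the patch.

from typing import Iterator, List

from llama_stack.apis.datasets.datasets import BaseDataset, DictSample


class InMemoryDataset(BaseDataset[DictSample]):
    """Illustrative only: wraps a list of dicts to satisfy the BaseDataset contract."""

    def __init__(self, identifier: str, rows: List[dict]) -> None:
        super().__init__()
        self._identifier = identifier
        self._rows = rows

    @property
    def dataset_id(self) -> str:
        return self._identifier

    def load(self) -> None:
        # nothing to fetch; rows are already in memory
        pass

    def __iter__(self) -> Iterator[DictSample]:
        return (DictSample(data=row) for row in self._rows)

    def __len__(self) -> int:
        return len(self._rows)

    def __str__(self) -> str:
        return f"InMemoryDataset({self._identifier}, {len(self._rows)} rows)"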
diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py
new file mode 100644
index 0000000000..fc4820232f
--- /dev/null
+++ b/llama_stack/apis/evals/client.py
@@ -0,0 +1,183 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import asyncio
+import json
+
+import fire
+import httpx
+from termcolor import cprint
+
+from .evals import * # noqa: F403
+import base64
+import mimetypes
+import os
+
+from ..datasets.client import DatasetsClient
+
+
+def data_url_from_file(file_path: str) -> str:
+ if not os.path.exists(file_path):
+ raise FileNotFoundError(f"File not found: {file_path}")
+
+ with open(file_path, "rb") as file:
+ file_content = file.read()
+
+ base64_content = base64.b64encode(file_content).decode("utf-8")
+ mime_type, _ = mimetypes.guess_type(file_path)
+
+ data_url = f"data:{mime_type};base64,{base64_content}"
+
+ return data_url
+
+
+class EvaluationClient(Evals):
+ def __init__(self, base_url: str):
+ self.base_url = base_url
+
+ async def initialize(self) -> None:
+ pass
+
+ async def shutdown(self) -> None:
+ pass
+
+ async def run_evals(
+ self,
+ eval_task_config: EvaluateTaskConfig,
+ ) -> EvaluateResponse:
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ f"{self.base_url}/evals/run_eval_task",
+ json={
+ "eval_task_config": json.loads(eval_task_config.json()),
+ },
+ headers={"Content-Type": "application/json"},
+ timeout=3600,
+ )
+ response.raise_for_status()
+ return EvaluateResponse(**response.json())
+
+ async def run_scorer(
+ self,
+ dataset_config: EvaluateDatasetConfig,
+ eval_scoring_config: EvaluateScoringConfig,
+ ) -> EvaluateResponse:
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ f"{self.base_url}/evals/run_scorer",
+ json={
+ "dataset_config": json.loads(dataset_config.json()),
+ "eval_scoring_config": json.loads(eval_scoring_config.json()),
+ },
+ headers={"Content-Type": "application/json"},
+ timeout=3600,
+ )
+ response.raise_for_status()
+ return EvaluateResponse(**response.json())
+
+
+async def run_main(host: str, port: int, eval_dataset_path: str = ""):
+ client = EvaluationClient(f"http://{host}:{port}")
+ dataset_client = DatasetsClient(f"http://{host}:{port}")
+
+ # Full Eval Task
+ # 1. register custom dataset
+ response = await dataset_client.create_dataset(
+ dataset_def=CustomDatasetDef(
+ identifier="mmlu-simple-eval-en",
+ url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
+ ),
+ )
+ cprint(f"datasets/create: {response}", "cyan")
+
+ # 2. run evals on the registered dataset
+ eval_task_config = EvaluateTaskConfig(
+ dataset_config=EvaluateDatasetConfig(
+ dataset_identifier="mmlu-simple-eval-en",
+ row_limit=3,
+ ),
+ processor_config=EvaluateProcessorConfig(
+ processor_identifier="mmlu",
+ ),
+ generation_config=EvaluateModelGenerationConfig(
+ model="Llama3.1-8B-Instruct",
+ ),
+ scoring_config=EvaluateScoringConfig(
+ scorer_config_list=[
+ EvaluateSingleScorerConfig(scorer_name="accuracy"),
+ EvaluateSingleScorerConfig(scorer_name="random"),
+ ]
+ ),
+ )
+ response = await client.run_evals(
+ eval_task_config=eval_task_config,
+ )
+ for k, v in response.eval_result.metrics.items():
+ cprint(f"{k}: {v}", "green")
+
+ # Scoring Task
+ # 1. register huggingface dataset
+ response = await dataset_client.create_dataset(
+ dataset_def=HuggingfaceDatasetDef(
+ identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+ dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals",
+ dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+ rename_columns_map={
+ "output_parsed_answer": "generated_answer",
+ "input_correct_responses": "expected_answer",
+ },
+ kwargs={"split": "latest"},
+ )
+ )
+ cprint(response, "cyan")
+
+ # register custom dataset from file path
+ response = await dataset_client.create_dataset(
+ dataset_def=CustomDatasetDef(
+ identifier="rag-evals",
+ url=data_url_from_file(eval_dataset_path),
+ )
+ )
+ cprint(response, "cyan")
+
+ # 2. run evals on the registered dataset
+ response = await client.run_scorer(
+ dataset_config=EvaluateDatasetConfig(
+ dataset_identifier="rag-evals",
+ row_limit=10,
+ ),
+ eval_scoring_config=EvaluateScoringConfig(
+ scorer_config_list=[
+ # EvaluateSingleScorerConfig(scorer_name="accuracy"),
+ # EvaluateSingleScorerConfig(
+ # scorer_name="braintrust::answer-correctness"
+ # ),
+ EvaluateSingleScorerConfig(
+ scorer_name="llamastack-llm-judge",
+ llm_judge_config=LLMJudgeConfig(
+ judge_processor_config=EvaluateProcessorConfig(
+ processor_identifier="judge",
+ ),
+ judge_model_generation_config=EvaluateModelGenerationConfig(
+ model="Llama3.1-8B-Instruct",
+ ),
+ judge_scoring_config=EvaluateJudgeScoringConfig(),
+ ),
+ ),
+ ]
+ ),
+ )
+
+ for k, v in response.eval_result.metrics.items():
+ cprint(f"{k}: {v}", "green")
+
+
+def main(host: str, port: int, eval_dataset_path: str = ""):
+ asyncio.run(run_main(host, port, eval_dataset_path))
+
+
+if __name__ == "__main__":
+ fire.Fire(main)
diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py
index 0be2243ab1..c484db734f 100644
--- a/llama_stack/apis/evals/evals.py
+++ b/llama_stack/apis/evals/evals.py
@@ -4,119 +4,256 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from enum import Enum
-from typing import List, Protocol
+from abc import ABC, abstractmethod
+from typing import Dict, Generic, List, Optional, Protocol
from llama_models.schema_utils import webmethod
-
from pydantic import BaseModel
from llama_models.llama3.api.datatypes import * # noqa: F403
-from llama_stack.apis.dataset import * # noqa: F403
-from llama_stack.apis.common.training_types import * # noqa: F403
+from llama_stack.apis.datasets import * # noqa: F403
-class TextGenerationMetric(Enum):
- perplexity = "perplexity"
- rouge = "rouge"
- bleu = "bleu"
+class EvaluationJob(BaseModel):
+ job_uuid: str
-class QuestionAnsweringMetric(Enum):
- em = "em"
- f1 = "f1"
+class EvaluationJobLogStream(BaseModel):
+ job_uuid: str
-class SummarizationMetric(Enum):
- rouge = "rouge"
- bleu = "bleu"
+@json_schema_type
+class EvalResult(BaseModel):
+ """Aggregated final evaluation result."""
+ metrics: Dict[str, float]
-class EvaluationJob(BaseModel):
- job_uuid: str
+@json_schema_type
+class SingleEvalResult(BaseModel):
+ """Single evaluation result. Contains a scorer name, and corresponding metrics from scorer."""
-class EvaluationJobLogStream(BaseModel):
- job_uuid: str
+ score_data: Dict[str, float]
+
+
+@json_schema_type
+class EvaluateResponse(BaseModel):
+ """Scores for evaluation."""
+ eval_result: EvalResult
+ formatted_report: Optional[str] = None
-class EvaluateTaskRequestCommon(BaseModel):
+
+@json_schema_type
+class EvaluationJobStatusResponse(BaseModel):
job_uuid: str
- dataset: TrainEvalDataset
- checkpoint: Checkpoint
- # generation params
- sampling_params: SamplingParams = SamplingParams()
+@json_schema_type
+class EvaluationJobCreateResponse(BaseModel):
+ """Response to create a evaluation job."""
+
+ job_uuid: str
@json_schema_type
-class EvaluateTextGenerationRequest(EvaluateTaskRequestCommon):
- """Request to evaluate text generation."""
+class EvaluateDatasetConfig(BaseModel):
+ # identifier to previously registered dataset via DatasetDef
+ dataset_identifier: str
+ # limit number of rows to evaluate
+ row_limit: Optional[int] = None
+ kwargs: Optional[Dict[str, Any]] = None
- metrics: List[TextGenerationMetric]
+
+@json_schema_type
+class EvaluatePreprocessConfig(BaseModel):
+ kwargs: Optional[Dict[str, Any]] = None
@json_schema_type
-class EvaluateQuestionAnsweringRequest(EvaluateTaskRequestCommon):
- """Request to evaluate question answering."""
+class EvaluateModelGenerationConfig(BaseModel):
+ model: str
+ sampling_params: SamplingParams = SamplingParams()
+ kwargs: Optional[Dict[str, Any]] = None
- metrics: List[QuestionAnsweringMetric]
+
+@json_schema_type
+class EvaluatePostprocessConfig(BaseModel):
+ kwargs: Optional[Dict[str, Any]] = None
@json_schema_type
-class EvaluateSummarizationRequest(EvaluateTaskRequestCommon):
- """Request to evaluate summarization."""
+class EvaluateProcessorConfig(BaseModel):
+ processor_identifier: str
+ preprocess_config: Optional[EvaluatePreprocessConfig] = None
+ postprocess_config: Optional[EvaluatePostprocessConfig] = None
- metrics: List[SummarizationMetric]
+@json_schema_type
+class EvaluateJudgeScoringConfig(BaseModel): ...
-class EvaluationJobStatusResponse(BaseModel):
- job_uuid: str
+
+@json_schema_type
+class LLMJudgeConfig(BaseModel):
+ judge_processor_config: EvaluateProcessorConfig
+ judge_model_generation_config: EvaluateModelGenerationConfig
+ judge_scoring_config: EvaluateJudgeScoringConfig
@json_schema_type
-class EvaluationJobArtifactsResponse(BaseModel):
- """Artifacts of a evaluation job."""
+class EvaluateSingleScorerConfig(BaseModel):
+ scorer_name: str
+ llm_judge_config: Optional[LLMJudgeConfig] = None
- job_uuid: str
+
+@json_schema_type
+class EvaluateScoringConfig(BaseModel):
+ # list of scorer (metrics) names to use
+ scorer_config_list: List[EvaluateSingleScorerConfig]
-class Evaluations(Protocol):
- @webmethod(route="/evaluate/text_generation/")
- def evaluate_text_generation(
+@json_schema_type
+class EvaluateTaskConfig(BaseModel):
+ dataset_config: EvaluateDatasetConfig
+ processor_config: EvaluateProcessorConfig
+ generation_config: EvaluateModelGenerationConfig
+ scoring_config: EvaluateScoringConfig
+
+
+class BaseGeneratorProcessor(
+ ABC,
+ Generic[
+ TDatasetSample,
+ TPreprocessedSample,
+ TGenerationResponseSample,
+ TScorerInputSample,
+ ],
+):
+ """
+ Base class for all generator processors. Each processor needs to implement the following methods:
+ - preprocess_sample(self, sample)
+ - postprocess_sample(self, generation_sample, dataset_sample)
+ """
+
+ def __init__(self, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
+
+ def __str__(self) -> str:
+ return self.__class__.__name__
+
+ def preprocess(
+ self, dataset: BaseDataset[TDatasetSample]
+ ) -> List[TPreprocessedSample]:
+ return [self.preprocess_sample(sample) for sample in dataset]
+
+ def postprocess(
+ self,
+ generation: List[TGenerationResponseSample],
+ dataset: BaseDataset[TDatasetSample],
+ ) -> List[TScorerInputSample]:
+ return [
+ self.postprocess_sample(generation_sample, dataset_sample)
+ for generation_sample, dataset_sample in zip(generation, dataset)
+ ]
+
+ @abstractmethod
+ def preprocess_sample(self, sample: TDatasetSample) -> TPreprocessedSample:
+ raise NotImplementedError()
+
+ @abstractmethod
+ def postprocess_sample(
self,
- metrics: List[TextGenerationMetric],
- ) -> EvaluationJob: ...
+ generation_sample: TGenerationResponseSample,
+ dataset_sample: TDatasetSample,
+ ) -> TScorerInputSample:
+ raise NotImplementedError()
+
+
+class BaseGenerator(ABC, Generic[TPreprocessedSample, TGenerationResponseSample]):
+ """
+ Base class for all generators. Each generator needs to implement the following methods:
+ - generate(self, preprocessed_dataset)
+ """
+
+ def __init__(self, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
+
+ def __str__(self) -> str:
+ return self.__class__.__name__
+
+ @abstractmethod
+ async def generate(
+ self, preprocessed_dataset: List[TPreprocessedSample]
+ ) -> List[TGenerationResponseSample]:
+ raise NotImplementedError()
+
+
+class BaseScorer(ABC, Generic[TScorerInputSample]):
+ """
+ Base class for all scorers. Each scorer needs to implement the following methods:
+ - score_sample(self, scorer_input_sample)
+ - aggregate_results(self, eval_results)
+ """
+
+ def __init__(self, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
+
+ def __str__(self) -> str:
+ return self.__class__.__name__
+
+ @abstractmethod
+ def score_sample(self, scorer_input_sample: TScorerInputSample) -> SingleEvalResult:
+ raise NotImplementedError()
+
+ @abstractmethod
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ raise NotImplementedError()
+
+ def score(
+ self, prepared_eval_dataset: List[TScorerInputSample]
+ ) -> List[SingleEvalResult]:
+ return [self.score_sample(sample) for sample in prepared_eval_dataset]
+
+
+class BaseTask(ABC):
+ def __init__(self, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
+
+ @abstractmethod
+ async def run(self, *args, **kwargs) -> EvalResult:
+ raise NotImplementedError()
+
+
+class Evals(Protocol):
- @webmethod(route="/evaluate/question_answering/")
- def evaluate_question_answering(
+ @webmethod(route="/evals/run_eval_task")
+ async def run_eval_task(
self,
- metrics: List[QuestionAnsweringMetric],
- ) -> EvaluationJob: ...
+ eval_task_config: EvaluateTaskConfig,
+ ) -> EvaluateResponse: ...
- @webmethod(route="/evaluate/summarization/")
- def evaluate_summarization(
+ @webmethod(route="/evals/run_scorer")
+ async def run_scorer(
self,
- metrics: List[SummarizationMetric],
- ) -> EvaluationJob: ...
+ dataset_config: EvaluateDatasetConfig,
+ eval_scoring_config: EvaluateScoringConfig,
+ ) -> EvaluateResponse: ...
- @webmethod(route="/evaluate/jobs")
- def get_evaluation_jobs(self) -> List[EvaluationJob]: ...
+ # @webmethod(route="/evals/jobs")
+ # def get_evaluation_jobs(self) -> List[EvaluationJob]: ...
- @webmethod(route="/evaluate/job/status")
- def get_evaluation_job_status(
- self, job_uuid: str
- ) -> EvaluationJobStatusResponse: ...
+ # @webmethod(route="/evals/job/create")
+ # async def create_evaluation_job(
+ # self, model: str, dataset: str, task: str
+ # ) -> EvaluationJob: ...
- # sends SSE stream of logs
- @webmethod(route="/evaluate/job/logs")
- def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ...
+ # @webmethod(route="/evals/job/status")
+ # def get_evaluation_job_status(
+ # self, job_uuid: str
+ # ) -> EvaluationJobStatusResponse: ...
- @webmethod(route="/evaluate/job/cancel")
- def cancel_evaluation_job(self, job_uuid: str) -> None: ...
+ # # sends SSE stream of logs
+ # @webmethod(route="/evals/job/logs")
+ # def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ...
- @webmethod(route="/evaluate/job/artifacts")
- def get_evaluation_job_artifacts(
- self, job_uuid: str
- ) -> EvaluationJobArtifactsResponse: ...
+ # @webmethod(route="/evals/job/cancel")
+ # def cancel_evaluation_job(self, job_uuid: str) -> None: ...
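Note: the abstract bases above (BaseGeneratorProcessor, BaseGenerator, BaseScorer, BaseTask) describe the stages of an eval run. One plausible way to chain them is sketched below for orientation; run_pipeline is a made-up helper and not the provider's RunEvalTask implementation.

from llama_stack.apis.datasets.datasets import BaseDataset, DictSample
from llama_stack.apis.evals.evals import (
    BaseGenerator,
    BaseGeneratorProcessor,
    BaseScorer,
    EvalResult,
)


async def run_pipeline(
    dataset: BaseDataset[DictSample],
    processor: BaseGeneratorProcessor,
    generator: BaseGenerator,
    scorer: BaseScorer,
) -> EvalResult:
    # rows -> prompts -> model generations -> scorer inputs -> aggregated metrics
    preprocessed = processor.preprocess(dataset)
    generations = await generator.generate(preprocessed)
    scorer_inputs = processor.postprocess(generations, dataset)
    single_results = scorer.score(scorer_inputs)
    return scorer.aggregate_results(single_results)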
diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py
index d943f48b20..cdfe5c4673 100644
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -14,7 +14,7 @@
from pydantic import BaseModel, Field
from llama_models.llama3.api.datatypes import * # noqa: F403
-from llama_stack.apis.dataset import * # noqa: F403
+from llama_stack.apis.datasets import * # noqa: F403
from llama_stack.apis.common.training_types import * # noqa: F403
diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py
index 0044de09ee..ce7f5a8e50 100644
--- a/llama_stack/distribution/datatypes.py
+++ b/llama_stack/distribution/datatypes.py
@@ -73,6 +73,16 @@ class RoutingTableProviderSpec(ProviderSpec):
pip_packages: List[str] = Field(default_factory=list)
+# Example: /datasets
+class RegistryProviderSpec(ProviderSpec):
+ provider_type: str = "registry"
+ config_class: str = ""
+ docker_image: Optional[str] = None
+
+ module: str
+ pip_packages: List[str] = Field(default_factory=list)
+
+
class DistributionSpec(BaseModel):
description: Optional[str] = Field(
default="",
diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py
index 999646cc06..d96db23b46 100644
--- a/llama_stack/distribution/distribution.py
+++ b/llama_stack/distribution/distribution.py
@@ -21,6 +21,19 @@ class AutoRoutedApiInfo(BaseModel):
router_api: Api
+class RegistryApiInfo(BaseModel):
+ registry_api: Api
+ # registry: Registry
+
+
+def builtin_registry_apis() -> List[RegistryApiInfo]:
+ return [
+ RegistryApiInfo(
+ registry_api=Api.datasets,
+ )
+ ]
+
+
def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
return [
AutoRoutedApiInfo(
@@ -42,7 +55,12 @@ def providable_apis() -> List[Api]:
routing_table_apis = set(
x.routing_table_api for x in builtin_automatically_routed_apis()
)
- return [api for api in Api if api not in routing_table_apis and api != Api.inspect]
+ registry_apis = set(
+ x.registry_api for x in builtin_registry_apis() if x.registry_api
+ )
+ non_providable_apis = routing_table_apis | registry_apis | {Api.inspect}
+
+ return [api for api in Api if api not in non_providable_apis]
def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]:
diff --git a/llama_stack/distribution/registry/__init__.py b/llama_stack/distribution/registry/__init__.py
new file mode 100644
index 0000000000..6e68333280
--- /dev/null
+++ b/llama_stack/distribution/registry/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any
+
+from llama_stack.providers.datatypes import Api
+from .datasets.dataset import DatasetRegistryImpl
+
+
+async def get_registry_impl(api: Api, _deps) -> Any:
+ api_to_registry = {
+ "datasets": DatasetRegistryImpl,
+ }
+
+ if api.value not in api_to_registry:
+ raise ValueError(f"API {api.value} not found in registry map")
+
+ impl = api_to_registry[api.value]()
+ await impl.initialize()
+ return impl
diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py
new file mode 100644
index 0000000000..4474c8d7d8
--- /dev/null
+++ b/llama_stack/distribution/registry/datasets/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.apis.datasets import * # noqa: F403
+from ..registry import Registry
+
+
+DatasetRegistry = Registry[BaseDataset]()
diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py
new file mode 100644
index 0000000000..838e8c65fa
--- /dev/null
+++ b/llama_stack/distribution/registry/datasets/dataset.py
@@ -0,0 +1,83 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.datasets import * # noqa: F403
+from llama_stack.distribution.registry.datasets import DatasetRegistry
+from llama_stack.distribution.registry.datasets.dataset_wrappers import (
+ CustomDataset,
+ HuggingfaceDataset,
+)
+
+
+class DatasetRegistryImpl(Datasets):
+ """API Impl to interact with underlying dataset registry"""
+
+ def __init__(
+ self,
+ ) -> None:
+ pass
+
+ async def initialize(self) -> None:
+ pass
+
+ async def shutdown(self) -> None:
+ pass
+
+ async def create_dataset(
+ self,
+ dataset_def: DatasetDef,
+ ) -> CreateDatasetResponse:
+ if dataset_def.type == DatasetType.huggingface.value:
+ dataset_cls = HuggingfaceDataset(dataset_def)
+ else:
+ dataset_cls = CustomDataset(dataset_def)
+
+ try:
+ DatasetRegistry.register(
+ dataset_def.identifier,
+ dataset_cls,
+ )
+ except ValueError as e:
+ return CreateDatasetResponse(
+ status=DatasetsResponseStatus.fail,
+ msg=str(e),
+ )
+
+ return CreateDatasetResponse(
+ status=DatasetsResponseStatus.success,
+ msg=f"Dataset '{dataset_def.identifier}' registered",
+ )
+
+ async def get_dataset(
+ self,
+ dataset_identifier: str,
+ ) -> Optional[DatasetDef]:
+ try:
+ dataset_ref = DatasetRegistry.get(dataset_identifier).config
+ except ValueError as e:
+ return None
+
+ return dataset_ref
+
+ async def delete_dataset(self, dataset_identifier: str) -> DeleteDatasetResponse:
+ try:
+ DatasetRegistry.delete(dataset_identifier)
+ except ValueError as e:
+ return DeleteDatasetResponse(
+ status=DatasetsResponseStatus.fail,
+ msg=str(e),
+ )
+
+ return DeleteDatasetResponse(
+ status=DatasetsResponseStatus.success,
+ msg=f"Dataset '{dataset_identifier}' deleted",
+ )
+
+ async def list_datasets(self) -> List[DatasetDef]:
+ return [
+ DatasetRegistry.get(dataset_identifier).config
+ for dataset_identifier in DatasetRegistry.names()
+ ]
diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py
new file mode 100644
index 0000000000..6c9af5887c
--- /dev/null
+++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py
@@ -0,0 +1,115 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import base64
+import io
+from urllib.parse import unquote
+
+import pandas
+from datasets import Dataset, load_dataset
+
+from llama_stack.apis.datasets import * # noqa: F403
+from llama_stack.providers.utils.memory.vector_store import parse_data_url
+
+
+class CustomDataset(BaseDataset[DictSample]):
+ def __init__(self, config: CustomDatasetDef) -> None:
+ super().__init__()
+ self.config = config
+ self.dataset = None
+ self.index = 0
+
+ @property
+ def dataset_id(self) -> str:
+ return self.config.identifier
+
+ def __iter__(self) -> Iterator[DictSample]:
+ if not self.dataset:
+ self.load()
+ return (DictSample(data=x) for x in self.dataset)
+
+ def __str__(self) -> str:
+ return f"CustomDataset({self.config})"
+
+ def __len__(self) -> int:
+ if not self.dataset:
+ self.load()
+ return len(self.dataset)
+
+ def load(self, n_samples: Optional[int] = None) -> None:
+ if self.dataset:
+ return
+
+ # TODO: more robust support w/ data url
+ if self.config.url.endswith(".csv"):
+ df = pandas.read_csv(self.config.url)
+ elif self.config.url.endswith(".xlsx"):
+ df = pandas.read_excel(self.config.url)
+ elif self.config.url.startswith("data:"):
+ parts = parse_data_url(self.config.url)
+ data = parts["data"]
+ if parts["is_base64"]:
+ data = base64.b64decode(data)
+ else:
+ data = unquote(data)
+ encoding = parts["encoding"] or "utf-8"
+ data = data.encode(encoding)
+
+ mime_type = parts["mimetype"]
+ mime_category = mime_type.split("/")[0]
+ data_bytes = io.BytesIO(data)
+
+ if mime_category == "text":
+ df = pandas.read_csv(data_bytes)
+ else:
+ df = pandas.read_excel(data_bytes)
+ else:
+ raise ValueError(f"Unsupported file type: {self.config.url}")
+
+ if n_samples is not None:
+ df = df.sample(n=min(n_samples, len(df)))
+
+ self.dataset = Dataset.from_pandas(df)
+ if self.config.rename_columns_map:
+ for k, v in self.config.rename_columns_map.items():
+ self.dataset = self.dataset.rename_column(k, v)
+
+
+class HuggingfaceDataset(BaseDataset[DictSample]):
+ def __init__(self, config: HuggingfaceDatasetDef):
+ super().__init__()
+ self.config = config
+ self.dataset = None
+
+ @property
+ def dataset_id(self) -> str:
+ return self.config.identifier
+
+ def __iter__(self) -> Iterator[DictSample]:
+ if not self.dataset:
+ self.load()
+ return (DictSample(data=x) for x in self.dataset)
+
+ def __str__(self):
+ return f"HuggingfaceDataset({self.config})"
+
+ def __len__(self):
+ if not self.dataset:
+ self.load()
+ return len(self.dataset)
+
+ def load(self, n_samples: Optional[int] = None):
+ if self.dataset:
+ return
+
+ if self.config.dataset_name:
+ self.config.kwargs["name"] = self.config.dataset_name
+
+ self.dataset = load_dataset(self.config.dataset_path, **self.config.kwargs)
+
+ if n_samples:
+ self.dataset = self.dataset.select(range(n_samples))
+
+ if self.config.rename_columns_map:
+ for k, v in self.config.rename_columns_map.items():
+ self.dataset = self.dataset.rename_column(k, v)
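Note: a quick sketch of how one of these wrappers is consumed; load() is lazy and triggered on the first iteration or len() call. The CSV URL reuses the public MMLU file from the client example earlier in this patch.

from llama_stack.apis.datasets.datasets import CustomDatasetDef
from llama_stack.distribution.registry.datasets.dataset_wrappers import CustomDataset

dataset = CustomDataset(
    CustomDatasetDef(
        identifier="mmlu-simple-eval-en",
        url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
    )
)
print(len(dataset))        # triggers the CSV download and load
for sample in dataset:     # each row becomes a DictSample
    print(sample.data)
    break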
diff --git a/llama_stack/distribution/registry/generator_processors/__init__.py b/llama_stack/distribution/registry/generator_processors/__init__.py
new file mode 100644
index 0000000000..862984f548
--- /dev/null
+++ b/llama_stack/distribution/registry/generator_processors/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.apis.evals import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.processor import * # noqa: F403
+
+from ..registry import Registry
+
+# TODO: decide whether we should group dataset+processor together via Tasks
+GeneratorProcessorRegistry = Registry[BaseGeneratorProcessor]()
+
+PROCESSOR_REGISTRY = {
+ "mmlu": MMLUProcessor,
+ "judge": JudgeProcessor,
+}
+
+for k, v in PROCESSOR_REGISTRY.items():
+ GeneratorProcessorRegistry.register(k, v)
diff --git a/llama_stack/distribution/registry/registry.py b/llama_stack/distribution/registry/registry.py
new file mode 100644
index 0000000000..702ed7d869
--- /dev/null
+++ b/llama_stack/distribution/registry/registry.py
@@ -0,0 +1,36 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import AbstractSet, Generic, TypeVar
+
+TRegistry = TypeVar("TRegistry")
+
+
+class Registry(Generic[TRegistry]):
+
+ def __init__(self) -> None:
+ super().__init__()
+ self.registry = {}
+
+ def names(self) -> AbstractSet[str]:
+ return self.registry.keys()
+
+ def register(self, name: str, task: TRegistry) -> None:
+ if name in self.registry:
+ raise ValueError(f"Dataset {name} already exists.")
+ self.registry[name] = task
+
+ def get(self, name: str) -> TRegistry:
+ if name not in self.registry:
+ raise ValueError(f"Dataset {name} not found.")
+ return self.registry[name]
+
+ def delete(self, name: str) -> None:
+ if name not in self.registry:
+ raise ValueError(f"Dataset {name} not found.")
+ del self.registry[name]
+
+ def reset(self) -> None:
+ self.registry = {}
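Note: Registry is a plain name-to-object map shared by the dataset, processor, and scorer registries introduced below; duplicate registrations and missing lookups raise ValueError. A tiny illustrative usage (the "greeting" entry is made up):

from llama_stack.distribution.registry.registry import Registry

registry: Registry[str] = Registry()
registry.register("greeting", "hello")
assert registry.get("greeting") == "hello"
registry.delete("greeting")
# registry.get("greeting") would now raise ValueError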
diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py
new file mode 100644
index 0000000000..dda71d4e00
--- /dev/null
+++ b/llama_stack/distribution/registry/scorers/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+# TODO: make these import config based
+from llama_stack.apis.evals import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.braintrust_scorer import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.llm_judge_scorer import * # noqa: F403
+
+from ..registry import Registry
+
+# TODO: make these import config based
+ScorerRegistry = Registry[BaseScorer]()
+
+SCORER_REGISTRY = {
+ "accuracy": AccuracyScorer,
+ "random": RandomScorer,
+ "llamastack-llm-judge": LlamaStackLLMJudgeScorer,
+ "braintrust::factuality": BraintrustFactualityScorer,
+ "braintrust::answer-correctness": BraintrustAnswerCorrectnessScorer,
+}
+
+for k, v in SCORER_REGISTRY.items():
+ ScorerRegistry.register(k, v)
diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py
index a05e08cd7c..e71c3fd8ce 100644
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@@ -12,6 +12,8 @@
from llama_stack.distribution.datatypes import * # noqa: F403
from llama_stack.apis.agents import Agents
+from llama_stack.apis.datasets import Datasets
+from llama_stack.apis.evals import Evals
from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect
from llama_stack.apis.memory import Memory
@@ -22,6 +24,7 @@
from llama_stack.apis.telemetry import Telemetry
from llama_stack.distribution.distribution import (
builtin_automatically_routed_apis,
+ builtin_registry_apis,
get_provider_registry,
)
from llama_stack.distribution.utils.dynamic import instantiate_class_type
@@ -38,6 +41,8 @@ def api_protocol_map() -> Dict[Api, Any]:
Api.safety: Safety,
Api.shields: Shields,
Api.telemetry: Telemetry,
+ Api.evals: Evals,
+ Api.datasets: Datasets,
}
@@ -137,6 +142,20 @@ async def resolve_impls_with_routing(run_config: StackRunConfig) -> Dict[Api, An
)
}
+ for info in builtin_registry_apis():
+ providers_with_specs[info.registry_api.value] = {
+ "__builtin__": ProviderWithSpec(
+ provider_id="__registry__",
+ provider_type="__registry__",
+ config={},
+ spec=RegistryProviderSpec(
+ api=info.registry_api,
+ module="llama_stack.distribution.registry",
+ deps__=[],
+ ),
+ )
+ }
+
sorted_providers = topological_sort(
{k: v.values() for k, v in providers_with_specs.items()}
)
@@ -257,6 +276,12 @@ async def instantiate_provider(
config = None
args = [provider_spec.api, inner_impls, deps]
+ elif isinstance(provider_spec, RegistryProviderSpec):
+ print("ROUTER PROVIDER SPEC")
+ method = "get_registry_impl"
+
+ config = None
+ args = [provider_spec.api, deps]
else:
method = "get_provider_impl"
diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py
index 777cd855b7..1d397c9e73 100644
--- a/llama_stack/providers/datatypes.py
+++ b/llama_stack/providers/datatypes.py
@@ -32,6 +32,9 @@ class Api(Enum):
# built-in API
inspect = "inspect"
+ evals = "evals"
+ datasets = "datasets"
+
class ModelsProtocolPrivate(Protocol):
async def list_models(self) -> List[ModelDef]: ...
diff --git a/llama_stack/providers/impls/meta_reference/evals/__init__.py b/llama_stack/providers/impls/meta_reference/evals/__init__.py
new file mode 100644
index 0000000000..f4dd4b79d6
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import MetaReferenceEvalsImplConfig # noqa
+from llama_stack.apis.inference import * # noqa: F403
+from llama_stack.distribution.datatypes import Api, ProviderSpec
+
+
+async def get_provider_impl(
+ config: MetaReferenceEvalsImplConfig, deps: Dict[Api, ProviderSpec]
+):
+ from .evals import MetaReferenceEvalsImpl
+
+ impl = MetaReferenceEvalsImpl(config, deps[Api.inference])
+ await impl.initialize()
+ return impl
diff --git a/llama_stack/providers/impls/meta_reference/evals/config.py b/llama_stack/providers/impls/meta_reference/evals/config.py
new file mode 100644
index 0000000000..05dee366ed
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/config.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pydantic import BaseModel
+
+
+class MetaReferenceEvalsImplConfig(BaseModel): ...
diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py
new file mode 100644
index 0000000000..7d3eaa85d8
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/evals.py
@@ -0,0 +1,58 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import json
+
+from termcolor import cprint
+
+from llama_stack.apis.inference import * # noqa: F403
+from llama_stack.apis.evals import * # noqa: F403
+from llama_stack.apis.datasets import * # noqa: F403
+
+from .config import MetaReferenceEvalsImplConfig
+from .tasks.run_eval_task import RunEvalTask
+from .tasks.run_scoring_task import RunScoringTask
+
+
+class MetaReferenceEvalsImpl(Evals):
+ def __init__(self, config: MetaReferenceEvalsImplConfig, inference_api: Inference):
+ self.inference_api = inference_api
+
+ async def initialize(self) -> None:
+ pass
+
+ async def shutdown(self) -> None:
+ pass
+
+ async def run_eval_task(
+ self,
+ eval_task_config: EvaluateTaskConfig,
+ ) -> EvaluateResponse:
+ cprint(f"run_eval_task: on {eval_task_config}", "green")
+
+ run_task = RunEvalTask()
+ eval_result = await run_task.run(eval_task_config, self.inference_api)
+
+ return EvaluateResponse(
+ eval_result=eval_result,
+ formatted_report=json.dumps(eval_result.json(), indent=4),
+ )
+
+ async def run_scorer(
+ self,
+ dataset_config: EvaluateDatasetConfig,
+ eval_scoring_config: EvaluateScoringConfig,
+ ) -> EvaluateResponse:
+ cprint(f"run_scorer: on {dataset_config} with {eval_scoring_config}", "green")
+
+ run_task = RunScoringTask()
+ eval_result = await run_task.run(
+ dataset_config, eval_scoring_config, self.inference_api
+ )
+
+ return EvaluateResponse(
+ eval_result=eval_result,
+ formatted_report=json.dumps(eval_result.json(), indent=4),
+ )
diff --git a/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py b/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py
new file mode 100644
index 0000000000..756f351d88
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py
new file mode 100644
index 0000000000..dafbb16f5b
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py
@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from termcolor import cprint
+
+from llama_stack.apis.evals import * # noqa: F403
+from llama_stack.apis.inference import * # noqa: F403
+
+
+class InferenceGenerator(BaseGenerator[PreprocessedSample, GenerationResponseSample]):
+ """
+ InferenceGenerator for LlamaStack
+ """
+
+ def __init__(
+ self,
+ model,
+ inference_api,
+ *args,
+ **kwargs,
+ ) -> None:
+ super().__init__(*args, **kwargs)
+ self.model = model
+ self.inference_api = inference_api
+
+ async def generate(
+ self, preprocessed_dataset: List[PreprocessedSample]
+ ) -> List[GenerationResponseSample]:
+ generation_outputs = []
+ for sample in preprocessed_dataset:
+ response = await self.inference_api.chat_completion(
+ model=self.model,
+ messages=sample.generation_input.messages,
+ stream=False,
+ )
+ cprint(f"response: {response}", "cyan")
+
+ generation_outputs.append(
+ GenerationResponseSample(
+ generation_output=GenerationOutput(
+ completion_message=response.completion_message.content
+ )
+ )
+ )
+ return generation_outputs
diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py
new file mode 100644
index 0000000000..5a7ca27958
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from .judge_processor import JudgeProcessor # noqa: F401
+from .mmlu_processor import MMLUProcessor # noqa: F401
diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py b/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py
new file mode 100644
index 0000000000..d7d6ae3eb2
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import re
+
+from llama_stack.apis.evals import * # noqa: F403
+
+JUDGE_PROMPT = """
+You will be given a question, an expected_answer, and a system_answer.
+Your task is to provide a 'total rating' scoring how well the system_answer matches the ground truth in expected_answer in terms of factual correctness to the question.
+Give your answer as an integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question.
+
+Provide your feedback as follows:
+
+Feedback:::
+Total rating: (your rating, as an int between 0 and 5)
+
+Now here are the question, expected_answer, system_answer.
+
+Question: {question}
+Expected Answer: {expected_answer}
+System Answer: {answer}
+
+Feedback:::
+Total rating:
+"""
+
+
+class JudgeProcessor(
+ BaseGeneratorProcessor[
+ DictSample, PreprocessedSample, GenerationResponseSample, ScorerInputSample
+ ]
+):
+ """
+ Generator processor for LLM Judge
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def preprocess_sample(self, sample: DictSample) -> PreprocessedSample:
+ content = JUDGE_PROMPT.format(
+ question=sample.data["input_query"],
+ expected_answer=sample.data["expected_answer"],
+ answer=sample.data["generated_answer"],
+ )
+ preprocessed_msgs = [
+ {
+ "role": "user",
+ "content": content,
+ }
+ ]
+ processed_sample = PreprocessedSample(
+ generation_input=GenerationInput(
+ messages=preprocessed_msgs,
+ )
+ )
+ return processed_sample
+
+ def postprocess_sample(
+ self, generation_sample: GenerationResponseSample, dataset_sample: DictSample
+ ) -> ScorerInputSample:
+ response_text = generation_sample.generation_output.completion_message
+ match = re.search(r"Total rating: (\d+)", response_text)
+ # fall back to 0 when the judge response contains no parsable rating
+ judge_rating = int(match.group(1)) if match else 0
+
+ return ScorerInputSample(
+ generated_answer=str(judge_rating),
+ expected_answer=dataset_sample.data["expected_answer"],
+ generation_output=PostprocessedGeneration(
+ completion_message=response_text,
+ ),
+ )
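Note: postprocess_sample above assumes the judge echoes the "Total rating:" line requested by JUDGE_PROMPT; a tiny illustrative check of the extraction (the response text is made up):

import re

response_text = "Feedback:::\nTotal rating: 4"
match = re.search(r"Total rating: (\d+)", response_text)
judge_rating = int(match.group(1)) if match else 0  # -> 4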
diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py
new file mode 100644
index 0000000000..fc2d9eb642
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py
@@ -0,0 +1,161 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import re
+
+from llama_stack.apis.evals import * # noqa: F403
+
+QUERY_TEMPLATE_MULTICHOICE = """
+Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD.
+
+{Question}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
+
+MULTILINGUAL_ANSWER_REGEXES = [
+ r"Answer\s*:",
+ r"Answer\s*:", # Korean invisible character
+ r"উত্তর\s*:",
+ r"उत्तर\s*:",
+ r"উত্তরঃ",
+ r"উত্তর\s*:",
+ r"Antwort\s*:",
+ r"답변\s*:",
+ r"정답\s*:",
+ r"답\s*:",
+ r"答案\s*:",
+ r"答案\s*:",
+ r"答\s*:",
+ r"答\s*:",
+ r"答复\s*:",
+ r"答曰\s*:",
+ r"الإجابة:",
+ r"الجواب:",
+ r"إجابة:",
+ r"الإجابة النهائية:",
+ r"الإجابة الصحيحة:",
+ r"الإجابة الصحيحة هي:",
+ r"الإجابة هي:",
+ r"Respuesta\s*:",
+ r"Risposta\s*:",
+ r"答え\s*:",
+ r"答え\s*:",
+ r"回答\s*:",
+ r"回答\s*:",
+ r"解答\s*:",
+ r"Jawaban\s*:",
+ r"Réponse\s*:",
+ r"Resposta\s*:",
+ r"Jibu\s*:",
+ r"Idahun\s*:",
+ r"Ìdáhùn\s*:",
+ r"Idáhùn\s*:",
+ r"Àmọ̀nà\s*:",
+ r"Àdáhùn\s*:",
+ r"Ànúgọ\s*:",
+ r"Àṣàyàn\s*:",
+]
+
+MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = (
+ r"(?i){}\s*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[A]|[B]|[C]|[D])"
+)
+
+
+def normalize_response(response: str) -> str:
+ """
+ Normalize the response by removing markdown and LaTeX formatting that may prevent a match.
+ """
+
+ return (
+ response.replace("**", "")
+ .replace("$\\boxed{", "")
+ .replace("}$", "")
+ .replace("\\$", "")
+ .replace("$\\text{", "")
+ .replace("$", "")
+ .replace("\\mathrm{", "")
+ .replace("\\{", "")
+ .replace("\\text", "")
+ .replace("\\(", "")
+ .replace("\\mathbf{", "")
+ .replace("{", "")
+ .replace("\\boxed", "")
+ )
+
+
+def normalize_extracted_answer(extracted_answer: str) -> str:
+ return (
+ # In arabic these are the letters used for A-D in multiple choice questions
+ extracted_answer.replace("أ", " A")
+ .replace("ب", " B")
+ .replace("ج", " C")
+ .replace("د", " D")
+ # In Bengali these are the letters used for A-D in multiple choice questions
+ .replace("অ", " A")
+ .replace("ব", " B")
+ .replace("ড", " C")
+ .replace("ঢ", " D")
+ # In Japanese these are the letters sometimes used for A-D in multiple choice questions
+ .replace("A", " A")
+ .replace("B", " B")
+ .replace("C", " C")
+ .replace("D", " D")
+ .strip()
+ )
+
+
+class MMLUProcessor(
+ BaseGeneratorProcessor[
+ DictSample, PreprocessedSample, GenerationResponseSample, ScorerInputSample
+ ]
+):
+ """
+ Generator processor for MMLU
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def preprocess_sample(self, sample: DictSample) -> PreprocessedSample:
+ content = QUERY_TEMPLATE_MULTICHOICE.format(**sample.data)
+ preprocessed_msgs = [
+ {
+ "role": "user",
+ "content": content,
+ }
+ ]
+ processed_sample = PreprocessedSample(
+ generation_input=GenerationInput(
+ messages=preprocessed_msgs,
+ )
+ )
+ return processed_sample
+
+ def postprocess_sample(
+ self, generation_sample: GenerationResponseSample, dataset_sample: DictSample
+ ) -> ScorerInputSample:
+ response_text = generation_sample.generation_output.completion_message
+ normalized_response = normalize_response(response_text)
+
+ # extract answer
+ extracted_answer = ""
+ for answer_regex in MULTILINGUAL_ANSWER_REGEXES:
+ regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex)
+ match = re.search(regex, normalized_response)
+ if match:
+ extracted_answer = normalize_extracted_answer(match.group(1))
+ break
+
+ return ScorerInputSample(
+ generated_answer=extracted_answer,
+ expected_answer=dataset_sample.data["Answer"],
+ generation_output=PostprocessedGeneration(
+ completion_message=response_text,
+ ),
+ )
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py
new file mode 100644
index 0000000000..6424963f87
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from .basic_scorers import * # noqa: F401 F403
+from .aggregate_scorer import * # noqa: F401 F403
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py
new file mode 100644
index 0000000000..1a0621960e
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult
+from llama_stack.apis.datasets.datasets import * # noqa: F401 F403
+
+
+class AggregateScorer(BaseScorer[ScorerInputSample]):
+ def __init__(self, scorers: List[BaseScorer[ScorerInputSample]]):
+ self.scorers = scorers
+
+ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+ all_score_data = {}
+ for scorer in self.scorers:
+ score_data = scorer.score_sample(scorer_input_sample).score_data
+ for k, v in score_data.items():
+ all_score_data[k] = v
+
+ return SingleEvalResult(
+ score_data=all_score_data,
+ )
+
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ all_metrics = {}
+
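+        # Prefix each metric with the scorer class name so metrics from different scorers do not collide.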
+ for scorer in self.scorers:
+ metrics = scorer.aggregate_results(eval_results).metrics
+ for k, v in metrics.items():
+ all_metrics[f"{scorer.__class__.__name__}:{k}"] = v
+
+ return EvalResult(
+ metrics=all_metrics,
+ )
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py
new file mode 100644
index 0000000000..748f9fc1f8
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import random
+
+from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult
+from llama_stack.apis.datasets.datasets import * # noqa: F401 F403
+
+
+class RandomScorer(BaseScorer[ScorerInputSample]):
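+    """Baseline scorer that assigns a uniform random score in [0, 1) to every sample."""
+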
+ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+ return SingleEvalResult(score_data={"random": random.random()})
+
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ avg_random = sum(
+ [result.score_data["random"] for result in eval_results]
+ ) / len(eval_results)
+ max_random = max([result.score_data["random"] for result in eval_results])
+ return EvalResult(
+ metrics={
+ "avg_random": avg_random,
+ "max_random": max_random,
+ }
+ )
+
+
+class AccuracyScorer(BaseScorer[ScorerInputSample]):
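+    """Exact-match accuracy; the expected answer may be a single value or a list of acceptable values."""
+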
+ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+ extracted_answer = scorer_input_sample.generated_answer
+ expected_answer = scorer_input_sample.expected_answer
+
+ if isinstance(expected_answer, list):
+ accuracy = (
+ 1.0 if extracted_answer and extracted_answer in expected_answer else 0.0
+ )
+ else:
+ accuracy = (
+ 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0
+ )
+
+ return SingleEvalResult(score_data={"accuracy": accuracy})
+
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ num_correct = sum([result.score_data["accuracy"] for result in eval_results])
+ num_total = len(eval_results)
+
+ return EvalResult(
+ metrics={
+ "avg_accuracy": num_correct / num_total,
+ "num_correct": num_correct,
+ "num_total": num_total,
+ }
+ )
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py
new file mode 100644
index 0000000000..c124aaad6a
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py
@@ -0,0 +1,57 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import numpy as np
+
+from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult
+from llama_stack.apis.datasets.datasets import * # noqa: F401 F403
+from autoevals.llm import * # noqa: F403
+from autoevals.ragas import * # noqa: F403
+
+
+class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]):
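+    """Scores factuality of the generated answer against the expected answer using the autoevals Factuality grader."""
+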
+ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+ input_query = scorer_input_sample.input_query
+ generated_answer = scorer_input_sample.generated_answer
+ expected_answer = scorer_input_sample.expected_answer
+
+ evaluator = Factuality()
+ result = evaluator(generated_answer, expected_answer, input=input_query)
+ factuality = result.score
+ return SingleEvalResult(score_data={"factuality": factuality})
+
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ avg_score = np.average(
+ [result.score_data["factuality"] for result in eval_results]
+ )
+
+ return EvalResult(
+ metrics={
+ "avg_factuality_score": avg_score,
+ }
+ )
+
+
+class BraintrustAnswerCorrectnessScorer(BaseScorer[ScorerInputSample]):
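+    """Scores the generated answer against the expected answer using the autoevals AnswerCorrectness grader."""
+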
+ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+ input_query = scorer_input_sample.input_query
+ generated_answer = scorer_input_sample.generated_answer
+ expected_answer = scorer_input_sample.expected_answer
+
+ evaluator = AnswerCorrectness()
+ result = evaluator(generated_answer, expected_answer, input=input_query)
+ correctness = result.score
+ return SingleEvalResult(score_data={"answer_correctness": correctness})
+
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ avg_score = np.average(
+ [result.score_data["answer_correctness"] for result in eval_results]
+ )
+
+ return EvalResult(
+ metrics={
+ "avg_correctness_score": avg_score,
+ }
+ )
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py
new file mode 100644
index 0000000000..f5f56b435f
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py
@@ -0,0 +1,83 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import asyncio
+import threading
+
+import numpy as np
+
+from llama_stack.distribution.registry.generator_processors import (
+ GeneratorProcessorRegistry,
+)
+from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import (
+ InferenceGenerator,
+)
+
+from llama_stack.apis.evals.evals import * # noqa: F401 F403
+from llama_stack.apis.datasets.datasets import * # noqa: F401 F403
+from llama_stack.apis.inference import * # noqa: F403
+
+
+class LlamaStackLLMJudgeScorer(BaseScorer[ScorerInputSample]):
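+    """LLM-as-judge scorer: a generator processor builds the judge prompt, the judge model generates through the
+    inference API, and the judge's answer is parsed into a numeric score."""
+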
+ def __init__(self, llm_judge_config: LLMJudgeConfig, inference_api: Inference):
+ self.llm_judge_config = llm_judge_config
+ self.inference_api = inference_api
+ # https://stackoverflow.com/questions/74703727/how-to-call-async-function-from-sync-funcion-and-get-result-while-a-loop-is-alr
+        # We will use another thread with its own event loop to run the async api within a sync function
+ self._loop = asyncio.new_event_loop()
+ self._thr = threading.Thread(
+ target=self._loop.run_forever, name="Async Runner", daemon=True
+ )
+ if not self._thr.is_alive():
+ self._thr.start()
+
+ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+ input_query = scorer_input_sample.input_query
+ generated_answer = scorer_input_sample.generated_answer
+ expected_answer = scorer_input_sample.expected_answer
+
+        # Judge F1: build the judge prompt from the sample via the configured generator processor
+ processor = GeneratorProcessorRegistry.get(
+ self.llm_judge_config.judge_processor_config.processor_identifier
+ )()
+ data_sample = DictSample(
+ data={
+ "input_query": input_query,
+ "generated_answer": generated_answer,
+ "expected_answer": expected_answer,
+ }
+ )
+ preprocessed_sample = processor.preprocess_sample(data_sample)
+
+ # Judge Generation
+ generator = InferenceGenerator(
+ model=self.llm_judge_config.judge_model_generation_config.model,
+ inference_api=self.inference_api,
+ )
+
+ future = asyncio.run_coroutine_threadsafe(
+ generator.generate([preprocessed_sample]), self._loop
+ )
+ generation_outputs = future.result()
+        # Judge F2: postprocess the judge generation into a scorer input
+ postprocessed_sample = processor.postprocess_sample(
+ generation_outputs[0], data_sample
+ )
+
+        # Judge F3: the judge is expected to output a numeric score
+ score = float(postprocessed_sample.generated_answer)
+
+ return SingleEvalResult(score_data={"judge_score": score})
+
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ avg_score = np.average(
+ [result.score_data["judge_score"] for result in eval_results]
+ )
+
+ return EvalResult(
+ metrics={
+ "avg_judge_score": avg_score,
+ }
+ )
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py b/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py
new file mode 100644
index 0000000000..756f351d88
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py
new file mode 100644
index 0000000000..fbd98128f1
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.distribution.registry.datasets import DatasetRegistry
+from llama_stack.distribution.registry.generator_processors import (
+ GeneratorProcessorRegistry,
+)
+from llama_stack.distribution.registry.scorers import ScorerRegistry
+
+from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import (
+ InferenceGenerator,
+)
+
+
+from llama_stack.apis.evals import * # noqa: F403
+from llama_stack.apis.inference import * # noqa: F403
+from termcolor import cprint
+
+
+class RunEvalTask(BaseTask):
+ """
+    RunEvalTask - run the full eval pipeline (preprocess, generate, postprocess, score) over a dataset
+ """
+
+ def __init__(
+ self,
+ *args,
+ **kwargs,
+ ) -> None:
+ super().__init__(*args, **kwargs)
+
+ async def run(
+ self,
+ eval_task_config: EvaluateTaskConfig,
+ inference_api: Inference,
+ *args,
+ **kwargs,
+ ) -> EvalResult:
+ print(f"Running eval task w/ {eval_task_config}")
+
+ print(DatasetRegistry.names())
+ dataset = DatasetRegistry.get(
+ eval_task_config.dataset_config.dataset_identifier
+ )
+ dataset.load(n_samples=eval_task_config.dataset_config.row_limit)
+ print(f"Running on {len(dataset)} samples")
+
+        # F1: preprocess dataset samples into generation inputs
+ print(GeneratorProcessorRegistry.names())
+ processor = GeneratorProcessorRegistry.get(
+ eval_task_config.processor_config.processor_identifier
+ )()
+ preprocessed = processor.preprocess(dataset)
+
+ # Generation
+ generator = InferenceGenerator(
+ model=eval_task_config.generation_config.model,
+ inference_api=inference_api,
+ )
+ generation_outputs = await generator.generate(preprocessed)
+
+        # F2: postprocess generation outputs into scorer inputs
+ postprocessed = processor.postprocess(generation_outputs, dataset)
+ cprint(postprocessed, "blue")
+
+        # F3: build scorers from the scoring config and score the postprocessed samples
+ scorer_config_list = eval_task_config.scoring_config.scorer_config_list
+ scorer_list = []
+ for s_conf in scorer_config_list:
+ scorer = ScorerRegistry.get(s_conf.scorer_name)
+ if s_conf.llm_judge_config:
+ scorer_list.append(
+ scorer(
+ llm_judge_config=s_conf.llm_judge_config,
+ inference_api=inference_api,
+ )
+ )
+ else:
+ scorer_list.append(scorer())
+
+ scorer = AggregateScorer(
+ scorers=scorer_list,
+ )
+
+ scorer_results = scorer.score(postprocessed)
+ cprint(scorer_results, "magenta")
+ eval_result = scorer.aggregate_results(scorer_results)
+
+ return eval_result
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py
new file mode 100644
index 0000000000..6b11191f1e
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py
@@ -0,0 +1,90 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.distribution.registry.datasets import DatasetRegistry
+from llama_stack.distribution.registry.scorers import ScorerRegistry
+
+from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403
+
+from llama_stack.apis.evals import * # noqa: F403
+from llama_stack.apis.inference import * # noqa: F403
+
+
+class RunScoringTask(BaseTask):
+ """
+ RunScoringTask - only run scoring (F3) based on dataset and scoring config
+ """
+
+ def __init__(
+ self,
+ *args,
+ **kwargs,
+ ) -> None:
+ super().__init__(*args, **kwargs)
+
+ def transform_score_input_sample(
+ self, dataset: BaseDataset
+ ) -> List[ScorerInputSample]:
+ scorer_inputs = []
+ for x in dataset:
+ expected_answer = x.data["expected_answer"]
+ generated_answer = x.data["generated_answer"]
+ input_query = None
+ if "input_query" in x.data:
+ input_query = x.data["input_query"]
+
+ scorer_inputs.append(
+ ScorerInputSample(
+ expected_answer=expected_answer,
+ generated_answer=generated_answer,
+ input_query=input_query,
+ )
+ )
+
+ return scorer_inputs
+
+ async def run(
+ self,
+ dataset_config: EvaluateDatasetConfig,
+ eval_scoring_config: EvaluateScoringConfig,
+ inference_api: Inference,
+ *args,
+ **kwargs,
+ ) -> EvalResult:
+ print(
+ f"Running scoring task w/ dataset={dataset_config} scoring={eval_scoring_config}"
+ )
+
+ dataset = DatasetRegistry.get(dataset_config.dataset_identifier)
+ dataset.load(n_samples=dataset_config.row_limit)
+ print(f"Running on {len(dataset)} samples")
+
+ # transform dataset into List[ScorerInputSample]
+ postprocessed = self.transform_score_input_sample(dataset)
+
+        # F3: build scorers from the scoring config and score the transformed samples
+ scorer_config_list = eval_scoring_config.scorer_config_list
+ scorer_list = []
+ for s_conf in scorer_config_list:
+ scorer = ScorerRegistry.get(s_conf.scorer_name)
+ if s_conf.llm_judge_config:
+ scorer_list.append(
+ scorer(
+ llm_judge_config=s_conf.llm_judge_config,
+ inference_api=inference_api,
+ )
+ )
+ else:
+ scorer_list.append(scorer())
+
+ scorer = AggregateScorer(
+ scorers=scorer_list,
+ )
+
+ scorer_results = scorer.score(postprocessed)
+ eval_result = scorer.aggregate_results(scorer_results)
+
+ return eval_result
diff --git a/llama_stack/providers/impls/third_party/evals/__init__.py b/llama_stack/providers/impls/third_party/evals/__init__.py
new file mode 100644
index 0000000000..756f351d88
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py b/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py
new file mode 100644
index 0000000000..9886ed6d6c
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import EleutherEvalsImplConfig # noqa
+from llama_stack.apis.inference import * # noqa: F403
+from llama_stack.distribution.datatypes import Api, ProviderSpec
+
+
+async def get_provider_impl(
+ config: EleutherEvalsImplConfig, deps: Dict[Api, ProviderSpec]
+):
+ from .eleuther import EleutherEvalsAdapter
+
+ impl = EleutherEvalsAdapter(config, deps[Api.inference])
+ await impl.initialize()
+ return impl
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/config.py b/llama_stack/providers/impls/third_party/evals/eleuther/config.py
new file mode 100644
index 0000000000..a9ab297b42
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/config.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pydantic import BaseModel
+
+
+class EleutherEvalsImplConfig(BaseModel): ...
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py
new file mode 100644
index 0000000000..e4b32a45e0
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py
@@ -0,0 +1,170 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import asyncio
+from llama_stack.apis.inference import * # noqa: F403
+from llama_stack.apis.evals import * # noqa: F403
+import os
+import random
+import threading
+from pathlib import Path
+
+import lm_eval
+import tqdm
+from lm_eval.api.model import LM
+from lm_eval.evaluator import evaluate, get_task_list
+from lm_eval.tasks import get_task_dict, TaskManager
+from termcolor import cprint
+
+from .config import EleutherEvalsImplConfig
+
+
+# https://stackoverflow.com/questions/74703727/how-to-call-async-function-from-sync-funcion-and-get-result-while-a-loop-is-alr
+# We will use another thread with its own event loop to run the async api within a sync function
+_loop = asyncio.new_event_loop()
+_thr = threading.Thread(target=_loop.run_forever, name="Async Runner", daemon=True)
+
+
+class EleutherEvalsWrapper(LM):
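+    """Adapts the Llama Stack inference API to the lm-evaluation-harness LM interface; only generate_until is backed
+    by real inference, while loglikelihood is stubbed with random values."""
+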
+ def __init__(
+ self,
+ inference_api: Inference,
+ model: str,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.inference_api = inference_api
+ self.model = model
+ self.tokenizer = None
+ self.tokenized_requests = False
+ self.kwargs = kwargs
+
+ @property
+ def eot_token_id(self):
+ raise NotImplementedError("Not implemented")
+
+ @property
+ def max_length(self) -> int:
+        raise NotImplementedError("Not implemented")
+
+ @property
+ def max_gen_toks(self) -> int:
+        raise NotImplementedError("Not implemented")
+
+ @property
+ def batch_size(self):
+ # Isn't used because we override _loglikelihood_tokens
+ raise NotImplementedError("No support for logits.")
+
+ @property
+ def device(self):
+ # Isn't used because we override _loglikelihood_tokens
+ raise NotImplementedError("No support for logits.")
+
+ @property
+ def world_size(self):
+ return 1
+
+ def tok_encode(self, string: str) -> List[int]:
+        raise NotImplementedError("Not implemented")
+
+ def tok_decode(self, tokens: List[int]) -> str:
+        raise NotImplementedError("Not implemented")
+
+ def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
+ raise NotImplementedError("No support for logits.")
+
+ def _model_call(self, inps):
+ # Isn't used because we override _loglikelihood_tokens
+ raise NotImplementedError()
+
+ def _model_generate(self, context, max_length, eos_token_id):
+ # Isn't used because we override generate_until
+ raise NotImplementedError()
+
+ def loglikelihood(self, requests, disable_tqdm: bool = False):
+ # TODO: implement inference completion with loglikelihood
+ res = []
+ for req in requests:
+ res.append((-random.random(), False))
+
+ return res
+
+ def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
+ raise NotImplementedError("No support for logits.")
+
+ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
+ res = []
+ if not _thr.is_alive():
+ _thr.start()
+ for req in tqdm.tqdm(requests):
+ chat_completion_coro_fn = self.inference_api.chat_completion(
+ model=self.model,
+ messages=[
+ {
+ "role": "user",
+ "content": req.args[0],
+ }
+ ],
+ stream=False,
+ )
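+            # Run the async chat completion on the background event loop and block until the result is ready.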
+ future = asyncio.run_coroutine_threadsafe(chat_completion_coro_fn, _loop)
+ response = future.result()
+ res.append(response.completion_message.content)
+
+ return res
+
+
+class EleutherEvalsAdapter(Evals):
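+    """Evals provider that runs benchmark tasks through the EleutherAI lm-evaluation-harness."""
+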
+ def __init__(self, config: EleutherEvalsImplConfig, inference_api: Inference):
+ self.inference_api = inference_api
+
+ async def initialize(self) -> None:
+ pass
+
+ async def shutdown(self) -> None:
+ pass
+
+ async def run_evals(
+ self,
+ model: str,
+ task: str,
+ dataset: Optional[str] = None,
+ eval_task_config: Optional[EvaluateTaskConfig] = None,
+ ) -> EvaluateResponse:
+ cprint(f"Eleuther Evals: {model} {dataset} {task}", "red")
+
+        eleuther_wrapper = EleutherEvalsWrapper(self.inference_api, model)
+ current_dir = Path(os.path.dirname(os.path.abspath(__file__)))
+
+ # custom registry of harness tasks
+ task_manager = TaskManager(
+ include_path=str(current_dir / "tasks"),
+ )
+
+ task_dict = get_task_dict(task, task_manager)
+ cprint(task_dict, "blue")
+
+ task_types = set([t.task.OUTPUT_TYPE for t in get_task_list(task_dict)])
+ cprint(task_types, "cyan")
+
+ output = evaluate(
+            eleuther_wrapper,
+ task_dict,
+            limit=eval_task_config.n_samples if eval_task_config else None,
+ )
+
+ eval_result = EvalResult(
+ metrics={},
+ )
+ formatted_output = lm_eval.utils.make_table(output)
+
+ cprint(formatted_output, "green")
+
+ return EvaluateResponse(
+ eval_result=eval_result,
+ formatted_report=formatted_output,
+ )
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml
new file mode 100644
index 0000000000..e10277a314
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml
@@ -0,0 +1,32 @@
+task: meta_ifeval
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__ifeval__strict__details
+output_type: generate_until
+test_split: latest
+process_docs: !function utils.process_docs
+num_fewshot: 0
+doc_to_text: prompt
+doc_to_target: 0
+generation_kwargs:
+ until: []
+ do_sample: false
+ temperature: 0.0
+ max_gen_toks: 1280
+process_results: !function utils.process_results
+metric_list:
+ - metric: prompt_level_strict_acc
+ aggregation: mean
+ higher_is_better: true
+ - metric: inst_level_strict_acc
+ aggregation: !function utils.agg_inst_level_acc
+ higher_is_better: true
+ - metric: prompt_level_loose_acc
+ aggregation: mean
+ higher_is_better: true
+ - metric: inst_level_loose_acc
+ aggregation: !function utils.agg_inst_level_acc
+ higher_is_better: true
+metadata:
+ version: 2.0
+fewshot_config:
+ sampler: first_n
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py
new file mode 100644
index 0000000000..aa171343fd
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py
@@ -0,0 +1,191 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import dataclasses
+from typing import Dict, Optional, Union
+
+import datasets
+
+from lm_eval.tasks.ifeval import instructions_registry
+
+
+@dataclasses.dataclass
+class InputExample:
+ key: int
+ instruction_id_list: list[str]
+ prompt: str
+ kwargs: list[Dict[str, Optional[Union[str, int]]]]
+
+
+@dataclasses.dataclass
+class OutputExample:
+ instruction_id_list: list[str]
+ prompt: str
+ response: str
+ follow_all_instructions: bool
+ follow_instruction_list: list[bool]
+
+
+def test_instruction_following_strict(
+ inp,
+ response,
+):
+ """Tests response to see if instructions are followed."""
+ instruction_list = inp.instruction_id_list
+ is_following_list = []
+
+ for index, instruction_id in enumerate(instruction_list):
+ instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+ instruction = instruction_cls(instruction_id)
+
+ # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+ kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+ instruction.build_description(**kwargs)
+ args = instruction.get_instruction_args()
+ if args and "prompt" in args:
+ instruction.build_description(prompt=inp.prompt)
+
+ if response.strip() and instruction.check_following(response):
+ is_following_list.append(True)
+ else:
+ is_following_list.append(False)
+
+ return OutputExample(
+ instruction_id_list=inp.instruction_id_list,
+ prompt=inp.prompt,
+ response=response,
+ follow_all_instructions=all(is_following_list),
+ follow_instruction_list=is_following_list,
+ )
+
+
+def test_instruction_following_loose(
+ inp,
+ response,
+):
+ """Tests response for an upper bound for following instructions."""
+ r = response.split("\n")
+ response_remove_first = "\n".join(r[1:]).strip()
+ response_remove_last = "\n".join(r[:-1]).strip()
+ response_remove_both = "\n".join(r[1:-1]).strip()
+ revised_response = response.replace("*", "")
+ revised_response_remove_first = response_remove_first.replace("*", "")
+ revised_response_remove_last = response_remove_last.replace("*", "")
+ revised_response_remove_both = response_remove_both.replace("*", "")
+ all_responses = [
+ response,
+ revised_response,
+ response_remove_first,
+ response_remove_last,
+ response_remove_both,
+ revised_response_remove_first,
+ revised_response_remove_last,
+ revised_response_remove_both,
+ ]
+ instruction_list = inp.instruction_id_list
+ is_following_list = []
+
+ for index, instruction_id in enumerate(instruction_list):
+ instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+ instruction = instruction_cls(instruction_id)
+
+ # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+ kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+ instruction.build_description(**kwargs)
+ args = instruction.get_instruction_args()
+ if args and "prompt" in args:
+ instruction.build_description(prompt=inp.prompt)
+
+ is_following = False
+ for r in all_responses:
+ if r.strip() and instruction.check_following(r):
+ is_following = True
+ break
+
+ is_following_list.append(is_following)
+
+ return OutputExample(
+ instruction_id_list=inp.instruction_id_list,
+ prompt=inp.prompt,
+ response=response,
+ follow_all_instructions=all(is_following_list),
+ follow_instruction_list=is_following_list,
+ )
+
+
+def process_results(doc, results):
+ new_kwargs = []
+ for item in doc["kwargs"]:
+ if item["nth_paragraph"]:
+ item["nth_paragraph"] = int(item["nth_paragraph"])
+ new_kwargs.append(item)
+ inp = InputExample(
+ key=doc["key"],
+ instruction_id_list=doc["instruction_id_list"],
+ prompt=doc["prompt"],
+ kwargs=new_kwargs,
+ )
+ response = results[0]
+
+ out_strict = test_instruction_following_strict(inp, response)
+ out_loose = test_instruction_following_loose(inp, response)
+
+ return {
+ "prompt_level_strict_acc": out_strict.follow_all_instructions,
+ "inst_level_strict_acc": out_strict.follow_instruction_list,
+ "prompt_level_loose_acc": out_loose.follow_all_instructions,
+ "inst_level_loose_acc": out_loose.follow_instruction_list,
+ }
+
+
+def agg_inst_level_acc(items):
+ flat_items = [item for sublist in items for item in sublist]
+ inst_level_acc = sum(flat_items) / len(flat_items)
+ return inst_level_acc
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+ def _get_question(example: dict) -> dict:
+ # get the question from the ifeval dataset
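+        # The raw value is a JSON-like string; map its null/true/false literals to Python and eval it to get the dialog body.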
+ example["input_question"] = (
+ eval(
+ example["input_question"]
+ .replace("null", "None")
+ .replace("true", "True")
+ .replace("false", "False")
+ )["dialog"][0]["body"]
+ .replace("Is it True that the first song", "Is it true that the first song")
+ .replace("Is the following True", "Is the following true")
+ )
+ example["input_final_prompts"] = example["input_final_prompts"][0]
+ return example
+
+ original_dataset_name = "wis-k/instruction-following-eval"
+ ifeval_data = datasets.load_dataset(original_dataset_name, split="train")
+ ifeval_df = ifeval_data.to_pandas()
+ ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
+
+ meta_dataset = dataset.map(_get_question)
+ meta_df = meta_dataset.to_pandas()
+
+ # join the two datasets on the input_question column
+ joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question")
+ joined = joined.rename(columns={"input_final_prompts": "prompt"})
+ joined = joined.rename(columns={"is_correct": "previous_is_correct"})
+ joined = datasets.Dataset.from_pandas(joined)
+ joined = joined.select_columns(
+ [
+ "input_question",
+ "prompt",
+ "previous_is_correct",
+ "instruction_id_list",
+ "kwargs",
+ "output_prediction_text",
+ "key",
+ ]
+ )
+    joined = joined.rename_column(
+        "output_prediction_text", "previous_output_prediction_text"
+    )
+ return joined
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml
new file mode 100644
index 0000000000..1ec3c107d8
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml
@@ -0,0 +1,29 @@
+task: meta_mmlu_pro_instruct
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__mmlu_pro__details
+test_split: latest
+output_type: generate_until
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+ - name: "strict-match"
+ filter:
+ - function: "regex"
+ group_select: -1
+ regex_pattern: 'best answer is ([A-Z])'
+ - function: "take_first"
+generation_kwargs:
+ until: []
+ do_sample: false
+ temperature: 0
+ max_gen_toks: 1024
+num_fewshot: 0
+metric_list:
+ - metric: exact_match
+ aggregation: mean
+ higher_is_better: true
+ ignore_case: true
+ ignore_punctuation: true
+metadata:
+ version: 1.0
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py
new file mode 100644
index 0000000000..6b8bc3e5b2
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import datasets
+
+
+def doc_to_text(doc: dict) -> str:
+ return doc["input_final_prompts"][0]
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+ def _process_doc(doc: dict) -> dict:
+ out_doc = {
+ "problem": doc["input_question"],
+ "gold": doc["input_correct_responses"][0],
+ }
+ return out_doc
+
+ dataset = dataset.select_columns(
+ [
+ "input_question",
+ "input_correct_responses",
+ "input_final_prompts",
+ "is_correct",
+ "input_question_hash",
+ "input_choice_list",
+ "output_prediction_text",
+ ],
+ )
+ dataset = dataset.rename_column("is_correct", "previously_is_correct")
+ dataset = dataset.map(_process_doc)
+ return dataset
diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py
new file mode 100644
index 0000000000..a8a7e735ff
--- /dev/null
+++ b/llama_stack/providers/registry/evals.py
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import List
+
+from llama_stack.distribution.datatypes import * # noqa: F403
+
+
+def available_providers() -> List[ProviderSpec]:
+ return [
+ InlineProviderSpec(
+ api=Api.evals,
+ provider_type="meta-reference",
+ pip_packages=[
+ "matplotlib",
+ "pillow",
+ "pandas",
+ "scikit-learn",
+ "datasets",
+ "numpy",
+ "autoevals",
+ "openpyxl",
+ ],
+ module="llama_stack.providers.impls.meta_reference.evals",
+ config_class="llama_stack.providers.impls.meta_reference.evals.MetaReferenceEvalsImplConfig",
+ api_dependencies=[
+ Api.inference,
+ ],
+ ),
+ InlineProviderSpec(
+ api=Api.evals,
+ provider_type="eleuther",
+ pip_packages=[
+ "lm-eval",
+ ],
+ module="llama_stack.providers.impls.third_party.evals.eleuther",
+ config_class="llama_stack.providers.impls.third_party.evals.eleuther.EleutherEvalsImplConfig",
+ api_dependencies=[
+ Api.inference,
+ ],
+ ),
+ ]
diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py
index 9fffc0f99a..2070649043 100644
--- a/llama_stack/providers/utils/telemetry/tracing.py
+++ b/llama_stack/providers/utils/telemetry/tracing.py
@@ -152,7 +152,7 @@ def severity(levelname: str) -> LogSeverity:
elif levelname == "INFO":
return LogSeverity.INFO
elif levelname == "WARNING":
- return LogSeverity.WARNING
+ return LogSeverity.WARN
elif levelname == "ERROR":
return LogSeverity.ERROR
elif levelname == "CRITICAL":
diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml
index e12f6e8528..31fb726708 100644
--- a/tests/examples/local-run.yaml
+++ b/tests/examples/local-run.yaml
@@ -11,16 +11,26 @@ apis:
- memory_banks
- inference
- safety
+- evals
+- datasets
providers:
- inference:
+ evals:
- provider_id: meta-reference
provider_type: meta-reference
+ config: {}
+ inference:
+ - provider_id: remote::tgi
+ provider_type: remote::tgi
config:
- model: Llama3.1-8B-Instruct
- quantization: null
- torch_seed: null
- max_seq_len: 4096
- max_batch_size: 1
+ url: http://127.0.0.1:5009
+ # - provider_id: meta-reference
+ # provider_type: meta-reference
+ # config:
+ # model: Llama3.1-8B-Instruct
+ # quantization: null
+ # torch_seed: null
+ # max_seq_len: 4096
+ # max_batch_size: 1
safety:
- provider_id: meta-reference
provider_type: meta-reference