diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py index 871c01a80f..994b06e583 100644 --- a/docs/openapi_generator/generate.py +++ b/docs/openapi_generator/generate.py @@ -33,7 +33,7 @@ from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.agents import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 from llama_stack.apis.evals import * # noqa: F403 from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.batch_inference import * # noqa: F403 @@ -61,7 +61,7 @@ class LlamaStack( Telemetry, PostTraining, Memory, - Evaluations, + Evals, Models, Shields, Inspect, diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index a2f92b6e42..7ce99db3a7 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "0.0.1", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-10 15:29:56.831109" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-15 10:20:19.984531" }, "servers": [ { @@ -109,39 +109,6 @@ } } }, - "/evaluate/job/cancel": { - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Evaluations" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CancelEvaluationJobRequest" - } - } - }, - "required": true - } - } - }, "/post_training/job/cancel": { "post": { "responses": { @@ -393,7 +360,14 @@ "post": { "responses": { "200": { - "description": "OK" + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateDatasetResponse" + } + } + } } }, "tags": [ @@ -489,119 +463,6 @@ } }, "/datasets/delete": { - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Datasets" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/DeleteDatasetRequest" - } - } - }, - "required": true - } - } - }, - "/inference/embeddings": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EmbeddingsResponse" - } - } - } - } - }, - "tags": [ - "Inference" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - 
"type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EmbeddingsRequest" - } - } - }, - "required": true - } - } - }, - "/evaluate/question_answering/": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluationJob" - } - } - } - } - }, - "tags": [ - "Evaluations" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateQuestionAnsweringRequest" - } - } - }, - "required": true - } - } - }, - "/evaluate/summarization/": { "post": { "responses": { "200": { @@ -609,14 +470,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJob" + "$ref": "#/components/schemas/DeleteDatasetResponse" } } } } }, "tags": [ - "Evaluations" + "Datasets" ], "parameters": [ { @@ -633,7 +494,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluateSummarizationRequest" + "$ref": "#/components/schemas/DeleteDatasetRequest" } } }, @@ -641,7 +502,7 @@ } } }, - "/evaluate/text_generation/": { + "/inference/embeddings": { "post": { "responses": { "200": { @@ -649,14 +510,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJob" + "$ref": "#/components/schemas/EmbeddingsResponse" } } } } }, "tags": [ - "Evaluations" + "Inference" ], "parameters": [ { @@ -673,7 +534,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluateTextGenerationRequest" + "$ref": "#/components/schemas/EmbeddingsRequest" } } }, @@ -845,7 +706,21 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/TrainEvalDataset" + "oneOf": [ + { + "oneOf": [ + { + "$ref": "#/components/schemas/HuggingfaceDatasetDef" + }, + { + "$ref": "#/components/schemas/CustomDatasetDef" + } + ] + }, + { + "type": "null" + } + ] } } } @@ -856,7 +731,7 @@ ], "parameters": [ { - "name": "dataset_uuid", + "name": "dataset_identifier", "in": "query", "required": true, "schema": { @@ -875,7 +750,7 @@ ] } }, - "/evaluate/job/artifacts": { + "/memory_banks/get": { "get": { "responses": { "200": { @@ -883,18 +758,38 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJobArtifactsResponse" + "oneOf": [ + { + "oneOf": [ + { + "$ref": "#/components/schemas/VectorMemoryBankDef" + }, + { + "$ref": "#/components/schemas/KeyValueMemoryBankDef" + }, + { + "$ref": "#/components/schemas/KeywordMemoryBankDef" + }, + { + "$ref": "#/components/schemas/GraphMemoryBankDef" + } + ] + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "Evaluations" + "MemoryBanks" ], "parameters": [ { - "name": "job_uuid", + "name": "identifier", "in": "query", "required": true, "schema": { @@ -913,7 +808,7 @@ ] } }, - "/evaluate/job/logs": { + "/models/get": { "get": { "responses": { "200": { @@ -921,18 +816,25 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJobLogStream" + "oneOf": [ + { + "$ref": "#/components/schemas/ModelDefWithProvider" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "Evaluations" + "Models" ], "parameters": [ { - "name": "job_uuid", 
+ "name": "identifier", "in": "query", "required": true, "schema": { @@ -951,7 +853,7 @@ ] } }, - "/evaluate/job/status": { + "/shields/get": { "get": { "responses": { "200": { @@ -959,18 +861,25 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJobStatusResponse" + "oneOf": [ + { + "$ref": "#/components/schemas/ShieldDefWithProvider" + }, + { + "type": "null" + } + ] } } } } }, "tags": [ - "Evaluations" + "Shields" ], "parameters": [ { - "name": "job_uuid", + "name": "shield_type", "in": "query", "required": true, "schema": { @@ -989,24 +898,32 @@ ] } }, - "/evaluate/jobs": { + "/telemetry/get_trace": { "get": { "responses": { "200": { "description": "OK", "content": { - "application/jsonl": { + "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJob" + "$ref": "#/components/schemas/Trace" } } } } }, "tags": [ - "Evaluations" + "Telemetry" ], "parameters": [ + { + "name": "trace_id", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + }, { "name": "X-LlamaStack-ProviderData", "in": "header", @@ -1019,7 +936,7 @@ ] } }, - "/memory_banks/get": { + "/post_training/job/artifacts": { "get": { "responses": { "200": { @@ -1027,204 +944,18 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "oneOf": [ - { - "$ref": "#/components/schemas/VectorMemoryBankDef" - }, - { - "$ref": "#/components/schemas/KeyValueMemoryBankDef" - }, - { - "$ref": "#/components/schemas/KeywordMemoryBankDef" - }, - { - "$ref": "#/components/schemas/GraphMemoryBankDef" - } - ] - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse" } } } } }, "tags": [ - "MemoryBanks" + "PostTraining" ], "parameters": [ { - "name": "identifier", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/models/get": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/ModelDefWithProvider" - }, - { - "type": "null" - } - ] - } - } - } - } - }, - "tags": [ - "Models" - ], - "parameters": [ - { - "name": "identifier", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/shields/get": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/ShieldDefWithProvider" - }, - { - "type": "null" - } - ] - } - } - } - } - }, - "tags": [ - "Shields" - ], - "parameters": [ - { - "name": "shield_type", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/telemetry/get_trace": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - 
"schema": { - "$ref": "#/components/schemas/Trace" - } - } - } - } - }, - "tags": [ - "Telemetry" - ], - "parameters": [ - { - "name": "trace_id", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/post_training/job/artifacts": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse" - } - } - } - } - }, - "tags": [ - "PostTraining" - ], - "parameters": [ - { - "name": "job_uuid", + "name": "job_uuid", "in": "query", "required": true, "schema": { @@ -1412,6 +1143,43 @@ } } }, + "/datasets/list": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/jsonl": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/HuggingfaceDatasetDef" + }, + { + "$ref": "#/components/schemas/CustomDatasetDef" + } + ] + } + } + } + } + }, + "tags": [ + "Datasets" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ] + } + }, "/memory_banks/list": { "get": { "responses": { @@ -1836,7 +1604,7 @@ } } }, - "/safety/run_shield": { + "/evals/run_eval_task": { "post": { "responses": { "200": { @@ -1844,14 +1612,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RunShieldResponse" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "Safety" + "Evals" ], "parameters": [ { @@ -1868,7 +1636,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RunShieldRequest" + "$ref": "#/components/schemas/RunEvalTaskRequest" } } }, @@ -1876,7 +1644,7 @@ } } }, - "/post_training/supervised_fine_tune": { + "/evals/run_scorer": { "post": { "responses": { "200": { @@ -1884,14 +1652,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/PostTrainingJob" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "PostTraining" + "Evals" ], "parameters": [ { @@ -1908,7 +1676,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SupervisedFineTuneRequest" + "$ref": "#/components/schemas/RunScorerRequest" } } }, @@ -1916,7 +1684,7 @@ } } }, - "/synthetic_data_generation/generate": { + "/safety/run_shield": { "post": { "responses": { "200": { @@ -1924,14 +1692,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SyntheticDataGenerationResponse" + "$ref": "#/components/schemas/RunShieldResponse" } } } } }, "tags": [ - "SyntheticDataGeneration" + "Safety" ], "parameters": [ { @@ -1948,54 +1716,134 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SyntheticDataGenerateRequest" + "$ref": "#/components/schemas/RunShieldRequest" } } }, "required": true } } - } - }, - "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", - "components": { - "schemas": { - "BuiltinTool": { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ] - }, - "CompletionMessage": { - "type": "object", - "properties": { - "role": { - "type": 
"string", - "const": "assistant", - "default": "assistant" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/components/schemas/ImageMedia" - }, - { - "type": "array", - "items": { - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/components/schemas/ImageMedia" - } - ] + }, + "/post_training/supervised_fine_tune": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/PostTrainingJob" + } + } + } + } + }, + "tags": [ + "PostTraining" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupervisedFineTuneRequest" + } + } + }, + "required": true + } + } + }, + "/synthetic_data_generation/generate": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SyntheticDataGenerationResponse" + } + } + } + } + }, + "tags": [ + "SyntheticDataGeneration" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SyntheticDataGenerateRequest" + } + } + }, + "required": true + } + } + } + }, + "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", + "components": { + "schemas": { + "BuiltinTool": { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ] + }, + "CompletionMessage": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "assistant", + "default": "assistant" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/ImageMedia" + }, + { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/ImageMedia" + } + ] } } ] @@ -2571,18 +2419,6 @@ "completion_message_batch" ] }, - "CancelEvaluationJobRequest": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, "CancelTrainingJobRequest": { "type": "object", "properties": { @@ -4090,19 +3926,58 @@ "error" ] }, - "TrainEvalDataset": { + "CustomDatasetDef": { "type": "object", "properties": { - "columns": { + "type": { + "type": "string", + "const": "custom", + "default": "custom" + }, + "identifier": { + "type": "string" + }, + "url": { + "type": "string" + }, + "rename_columns_map": { "type": "object", "additionalProperties": { - "$ref": "#/components/schemas/TrainEvalDatasetColumnType" + "type": "string" } + } + }, + "additionalProperties": false, + "required": [ + "type", + "identifier", + "url" + ] + }, + "HuggingfaceDatasetDef": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "huggingface", + "default": "huggingface" }, - "content_url": { - "$ref": "#/components/schemas/URL" + "identifier": { + "type": "string" }, - "metadata": { + "dataset_path": { + "type": "string" + }, + "dataset_name": { + 
"type": "string" + }, + "rename_columns_map": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "kwargs": { "type": "object", "additionalProperties": { "oneOf": [ @@ -4130,35 +4005,48 @@ }, "additionalProperties": false, "required": [ - "columns", - "content_url" - ], - "title": "Dataset to be used for training or evaluating language models." - }, - "TrainEvalDatasetColumnType": { - "type": "string", - "enum": [ - "dialog", - "text", - "media", - "number", - "json" + "type", + "identifier", + "dataset_path", + "kwargs" ] }, "CreateDatasetRequest": { "type": "object", "properties": { - "uuid": { - "type": "string" + "dataset_def": { + "oneOf": [ + { + "$ref": "#/components/schemas/HuggingfaceDatasetDef" + }, + { + "$ref": "#/components/schemas/CustomDatasetDef" + } + ] + } + }, + "additionalProperties": false, + "required": [ + "dataset_def" + ] + }, + "CreateDatasetResponse": { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "success", + "fail" + ] }, - "dataset": { - "$ref": "#/components/schemas/TrainEvalDataset" + "msg": { + "type": "string" } }, "additionalProperties": false, "required": [ - "uuid", - "dataset" + "status" ] }, "DeleteAgentsRequest": { @@ -4192,13 +4080,32 @@ "DeleteDatasetRequest": { "type": "object", "properties": { - "dataset_uuid": { + "dataset_identifier": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "dataset_identifier" + ] + }, + "DeleteDatasetResponse": { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "success", + "fail" + ] + }, + "msg": { "type": "string" } }, "additionalProperties": false, "required": [ - "dataset_uuid" + "status" ] }, "EmbeddingsRequest": { @@ -4258,112 +4165,42 @@ "embeddings" ] }, - "EvaluateQuestionAnsweringRequest": { + "GetAgentsSessionRequest": { "type": "object", "properties": { - "metrics": { + "turn_ids": { "type": "array", "items": { - "type": "string", - "enum": [ - "em", - "f1" - ] + "type": "string" } } }, - "additionalProperties": false, - "required": [ - "metrics" - ] + "additionalProperties": false }, - "EvaluationJob": { + "GraphMemoryBankDef": { "type": "object", "properties": { - "job_uuid": { + "identifier": { "type": "string" + }, + "provider_id": { + "type": "string", + "default": "" + }, + "type": { + "type": "string", + "const": "graph", + "default": "graph" } }, "additionalProperties": false, "required": [ - "job_uuid" + "identifier", + "provider_id", + "type" ] }, - "EvaluateSummarizationRequest": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "rouge", - "bleu" - ] - } - } - }, - "additionalProperties": false, - "required": [ - "metrics" - ] - }, - "EvaluateTextGenerationRequest": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "perplexity", - "rouge", - "bleu" - ] - } - } - }, - "additionalProperties": false, - "required": [ - "metrics" - ] - }, - "GetAgentsSessionRequest": { - "type": "object", - "properties": { - "turn_ids": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false - }, - "GraphMemoryBankDef": { - "type": "object", - "properties": { - "identifier": { - "type": "string" - }, - "provider_id": { - "type": "string", - "default": "" - }, - "type": { - "type": "string", - "const": "graph", - "default": "graph" - } - }, - "additionalProperties": false, - "required": [ - 
"identifier", - "provider_id", - "type" - ] - }, - "KeyValueMemoryBankDef": { + "KeyValueMemoryBankDef": { "type": "object", "properties": { "identifier": { @@ -4513,43 +4350,6 @@ "step" ] }, - "EvaluationJobArtifactsResponse": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ], - "title": "Artifacts of a evaluation job." - }, - "EvaluationJobLogStream": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, - "EvaluationJobStatusResponse": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, "ModelDefWithProvider": { "type": "object", "properties": { @@ -5265,6 +5065,61 @@ "dpo" ] }, + "TrainEvalDataset": { + "type": "object", + "properties": { + "columns": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/TrainEvalDatasetColumnType" + } + }, + "content_url": { + "$ref": "#/components/schemas/URL" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "columns", + "content_url" + ], + "title": "Dataset to be used for training or evaluating language models." + }, + "TrainEvalDatasetColumnType": { + "type": "string", + "enum": [ + "dialog", + "text", + "media", + "number", + "json" + ] + }, "TrainingConfig": { "type": "object", "properties": { @@ -5491,222 +5346,520 @@ "document_id": { "type": "string" } - }, - "additionalProperties": false, - "required": [ - "content", - "token_count", - "document_id" + }, + "additionalProperties": false, + "required": [ + "content", + "token_count", + "document_id" + ] + } + }, + "scores": { + "type": "array", + "items": { + "type": "number" + } + } + }, + "additionalProperties": false, + "required": [ + "chunks", + "scores" + ] + }, + "RegisterMemoryBankRequest": { + "type": "object", + "properties": { + "memory_bank": { + "oneOf": [ + { + "$ref": "#/components/schemas/VectorMemoryBankDef" + }, + { + "$ref": "#/components/schemas/KeyValueMemoryBankDef" + }, + { + "$ref": "#/components/schemas/KeywordMemoryBankDef" + }, + { + "$ref": "#/components/schemas/GraphMemoryBankDef" + } + ] + } + }, + "additionalProperties": false, + "required": [ + "memory_bank" + ] + }, + "RegisterModelRequest": { + "type": "object", + "properties": { + "model": { + "$ref": "#/components/schemas/ModelDefWithProvider" + } + }, + "additionalProperties": false, + "required": [ + "model" + ] + }, + "RegisterShieldRequest": { + "type": "object", + "properties": { + "shield": { + "$ref": "#/components/schemas/ShieldDefWithProvider" + } + }, + "additionalProperties": false, + "required": [ + "shield" + ] + }, + "DialogGenerations": { + "type": "object", + "properties": { + "dialog": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ] + } + }, + "sampled_generations": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + 
"$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "dialog", + "sampled_generations" + ] + }, + "RewardScoreRequest": { + "type": "object", + "properties": { + "dialog_generations": { + "type": "array", + "items": { + "$ref": "#/components/schemas/DialogGenerations" + } + }, + "model": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "dialog_generations", + "model" + ] + }, + "RewardScoringResponse": { + "type": "object", + "properties": { + "scored_generations": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ScoredDialogGenerations" + } + } + }, + "additionalProperties": false, + "required": [ + "scored_generations" + ], + "title": "Response from the reward scoring. Batch of (prompt, response, score) tuples that pass the threshold." + }, + "ScoredDialogGenerations": { + "type": "object", + "properties": { + "dialog": { + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ] + } + }, + "scored_generations": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ScoredMessage" + } + } + }, + "additionalProperties": false, + "required": [ + "dialog", + "scored_generations" + ] + }, + "ScoredMessage": { + "type": "object", + "properties": { + "message": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ] + }, + "score": { + "type": "number" + } + }, + "additionalProperties": false, + "required": [ + "message", + "score" + ] + }, + "EvaluateDatasetConfig": { + "type": "object", + "properties": { + "dataset_identifier": { + "type": "string" + }, + "row_limit": { + "type": "integer" + }, + "kwargs": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "dataset_identifier" + ] + }, + "EvaluateJudgeScoringConfig": { + "type": "object" + }, + "EvaluateModelGenerationConfig": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "kwargs": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "model", + "sampling_params" + ] + }, + "EvaluatePostprocessConfig": { + "type": "object", + "properties": { + "kwargs": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false + }, + "EvaluatePreprocessConfig": { + "type": "object", + "properties": { + 
"kwargs": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } ] } + } + }, + "additionalProperties": false + }, + "EvaluateProcessorConfig": { + "type": "object", + "properties": { + "processor_identifier": { + "type": "string" }, - "scores": { - "type": "array", - "items": { - "type": "number" - } + "preprocess_config": { + "$ref": "#/components/schemas/EvaluatePreprocessConfig" + }, + "postprocess_config": { + "$ref": "#/components/schemas/EvaluatePostprocessConfig" } }, "additionalProperties": false, "required": [ - "chunks", - "scores" + "processor_identifier" ] }, - "RegisterMemoryBankRequest": { + "EvaluateScoringConfig": { "type": "object", "properties": { - "memory_bank": { - "oneOf": [ - { - "$ref": "#/components/schemas/VectorMemoryBankDef" - }, - { - "$ref": "#/components/schemas/KeyValueMemoryBankDef" - }, - { - "$ref": "#/components/schemas/KeywordMemoryBankDef" - }, - { - "$ref": "#/components/schemas/GraphMemoryBankDef" - } - ] + "scorer_config_list": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EvaluateSingleScorerConfig" + } } }, "additionalProperties": false, "required": [ - "memory_bank" + "scorer_config_list" ] }, - "RegisterModelRequest": { + "EvaluateSingleScorerConfig": { "type": "object", "properties": { - "model": { - "$ref": "#/components/schemas/ModelDefWithProvider" + "scorer_name": { + "type": "string" + }, + "llm_judge_config": { + "$ref": "#/components/schemas/LLMJudgeConfig" } }, "additionalProperties": false, "required": [ - "model" + "scorer_name" ] }, - "RegisterShieldRequest": { + "EvaluateTaskConfig": { "type": "object", "properties": { - "shield": { - "$ref": "#/components/schemas/ShieldDefWithProvider" + "dataset_config": { + "$ref": "#/components/schemas/EvaluateDatasetConfig" + }, + "processor_config": { + "$ref": "#/components/schemas/EvaluateProcessorConfig" + }, + "generation_config": { + "$ref": "#/components/schemas/EvaluateModelGenerationConfig" + }, + "scoring_config": { + "$ref": "#/components/schemas/EvaluateScoringConfig" } }, "additionalProperties": false, "required": [ - "shield" + "dataset_config", + "processor_config", + "generation_config", + "scoring_config" ] }, - "DialogGenerations": { + "LLMJudgeConfig": { "type": "object", "properties": { - "dialog": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - } + "judge_processor_config": { + "$ref": "#/components/schemas/EvaluateProcessorConfig" }, - "sampled_generations": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - } + "judge_model_generation_config": { + "$ref": "#/components/schemas/EvaluateModelGenerationConfig" + }, + "judge_scoring_config": { + "$ref": "#/components/schemas/EvaluateJudgeScoringConfig" } }, "additionalProperties": false, "required": [ - "dialog", - "sampled_generations" + "judge_processor_config", + "judge_model_generation_config", + "judge_scoring_config" ] }, - "RewardScoreRequest": { + 
"RunEvalTaskRequest": { "type": "object", "properties": { - "dialog_generations": { - "type": "array", - "items": { - "$ref": "#/components/schemas/DialogGenerations" - } - }, - "model": { - "type": "string" + "eval_task_config": { + "$ref": "#/components/schemas/EvaluateTaskConfig" } }, "additionalProperties": false, "required": [ - "dialog_generations", - "model" + "eval_task_config" ] }, - "RewardScoringResponse": { + "EvalResult": { "type": "object", "properties": { - "scored_generations": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ScoredDialogGenerations" + "metrics": { + "type": "object", + "additionalProperties": { + "type": "number" } } }, "additionalProperties": false, "required": [ - "scored_generations" + "metrics" ], - "title": "Response from the reward scoring. Batch of (prompt, response, score) tuples that pass the threshold." + "title": "Aggregated final evaluation result." }, - "ScoredDialogGenerations": { + "EvaluateResponse": { "type": "object", "properties": { - "dialog": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - } + "eval_result": { + "$ref": "#/components/schemas/EvalResult" }, - "scored_generations": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ScoredMessage" - } + "formatted_report": { + "type": "string" } }, "additionalProperties": false, "required": [ - "dialog", - "scored_generations" - ] + "eval_result" + ], + "title": "Scores for evaluation." }, - "ScoredMessage": { + "RunScorerRequest": { "type": "object", "properties": { - "message": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] + "dataset_config": { + "$ref": "#/components/schemas/EvaluateDatasetConfig" }, - "score": { - "type": "number" + "eval_scoring_config": { + "$ref": "#/components/schemas/EvaluateScoringConfig" } }, "additionalProperties": false, "required": [ - "message", - "score" + "dataset_config", + "eval_scoring_config" ] }, "RunShieldRequest": { @@ -6075,49 +6228,49 @@ ], "tags": [ { - "name": "Evaluations" + "name": "Agents" }, { - "name": "Inspect" + "name": "Telemetry" }, { - "name": "RewardScoring" + "name": "Safety" }, { - "name": "Datasets" + "name": "MemoryBanks" }, { - "name": "Models" + "name": "Datasets" }, { - "name": "Telemetry" + "name": "Shields" }, { - "name": "PostTraining" + "name": "RewardScoring" }, { - "name": "SyntheticDataGeneration" + "name": "PostTraining" }, { - "name": "BatchInference" + "name": "Models" }, { - "name": "Inference" + "name": "Inspect" }, { - "name": "Agents" + "name": "Evals" }, { - "name": "Memory" + "name": "BatchInference" }, { - "name": "Safety" + "name": "Inference" }, { - "name": "Shields" + "name": "Memory" }, { - "name": "MemoryBanks" + "name": "SyntheticDataGeneration" }, { "name": "BuiltinTool", @@ -6195,10 +6348,6 @@ "name": "BatchCompletionResponse", "description": "" }, - { - "name": "CancelEvaluationJobRequest", - "description": "" - }, { "name": "CancelTrainingJobRequest", "description": "" @@ -6368,17 +6517,21 @@ "description": "" }, { - "name": "TrainEvalDataset", - "description": "Dataset to be used for training or evaluating language models.\n\n" + 
"name": "CustomDatasetDef", + "description": "" }, { - "name": "TrainEvalDatasetColumnType", - "description": "" + "name": "HuggingfaceDatasetDef", + "description": "" }, { "name": "CreateDatasetRequest", "description": "" }, + { + "name": "CreateDatasetResponse", + "description": "" + }, { "name": "DeleteAgentsRequest", "description": "" @@ -6391,6 +6544,10 @@ "name": "DeleteDatasetRequest", "description": "" }, + { + "name": "DeleteDatasetResponse", + "description": "" + }, { "name": "EmbeddingsRequest", "description": "" @@ -6399,22 +6556,6 @@ "name": "EmbeddingsResponse", "description": "" }, - { - "name": "EvaluateQuestionAnsweringRequest", - "description": "" - }, - { - "name": "EvaluationJob", - "description": "" - }, - { - "name": "EvaluateSummarizationRequest", - "description": "" - }, - { - "name": "EvaluateTextGenerationRequest", - "description": "" - }, { "name": "GetAgentsSessionRequest", "description": "" @@ -6443,18 +6584,6 @@ "name": "AgentStepResponse", "description": "" }, - { - "name": "EvaluationJobArtifactsResponse", - "description": "Artifacts of a evaluation job.\n\n" - }, - { - "name": "EvaluationJobLogStream", - "description": "" - }, - { - "name": "EvaluationJobStatusResponse", - "description": "" - }, { "name": "ModelDefWithProvider", "description": "" @@ -6555,6 +6684,14 @@ "name": "RLHFAlgorithm", "description": "" }, + { + "name": "TrainEvalDataset", + "description": "Dataset to be used for training or evaluating language models.\n\n" + }, + { + "name": "TrainEvalDatasetColumnType", + "description": "" + }, { "name": "TrainingConfig", "description": "" @@ -6603,6 +6740,62 @@ "name": "ScoredMessage", "description": "" }, + { + "name": "EvaluateDatasetConfig", + "description": "" + }, + { + "name": "EvaluateJudgeScoringConfig", + "description": "" + }, + { + "name": "EvaluateModelGenerationConfig", + "description": "" + }, + { + "name": "EvaluatePostprocessConfig", + "description": "" + }, + { + "name": "EvaluatePreprocessConfig", + "description": "" + }, + { + "name": "EvaluateProcessorConfig", + "description": "" + }, + { + "name": "EvaluateScoringConfig", + "description": "" + }, + { + "name": "EvaluateSingleScorerConfig", + "description": "" + }, + { + "name": "EvaluateTaskConfig", + "description": "" + }, + { + "name": "LLMJudgeConfig", + "description": "" + }, + { + "name": "RunEvalTaskRequest", + "description": "" + }, + { + "name": "EvalResult", + "description": "Aggregated final evaluation result.\n\n" + }, + { + "name": "EvaluateResponse", + "description": "Scores for evaluation.\n\n" + }, + { + "name": "RunScorerRequest", + "description": "" + }, { "name": "RunShieldRequest", "description": "" @@ -6647,7 +6840,7 @@ "Agents", "BatchInference", "Datasets", - "Evaluations", + "Evals", "Inference", "Inspect", "Memory", @@ -6681,7 +6874,6 @@ "BatchCompletionRequest", "BatchCompletionResponse", "BuiltinTool", - "CancelEvaluationJobRequest", "CancelTrainingJobRequest", "ChatCompletionRequest", "ChatCompletionResponse", @@ -6698,31 +6890,40 @@ "CreateAgentSessionRequest", "CreateAgentTurnRequest", "CreateDatasetRequest", + "CreateDatasetResponse", + "CustomDatasetDef", "DPOAlignmentConfig", "DeleteAgentsRequest", "DeleteAgentsSessionRequest", "DeleteDatasetRequest", + "DeleteDatasetResponse", "DialogGenerations", "DoraFinetuningConfig", "EmbeddingsRequest", "EmbeddingsResponse", - "EvaluateQuestionAnsweringRequest", - "EvaluateSummarizationRequest", - "EvaluateTextGenerationRequest", - "EvaluationJob", - "EvaluationJobArtifactsResponse", - 
"EvaluationJobLogStream", - "EvaluationJobStatusResponse", + "EvalResult", + "EvaluateDatasetConfig", + "EvaluateJudgeScoringConfig", + "EvaluateModelGenerationConfig", + "EvaluatePostprocessConfig", + "EvaluatePreprocessConfig", + "EvaluateProcessorConfig", + "EvaluateResponse", + "EvaluateScoringConfig", + "EvaluateSingleScorerConfig", + "EvaluateTaskConfig", "FinetuningAlgorithm", "FunctionCallToolDefinition", "GetAgentsSessionRequest", "GraphMemoryBankDef", "HealthInfo", + "HuggingfaceDatasetDef", "ImageMedia", "InferenceStep", "InsertDocumentsRequest", "KeyValueMemoryBankDef", "KeywordMemoryBankDef", + "LLMJudgeConfig", "LogEventRequest", "LogSeverity", "LoraFinetuningConfig", @@ -6752,6 +6953,8 @@ "RewardScoreRequest", "RewardScoringResponse", "RouteInfo", + "RunEvalTaskRequest", + "RunScorerRequest", "RunShieldRequest", "RunShieldResponse", "SafetyViolation", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index c9822d6ca9..c116742243 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -315,14 +315,6 @@ components: - photogen - code_interpreter type: string - CancelEvaluationJobRequest: - additionalProperties: false - properties: - job_uuid: - type: string - required: - - job_uuid - type: object CancelTrainingJobRequest: additionalProperties: false properties: @@ -572,13 +564,45 @@ components: CreateDatasetRequest: additionalProperties: false properties: - dataset: - $ref: '#/components/schemas/TrainEvalDataset' - uuid: + dataset_def: + oneOf: + - $ref: '#/components/schemas/HuggingfaceDatasetDef' + - $ref: '#/components/schemas/CustomDatasetDef' + required: + - dataset_def + type: object + CreateDatasetResponse: + additionalProperties: false + properties: + msg: + type: string + status: + enum: + - success + - fail type: string required: - - uuid - - dataset + - status + type: object + CustomDatasetDef: + additionalProperties: false + properties: + identifier: + type: string + rename_columns_map: + additionalProperties: + type: string + type: object + type: + const: custom + default: custom + type: string + url: + type: string + required: + - type + - identifier + - url type: object DPOAlignmentConfig: additionalProperties: false @@ -619,10 +643,23 @@ components: DeleteDatasetRequest: additionalProperties: false properties: - dataset_uuid: + dataset_identifier: type: string required: - - dataset_uuid + - dataset_identifier + type: object + DeleteDatasetResponse: + additionalProperties: false + properties: + msg: + type: string + status: + enum: + - success + - fail + type: string + required: + - status type: object DialogGenerations: additionalProperties: false @@ -701,78 +738,147 @@ components: required: - embeddings type: object - EvaluateQuestionAnsweringRequest: + EvalResult: additionalProperties: false properties: metrics: - items: - enum: - - em - - f1 - type: string - type: array + additionalProperties: + type: number + type: object required: - metrics + title: Aggregated final evaluation result. 
type: object - EvaluateSummarizationRequest: + EvaluateDatasetConfig: additionalProperties: false properties: - metrics: - items: - enum: - - rouge - - bleu - type: string - type: array + dataset_identifier: + type: string + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + row_limit: + type: integer required: - - metrics + - dataset_identifier type: object - EvaluateTextGenerationRequest: + EvaluateJudgeScoringConfig: + type: object + EvaluateModelGenerationConfig: additionalProperties: false properties: - metrics: - items: - enum: - - perplexity - - rouge - - bleu - type: string - type: array + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + model: + type: string + sampling_params: + $ref: '#/components/schemas/SamplingParams' required: - - metrics + - model + - sampling_params type: object - EvaluationJob: + EvaluatePostprocessConfig: additionalProperties: false properties: - job_uuid: + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: object + EvaluatePreprocessConfig: + additionalProperties: false + properties: + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: object + EvaluateProcessorConfig: + additionalProperties: false + properties: + postprocess_config: + $ref: '#/components/schemas/EvaluatePostprocessConfig' + preprocess_config: + $ref: '#/components/schemas/EvaluatePreprocessConfig' + processor_identifier: type: string required: - - job_uuid + - processor_identifier type: object - EvaluationJobArtifactsResponse: + EvaluateResponse: additionalProperties: false properties: - job_uuid: + eval_result: + $ref: '#/components/schemas/EvalResult' + formatted_report: type: string required: - - job_uuid - title: Artifacts of a evaluation job. + - eval_result + title: Scores for evaluation. 
type: object - EvaluationJobLogStream: + EvaluateScoringConfig: additionalProperties: false properties: - job_uuid: - type: string + scorer_config_list: + items: + $ref: '#/components/schemas/EvaluateSingleScorerConfig' + type: array required: - - job_uuid + - scorer_config_list type: object - EvaluationJobStatusResponse: + EvaluateSingleScorerConfig: additionalProperties: false properties: - job_uuid: + llm_judge_config: + $ref: '#/components/schemas/LLMJudgeConfig' + scorer_name: type: string required: - - job_uuid + - scorer_name + type: object + EvaluateTaskConfig: + additionalProperties: false + properties: + dataset_config: + $ref: '#/components/schemas/EvaluateDatasetConfig' + generation_config: + $ref: '#/components/schemas/EvaluateModelGenerationConfig' + processor_config: + $ref: '#/components/schemas/EvaluateProcessorConfig' + scoring_config: + $ref: '#/components/schemas/EvaluateScoringConfig' + required: + - dataset_config + - processor_config + - generation_config + - scoring_config type: object FinetuningAlgorithm: enum: @@ -845,6 +951,39 @@ components: required: - status type: object + HuggingfaceDatasetDef: + additionalProperties: false + properties: + dataset_name: + type: string + dataset_path: + type: string + identifier: + type: string + kwargs: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + rename_columns_map: + additionalProperties: + type: string + type: object + type: + const: huggingface + default: huggingface + type: string + required: + - type + - identifier + - dataset_path + - kwargs + type: object ImageMedia: additionalProperties: false properties: @@ -936,6 +1075,20 @@ components: - provider_id - type type: object + LLMJudgeConfig: + additionalProperties: false + properties: + judge_model_generation_config: + $ref: '#/components/schemas/EvaluateModelGenerationConfig' + judge_processor_config: + $ref: '#/components/schemas/EvaluateProcessorConfig' + judge_scoring_config: + $ref: '#/components/schemas/EvaluateJudgeScoringConfig' + required: + - judge_processor_config + - judge_model_generation_config + - judge_scoring_config + type: object LogEventRequest: additionalProperties: false properties: @@ -1629,6 +1782,25 @@ components: - method - provider_types type: object + RunEvalTaskRequest: + additionalProperties: false + properties: + eval_task_config: + $ref: '#/components/schemas/EvaluateTaskConfig' + required: + - eval_task_config + type: object + RunScorerRequest: + additionalProperties: false + properties: + dataset_config: + $ref: '#/components/schemas/EvaluateDatasetConfig' + eval_scoring_config: + $ref: '#/components/schemas/EvaluateScoringConfig' + required: + - dataset_config + - eval_scoring_config + type: object RunShieldRequest: additionalProperties: false properties: @@ -2507,7 +2679,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. 
The specification is still in\ - \ draft and subject to change.\n Generated at 2024-10-10 15:29:56.831109" + \ draft and subject to change.\n Generated at 2024-10-15 10:20:19.984531" title: '[DRAFT] Llama Stack Specification' version: 0.0.1 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema @@ -2794,81 +2966,16 @@ paths: schema: $ref: '#/components/schemas/CreateDatasetRequest' required: true - responses: - '200': - description: OK - tags: - - Datasets - /datasets/delete: - post: - parameters: - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/DeleteDatasetRequest' - required: true - responses: - '200': - description: OK - tags: - - Datasets - /datasets/get: - get: - parameters: - - in: query - name: dataset_uuid - required: true - schema: - type: string - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string responses: '200': content: application/json: schema: - $ref: '#/components/schemas/TrainEvalDataset' + $ref: '#/components/schemas/CreateDatasetResponse' description: OK tags: - Datasets - /evaluate/job/artifacts: - get: - parameters: - - in: query - name: job_uuid - required: true - schema: - type: string - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - responses: - '200': - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluationJobArtifactsResponse' - description: OK - tags: - - Evaluations - /evaluate/job/cancel: + /datasets/delete: post: parameters: - description: JSON-encoded provider data which will be made available to the @@ -2882,42 +2989,22 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/CancelEvaluationJobRequest' - required: true - responses: - '200': - description: OK - tags: - - Evaluations - /evaluate/job/logs: - get: - parameters: - - in: query - name: job_uuid + $ref: '#/components/schemas/DeleteDatasetRequest' required: true - schema: - type: string - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string responses: '200': content: application/json: schema: - $ref: '#/components/schemas/EvaluationJobLogStream' + $ref: '#/components/schemas/DeleteDatasetResponse' description: OK tags: - - Evaluations - /evaluate/job/status: + - Datasets + /datasets/get: get: parameters: - in: query - name: job_uuid + name: dataset_identifier required: true schema: type: string @@ -2933,11 +3020,15 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluationJobStatusResponse' + oneOf: + - oneOf: + - $ref: '#/components/schemas/HuggingfaceDatasetDef' + - $ref: '#/components/schemas/CustomDatasetDef' + - type: 'null' description: OK tags: - - Evaluations - /evaluate/jobs: + - Datasets + /datasets/list: get: parameters: - description: JSON-encoded provider data which will be made available to the @@ -2952,36 +3043,13 @@ paths: content: application/jsonl: schema: - $ref: '#/components/schemas/EvaluationJob' - description: OK - tags: - - Evaluations 
- /evaluate/question_answering/: - post: - parameters: - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateQuestionAnsweringRequest' - required: true - responses: - '200': - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluationJob' + oneOf: + - $ref: '#/components/schemas/HuggingfaceDatasetDef' + - $ref: '#/components/schemas/CustomDatasetDef' description: OK tags: - - Evaluations - /evaluate/summarization/: + - Datasets + /evals/run_eval_task: post: parameters: - description: JSON-encoded provider data which will be made available to the @@ -2995,18 +3063,18 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluateSummarizationRequest' + $ref: '#/components/schemas/RunEvalTaskRequest' required: true responses: '200': content: application/json: schema: - $ref: '#/components/schemas/EvaluationJob' + $ref: '#/components/schemas/EvaluateResponse' description: OK tags: - - Evaluations - /evaluate/text_generation/: + - Evals + /evals/run_scorer: post: parameters: - description: JSON-encoded provider data which will be made available to the @@ -3020,17 +3088,17 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluateTextGenerationRequest' + $ref: '#/components/schemas/RunScorerRequest' required: true responses: '200': content: application/json: schema: - $ref: '#/components/schemas/EvaluationJob' + $ref: '#/components/schemas/EvaluateResponse' description: OK tags: - - Evaluations + - Evals /health: get: parameters: @@ -3712,21 +3780,21 @@ security: servers: - url: http://any-hosted-llama-stack.com tags: -- name: Evaluations -- name: Inspect -- name: RewardScoring -- name: Datasets -- name: Models +- name: Agents - name: Telemetry +- name: Safety +- name: MemoryBanks +- name: Datasets +- name: Shields +- name: RewardScoring - name: PostTraining -- name: SyntheticDataGeneration +- name: Models +- name: Inspect +- name: Evals - name: BatchInference - name: Inference -- name: Agents - name: Memory -- name: Safety -- name: Shields -- name: MemoryBanks +- name: SyntheticDataGeneration - description: name: BuiltinTool - description: name: BatchCompletionResponse -- description: - name: CancelEvaluationJobRequest - description: name: CancelTrainingJobRequest @@ -3919,17 +3984,18 @@ tags: name: Turn - description: name: ViolationLevel -- description: 'Dataset to be used for training or evaluating language models. - - - ' - name: TrainEvalDataset -- description: - name: TrainEvalDatasetColumnType + name: CustomDatasetDef +- description: + name: HuggingfaceDatasetDef - description: name: CreateDatasetRequest +- description: + name: CreateDatasetResponse - description: name: DeleteAgentsRequest @@ -3939,23 +4005,15 @@ tags: - description: name: DeleteDatasetRequest +- description: + name: DeleteDatasetResponse - description: name: EmbeddingsRequest - description: name: EmbeddingsResponse -- description: - name: EvaluateQuestionAnsweringRequest -- description: - name: EvaluationJob -- description: - name: EvaluateSummarizationRequest -- description: - name: EvaluateTextGenerationRequest - description: name: GetAgentsSessionRequest @@ -3979,18 +4037,6 @@ tags: - description: name: AgentStepResponse -- description: 'Artifacts of a evaluation job. 
- - - ' - name: EvaluationJobArtifactsResponse -- description: - name: EvaluationJobLogStream -- description: - name: EvaluationJobStatusResponse - description: name: ModelDefWithProvider @@ -4067,6 +4113,14 @@ tags: name: OptimizerConfig - description: name: RLHFAlgorithm +- description: 'Dataset to be used for training or evaluating language models. + + + ' + name: TrainEvalDataset +- description: + name: TrainEvalDatasetColumnType - description: name: TrainingConfig - description: name: ScoredMessage +- description: + name: EvaluateDatasetConfig +- description: + name: EvaluateJudgeScoringConfig +- description: + name: EvaluateModelGenerationConfig +- description: + name: EvaluatePostprocessConfig +- description: + name: EvaluatePreprocessConfig +- description: + name: EvaluateProcessorConfig +- description: + name: EvaluateScoringConfig +- description: + name: EvaluateSingleScorerConfig +- description: + name: EvaluateTaskConfig +- description: + name: LLMJudgeConfig +- description: + name: RunEvalTaskRequest +- description: 'Aggregated final evaluation result. + + + ' + name: EvalResult +- description: 'Scores for evaluation. + + + ' + name: EvaluateResponse +- description: + name: RunScorerRequest - description: name: RunShieldRequest @@ -4141,7 +4240,7 @@ x-tagGroups: - Agents - BatchInference - Datasets - - Evaluations + - Evals - Inference - Inspect - Memory @@ -4172,7 +4271,6 @@ x-tagGroups: - BatchCompletionRequest - BatchCompletionResponse - BuiltinTool - - CancelEvaluationJobRequest - CancelTrainingJobRequest - ChatCompletionRequest - ChatCompletionResponse @@ -4189,31 +4287,40 @@ x-tagGroups: - CreateAgentSessionRequest - CreateAgentTurnRequest - CreateDatasetRequest + - CreateDatasetResponse + - CustomDatasetDef - DPOAlignmentConfig - DeleteAgentsRequest - DeleteAgentsSessionRequest - DeleteDatasetRequest + - DeleteDatasetResponse - DialogGenerations - DoraFinetuningConfig - EmbeddingsRequest - EmbeddingsResponse - - EvaluateQuestionAnsweringRequest - - EvaluateSummarizationRequest - - EvaluateTextGenerationRequest - - EvaluationJob - - EvaluationJobArtifactsResponse - - EvaluationJobLogStream - - EvaluationJobStatusResponse + - EvalResult + - EvaluateDatasetConfig + - EvaluateJudgeScoringConfig + - EvaluateModelGenerationConfig + - EvaluatePostprocessConfig + - EvaluatePreprocessConfig + - EvaluateProcessorConfig + - EvaluateResponse + - EvaluateScoringConfig + - EvaluateSingleScorerConfig + - EvaluateTaskConfig - FinetuningAlgorithm - FunctionCallToolDefinition - GetAgentsSessionRequest - GraphMemoryBankDef - HealthInfo + - HuggingfaceDatasetDef - ImageMedia - InferenceStep - InsertDocumentsRequest - KeyValueMemoryBankDef - KeywordMemoryBankDef + - LLMJudgeConfig - LogEventRequest - LogSeverity - LoraFinetuningConfig @@ -4243,6 +4350,8 @@ x-tagGroups: - RewardScoreRequest - RewardScoringResponse - RouteInfo + - RunEvalTaskRequest + - RunScorerRequest - RunShieldRequest - RunShieldResponse - SafetyViolation diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py deleted file mode 100644 index 2fa8bb4e5e..0000000000 --- a/llama_stack/apis/dataset/dataset.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from enum import Enum -from typing import Any, Dict, Optional, Protocol - -from llama_models.llama3.api.datatypes import URL - -from llama_models.schema_utils import json_schema_type, webmethod - -from pydantic import BaseModel - - -@json_schema_type -class TrainEvalDatasetColumnType(Enum): - dialog = "dialog" - text = "text" - media = "media" - number = "number" - json = "json" - - -@json_schema_type -class TrainEvalDataset(BaseModel): - """Dataset to be used for training or evaluating language models.""" - - # TODO(ashwin): figure out if we need to add an enum for a "dataset type" - - columns: Dict[str, TrainEvalDatasetColumnType] - content_url: URL - metadata: Optional[Dict[str, Any]] = None - - -@json_schema_type -class CreateDatasetRequest(BaseModel): - """Request to create a dataset.""" - - uuid: str - dataset: TrainEvalDataset - - -class Datasets(Protocol): - @webmethod(route="/datasets/create") - def create_dataset( - self, - uuid: str, - dataset: TrainEvalDataset, - ) -> None: ... - - @webmethod(route="/datasets/get") - def get_dataset( - self, - dataset_uuid: str, - ) -> TrainEvalDataset: ... - - @webmethod(route="/datasets/delete") - def delete_dataset( - self, - dataset_uuid: str, - ) -> None: ... diff --git a/llama_stack/apis/dataset/__init__.py b/llama_stack/apis/datasets/__init__.py similarity index 82% rename from llama_stack/apis/dataset/__init__.py rename to llama_stack/apis/datasets/__init__.py index 33557a0ab1..102b9927f3 100644 --- a/llama_stack/apis/dataset/__init__.py +++ b/llama_stack/apis/datasets/__init__.py @@ -4,4 +4,4 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from .dataset import * # noqa: F401 F403 +from .datasets import * # noqa: F401 F403 diff --git a/llama_stack/apis/datasets/client.py b/llama_stack/apis/datasets/client.py new file mode 100644 index 0000000000..e292b14d8c --- /dev/null +++ b/llama_stack/apis/datasets/client.py @@ -0,0 +1,156 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import asyncio +import json +from typing import Optional + +import fire +import httpx +from termcolor import cprint + +from .datasets import * # noqa: F403 + + +def deserialize_dataset_def(j: Optional[Dict[str, Any]]) -> Optional[DatasetDef]: + if not j: + return None + if j["type"] == "huggingface": + return HuggingfaceDatasetDef(**j) + elif j["type"] == "custom": + return CustomDatasetDef(**j) + else: + raise ValueError(f"Unknown dataset type: {j['type']}") + + +class DatasetsClient(Datasets): + def __init__(self, base_url: str): + self.base_url = base_url + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def create_dataset( + self, + dataset_def: DatasetDef, + ) -> CreateDatasetResponse: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/datasets/create", + json={ + "dataset_def": json.loads(dataset_def.json()), + }, + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + return CreateDatasetResponse(**response.json()) + + async def get_dataset( + self, + dataset_identifier: str, + ) -> Optional[DatasetDef]: + async with httpx.AsyncClient() as client: + response = await client.get( + f"{self.base_url}/datasets/get", + params={ + "dataset_identifier": dataset_identifier, + }, + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + if not response.json(): + return + + return deserialize_dataset_def(response.json()) + + async def delete_dataset( + self, + dataset_identifier: str, + ) -> DeleteDatasetResponse: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/datasets/delete", + json={ + "dataset_identifier": dataset_identifier, + }, + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + return DeleteDatasetResponse(**response.json()) + + async def list_dataset( + self, + ) -> List[DatasetDef]: + async with httpx.AsyncClient() as client: + response = await client.get( + f"{self.base_url}/datasets/list", + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + if not response.json(): + return + + return [deserialize_dataset_def(x) for x in response.json()] + + +async def run_main(host: str, port: int): + client = DatasetsClient(f"http://{host}:{port}") + + # register dataset + response = await client.create_dataset( + dataset_def=CustomDatasetDef( + identifier="test-dataset", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ), + ) + cprint(response, "green") + + # register HF dataset + response = await client.create_dataset( + dataset_def=HuggingfaceDatasetDef( + identifier="hellaswag", + dataset_name="hellaswag", + kwargs={"split": "validation", "trust_remote_code": True}, + ) + ) + cprint(response, "green") + + # get dataset + get_dataset = await client.get_dataset( + dataset_identifier="test-dataset", + ) + cprint(get_dataset, "cyan") + + # delete dataset + delete_dataset = await client.delete_dataset( + dataset_identifier="test-dataset", + ) + cprint(delete_dataset, "red") + + # get again after deletion + get_dataset = await client.get_dataset( + dataset_identifier="test-dataset", + ) + cprint(get_dataset, "yellow") + + # list datasets + list_dataset = await client.list_dataset() + cprint(list_dataset, "blue") + + +def main(host: str, port: int): + asyncio.run(run_main(host, port)) + + +if __name__ == "__main__": + fire.Fire(main) diff --git 
a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py new file mode 100644 index 0000000000..f5991c52e1 --- /dev/null +++ b/llama_stack/apis/datasets/datasets.py @@ -0,0 +1,225 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from abc import ABC, abstractmethod +from enum import Enum +from typing import Any, Dict, Generic, Iterator, Literal, Protocol, TypeVar, Union + +from llama_models.schema_utils import json_schema_type, webmethod +from llama_models.llama3.api.datatypes import * # noqa: F403 + +from pydantic import BaseModel, Field +from typing_extensions import Annotated + + +@json_schema_type +class TrainEvalDatasetColumnType(Enum): + dialog = "dialog" + text = "text" + media = "media" + number = "number" + json = "json" + + +@json_schema_type +class TrainEvalDataset(BaseModel): + """Dataset to be used for training or evaluating language models.""" + + # TODO(ashwin): figure out if we need to add an enum for a "dataset type" + + columns: Dict[str, TrainEvalDatasetColumnType] + content_url: URL + metadata: Optional[Dict[str, Any]] = None + + +@json_schema_type +class GenerationInput(BaseModel): + messages: List[Message] + + +@json_schema_type +class GenerationOutput(BaseModel): + completion_message: str + logprobs: Optional[List[TokenLogProbs]] = None + + +@json_schema_type +class PostprocessedGeneration(BaseModel): + completion_message: str + logprobs: Optional[List[TokenLogProbs]] = None + + +# A sample (row) from dataset +TDatasetSample = TypeVar("TDatasetSample") + + +@json_schema_type +class DatasetSample(BaseModel): ... + + +@json_schema_type +class DictSample(DatasetSample): + data: Dict[str, Any] + + +# A sample (row) from evals intermediate dataset after preprocessing +TPreprocessedSample = TypeVar("TPreprocessedSample") + + +@json_schema_type +class PreprocessedSample(DatasetSample): + generation_input: GenerationInput + + +# A sample (row) from evals intermediate dataset after inference +TGenerationResponseSample = TypeVar("TGenerationResponseSample") + + +@json_schema_type +class GenerationResponseSample(DatasetSample): + generation_output: GenerationOutput + + +# A sample (row) for prepared evals dataset ready for scoring +TScorerInputSample = TypeVar("TScorerInputSample") + + +@json_schema_type +class ScorerInputSample(DatasetSample): + """ + A dataset is required to have the following columns to be used for scoring: + - generated_answer: str + - expected_answer: Union[str, List[str]] + - (optional) input_query: str + - (optional) generation_output: PostprocessedGeneration + """ + + generated_answer: str + expected_answer: Union[str, List[str]] + input_query: Optional[str] = None + generation_output: Optional[PostprocessedGeneration] = None + + +@json_schema_type +class DatasetType(Enum): + custom = "custom" + huggingface = "huggingface" + + +@json_schema_type +class HuggingfaceDatasetDef(BaseModel): + type: Literal[DatasetType.huggingface.value] = DatasetType.huggingface.value + identifier: str = Field( + description="A unique name for the dataset", + ) + dataset_path: str = Field( + description="The name of the dataset into HF (e.g. meta-llama/Llama-3.1-8B-Instruct-evals)", + ) + dataset_name: Optional[str] = Field( + description="The name of the dataset into HF (e.g. 
Llama-3.1-8B-Instruct-evals__ifeval__strict__details)", + ) + rename_columns_map: Optional[Dict[str, str]] = Field( + description="A map of column names to rename to fit the schema of eval dataset for scoring", + default=None, + ) + kwargs: Dict[str, Any] = Field( + description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)", + default_factory=dict, + ) + + +@json_schema_type +class CustomDatasetDef(BaseModel): + type: Literal[DatasetType.custom.value] = DatasetType.custom.value + identifier: str = Field( + description="A unique name for the dataset", + ) + url: str = Field( + description="The URL to the dataset", + ) + rename_columns_map: Optional[Dict[str, str]] = Field( + description="A map of column names to rename to fit the schema of eval dataset for scoring", + default=None, + ) + + +DatasetDef = Annotated[ + Union[ + HuggingfaceDatasetDef, + CustomDatasetDef, + ], + Field(discriminator="type"), +] + + +class DatasetsResponseStatus(Enum): + success = "success" + fail = "fail" + + +@json_schema_type +class CreateDatasetResponse(BaseModel): + status: DatasetsResponseStatus = Field( + description="Return status of the dataset creation", + ) + msg: Optional[str] = None + + +@json_schema_type +class DeleteDatasetResponse(BaseModel): + status: DatasetsResponseStatus = Field( + description="Return status of the dataset creation", + ) + msg: Optional[str] = None + + +class BaseDataset(ABC, Generic[TDatasetSample]): + def __init__(self) -> None: + self.type: str = self.__class__.__name__ + + @property + @abstractmethod + def dataset_id(self) -> str: + raise NotImplementedError() + + @abstractmethod + def __iter__(self) -> Iterator[TDatasetSample]: + raise NotImplementedError() + + @abstractmethod + def __str__(self) -> str: + raise NotImplementedError() + + @abstractmethod + def __len__(self) -> int: + raise NotImplementedError() + + @abstractmethod + def load(self) -> None: + raise NotImplementedError() + + +class Datasets(Protocol): + @webmethod(route="/datasets/create") + async def create_dataset( + self, + dataset_def: DatasetDef, + ) -> CreateDatasetResponse: ... + + @webmethod(route="/datasets/get", method="GET") + async def get_dataset( + self, + dataset_identifier: str, + ) -> Optional[DatasetDef]: ... + + @webmethod(route="/datasets/delete") + async def delete_dataset( + self, + dataset_identifier: str, + ) -> DeleteDatasetResponse: ... + + @webmethod(route="/datasets/list", method="GET") + async def list_datasets(self) -> List[DatasetDef]: ... diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py new file mode 100644 index 0000000000..fc4820232f --- /dev/null +++ b/llama_stack/apis/evals/client.py @@ -0,0 +1,183 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
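+# This client exercises the two endpoints added above, /evals/run_eval_task and
+# /evals/run_scorer. As a rough illustration of the wire format (the identifiers
+# and model name are example values reused from the demo further down, not
+# defaults), the body POSTed to /evals/run_eval_task looks like:
+#
+#   {
+#     "eval_task_config": {
+#       "dataset_config": {"dataset_identifier": "mmlu-simple-eval-en", "row_limit": 3},
+#       "processor_config": {"processor_identifier": "mmlu"},
+#       "generation_config": {"model": "Llama3.1-8B-Instruct"},
+#       "scoring_config": {"scorer_config_list": [{"scorer_name": "accuracy"}]}
+#     }
+#   }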
+ +import asyncio +import json + +import fire +import httpx +from termcolor import cprint + +from .evals import * # noqa: F403 +import base64 +import mimetypes +import os + +from ..datasets.client import DatasetsClient + + +def data_url_from_file(file_path: str) -> str: + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + with open(file_path, "rb") as file: + file_content = file.read() + + base64_content = base64.b64encode(file_content).decode("utf-8") + mime_type, _ = mimetypes.guess_type(file_path) + + data_url = f"data:{mime_type};base64,{base64_content}" + + return data_url + + +class EvaluationClient(Evals): + def __init__(self, base_url: str): + self.base_url = base_url + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def run_evals( + self, + eval_task_config: EvaluateTaskConfig, + ) -> EvaluateResponse: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/evals/run_eval_task", + json={ + "eval_task_config": json.loads(eval_task_config.json()), + }, + headers={"Content-Type": "application/json"}, + timeout=3600, + ) + response.raise_for_status() + return EvaluateResponse(**response.json()) + + async def run_scorer( + self, + dataset_config: EvaluateDatasetConfig, + eval_scoring_config: EvaluateScoringConfig, + ) -> EvaluateResponse: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{self.base_url}/evals/run_scorer", + json={ + "dataset_config": json.loads(dataset_config.json()), + "eval_scoring_config": json.loads(eval_scoring_config.json()), + }, + headers={"Content-Type": "application/json"}, + timeout=3600, + ) + response.raise_for_status() + return EvaluateResponse(**response.json()) + + +async def run_main(host: str, port: int, eval_dataset_path: str = ""): + client = EvaluationClient(f"http://{host}:{port}") + dataset_client = DatasetsClient(f"http://{host}:{port}") + + # Full Eval Task + # 1. register custom dataset + response = await dataset_client.create_dataset( + dataset_def=CustomDatasetDef( + identifier="mmlu-simple-eval-en", + url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + ), + ) + cprint(f"datasets/create: {response}", "cyan") + + # 2. run evals on the registered dataset + eval_task_config = EvaluateTaskConfig( + dataset_config=EvaluateDatasetConfig( + dataset_identifier="mmlu-simple-eval-en", + row_limit=3, + ), + processor_config=EvaluateProcessorConfig( + processor_identifier="mmlu", + ), + generation_config=EvaluateModelGenerationConfig( + model="Llama3.1-8B-Instruct", + ), + scoring_config=EvaluateScoringConfig( + scorer_config_list=[ + EvaluateSingleScorerConfig(scorer_name="accuracy"), + EvaluateSingleScorerConfig(scorer_name="random"), + ] + ), + ) + response = await client.run_evals( + eval_task_config=eval_task_config, + ) + for k, v in response.eval_result.metrics.items(): + cprint(f"{k}: {v}", "green") + + # Scoring Task + # 1. 
register huggingface dataset + response = await dataset_client.create_dataset( + dataset_def=HuggingfaceDatasetDef( + identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals", + dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details", + rename_columns_map={ + "output_parsed_answer": "generated_answer", + "input_correct_responses": "expected_answer", + }, + kwargs={"split": "latest"}, + ) + ) + cprint(response, "cyan") + + # register custom dataset from file path + response = await dataset_client.create_dataset( + dataset_def=CustomDatasetDef( + identifier="rag-evals", + url=data_url_from_file(eval_dataset_path), + ) + ) + cprint(response, "cyan") + + # 2. run evals on the registered dataset + response = await client.run_scorer( + dataset_config=EvaluateDatasetConfig( + dataset_identifier="rag-evals", + row_limit=10, + ), + eval_scoring_config=EvaluateScoringConfig( + scorer_config_list=[ + # EvaluateSingleScorerConfig(scorer_name="accuracy"), + # EvaluateSingleScorerConfig( + # scorer_name="braintrust::answer-correctness" + # ), + EvaluateSingleScorerConfig( + scorer_name="llamastack-llm-judge", + llm_judge_config=LLMJudgeConfig( + judge_processor_config=EvaluateProcessorConfig( + processor_identifier="judge", + ), + judge_model_generation_config=EvaluateModelGenerationConfig( + model="Llama3.1-8B-Instruct", + ), + judge_scoring_config=EvaluateJudgeScoringConfig(), + ), + ), + ] + ), + ) + + for k, v in response.eval_result.metrics.items(): + cprint(f"{k}: {v}", "green") + + +def main(host: str, port: int, eval_dataset_path: str = ""): + asyncio.run(run_main(host, port, eval_dataset_path)) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 0be2243ab1..c484db734f 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -4,119 +4,256 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from enum import Enum -from typing import List, Protocol +from abc import ABC, abstractmethod +from typing import Dict, Generic, List, Optional, Protocol from llama_models.schema_utils import webmethod - from pydantic import BaseModel from llama_models.llama3.api.datatypes import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 -from llama_stack.apis.common.training_types import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 -class TextGenerationMetric(Enum): - perplexity = "perplexity" - rouge = "rouge" - bleu = "bleu" +class EvaluationJob(BaseModel): + job_uuid: str -class QuestionAnsweringMetric(Enum): - em = "em" - f1 = "f1" +class EvaluationJobLogStream(BaseModel): + job_uuid: str -class SummarizationMetric(Enum): - rouge = "rouge" - bleu = "bleu" +@json_schema_type +class EvalResult(BaseModel): + """Aggregated final evaluation result.""" + metrics: Dict[str, float] -class EvaluationJob(BaseModel): - job_uuid: str +@json_schema_type +class SingleEvalResult(BaseModel): + """Single evaluation result. 
Contains a scorer name, and corresponding metrics from scorer.""" -class EvaluationJobLogStream(BaseModel): - job_uuid: str + score_data: Dict[str, float] + + +@json_schema_type +class EvaluateResponse(BaseModel): + """Scores for evaluation.""" + eval_result: EvalResult + formatted_report: Optional[str] = None -class EvaluateTaskRequestCommon(BaseModel): + +@json_schema_type +class EvaluationJobStatusResponse(BaseModel): job_uuid: str - dataset: TrainEvalDataset - checkpoint: Checkpoint - # generation params - sampling_params: SamplingParams = SamplingParams() +@json_schema_type +class EvaluationJobCreateResponse(BaseModel): + """Response to create a evaluation job.""" + + job_uuid: str @json_schema_type -class EvaluateTextGenerationRequest(EvaluateTaskRequestCommon): - """Request to evaluate text generation.""" +class EvaluateDatasetConfig(BaseModel): + # identifier to previously registered dataset via DatasetDef + dataset_identifier: str + # limit number of rows to evaluate + row_limit: Optional[int] = None + kwargs: Optional[Dict[str, Any]] = None - metrics: List[TextGenerationMetric] + +@json_schema_type +class EvaluatePreprocessConfig(BaseModel): + kwargs: Optional[Dict[str, Any]] = None @json_schema_type -class EvaluateQuestionAnsweringRequest(EvaluateTaskRequestCommon): - """Request to evaluate question answering.""" +class EvaluateModelGenerationConfig(BaseModel): + model: str + sampling_params: SamplingParams = SamplingParams() + kwargs: Optional[Dict[str, Any]] = None - metrics: List[QuestionAnsweringMetric] + +@json_schema_type +class EvaluatePostprocessConfig(BaseModel): + kwargs: Optional[Dict[str, Any]] = None @json_schema_type -class EvaluateSummarizationRequest(EvaluateTaskRequestCommon): - """Request to evaluate summarization.""" +class EvaluateProcessorConfig(BaseModel): + processor_identifier: str + preprocess_config: Optional[EvaluatePreprocessConfig] = None + postprocess_config: Optional[EvaluatePostprocessConfig] = None - metrics: List[SummarizationMetric] +@json_schema_type +class EvaluateJudgeScoringConfig(BaseModel): ... -class EvaluationJobStatusResponse(BaseModel): - job_uuid: str + +@json_schema_type +class LLMJudgeConfig(BaseModel): + judge_processor_config: EvaluateProcessorConfig + judge_model_generation_config: EvaluateModelGenerationConfig + judge_scoring_config: EvaluateJudgeScoringConfig @json_schema_type -class EvaluationJobArtifactsResponse(BaseModel): - """Artifacts of a evaluation job.""" +class EvaluateSingleScorerConfig(BaseModel): + scorer_name: str + llm_judge_config: Optional[LLMJudgeConfig] = None - job_uuid: str + +@json_schema_type +class EvaluateScoringConfig(BaseModel): + # list of scorer (metrics) names to use + scorer_config_list: List[EvaluateSingleScorerConfig] -class Evaluations(Protocol): - @webmethod(route="/evaluate/text_generation/") - def evaluate_text_generation( +@json_schema_type +class EvaluateTaskConfig(BaseModel): + dataset_config: EvaluateDatasetConfig + processor_config: EvaluateProcessorConfig + generation_config: EvaluateModelGenerationConfig + scoring_config: EvaluateScoringConfig + + +class BaseGeneratorProcessor( + ABC, + Generic[ + TDatasetSample, + TPreprocessedSample, + TGenerationResponseSample, + TScorerInputSample, + ], +): + """ + Base class for all generator processors. 
Each processor needs to implement the following methods: + - F1: preprocess_sample(self, dataset) + - F2: postprocess_sample(self) + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + return self.__class__.__name__ + + def preprocess( + self, dataset: BaseDataset[TDatasetSample] + ) -> List[TPreprocessedSample]: + return [self.preprocess_sample(sample) for sample in dataset] + + def postprocess( + self, + generation: List[TGenerationResponseSample], + dataset: BaseDataset[TDatasetSample], + ) -> List[TScorerInputSample]: + return [ + self.postprocess_sample(generation_sample, dataset_sample) + for generation_sample, dataset_sample in zip(generation, dataset) + ] + + @abstractmethod + def preprocess_sample(self, sample: TDatasetSample) -> TPreprocessedSample: + raise NotImplementedError() + + @abstractmethod + def postprocess_sample( self, - metrics: List[TextGenerationMetric], - ) -> EvaluationJob: ... + generation_sample: TGenerationResponseSample, + dataset_sample: TDatasetSample, + ) -> TScorerInputSample: + raise NotImplementedError() + + +class BaseGenerator(ABC, Generic[TPreprocessedSample, TGenerationResponseSample]): + """ + Base class for all generators. Each generator needs to implement the following methods: + - generate(self, preprocessed_dataset) + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + return self.__class__.__name__ + + @abstractmethod + async def generate( + self, preprocessed_dataset: List[TPreprocessedSample] + ) -> List[TGenerationResponseSample]: + raise NotImplementedError() + + +class BaseScorer(ABC, Generic[TScorerInputSample]): + """ + Base class for all scorers. Each scorer needs to implement the following methods: + - score_sample(self, scorer_input_sample) + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + return self.__class__.__name__ + + @abstractmethod + def score_sample(self, scorer_input_sample: TScorerInputSample) -> SingleEvalResult: + raise NotImplementedError() + + @abstractmethod + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + raise NotImplementedError() + + def score( + self, prepared_eval_dataset: List[TScorerInputSample] + ) -> List[SingleEvalResult]: + return [self.score_sample(sample) for sample in prepared_eval_dataset] + + +class BaseTask(ABC): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + @abstractmethod + async def run(self, *args, **kwargs) -> EvalResult: + raise NotImplementedError() + + +class Evals(Protocol): - @webmethod(route="/evaluate/question_answering/") - def evaluate_question_answering( + @webmethod(route="/evals/run_eval_task") + async def run_eval_task( self, - metrics: List[QuestionAnsweringMetric], - ) -> EvaluationJob: ... + eval_task_config: EvaluateTaskConfig, + ) -> EvaluateResponse: ... - @webmethod(route="/evaluate/summarization/") - def evaluate_summarization( + @webmethod(route="/evals/run_scorer") + async def run_scorer( self, - metrics: List[SummarizationMetric], - ) -> EvaluationJob: ... + dataset_config: EvaluateDatasetConfig, + eval_scoring_config: EvaluateScoringConfig, + ) -> EvaluateResponse: ... - @webmethod(route="/evaluate/jobs") - def get_evaluation_jobs(self) -> List[EvaluationJob]: ... + # @webmethod(route="/evals/jobs") + # def get_evaluation_jobs(self) -> List[EvaluationJob]: ... 
- @webmethod(route="/evaluate/job/status") - def get_evaluation_job_status( - self, job_uuid: str - ) -> EvaluationJobStatusResponse: ... + # @webmethod(route="/evals/job/create") + # async def create_evaluation_job( + # self, model: str, dataset: str, task: str + # ) -> EvaluationJob: ... - # sends SSE stream of logs - @webmethod(route="/evaluate/job/logs") - def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ... + # @webmethod(route="/evals/job/status") + # def get_evaluation_job_status( + # self, job_uuid: str + # ) -> EvaluationJobStatusResponse: ... - @webmethod(route="/evaluate/job/cancel") - def cancel_evaluation_job(self, job_uuid: str) -> None: ... + # # sends SSE stream of logs + # @webmethod(route="/evals/job/logs") + # def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ... - @webmethod(route="/evaluate/job/artifacts") - def get_evaluation_job_artifacts( - self, job_uuid: str - ) -> EvaluationJobArtifactsResponse: ... + # @webmethod(route="/evals/job/cancel") + # def cancel_evaluation_job(self, job_uuid: str) -> None: ... diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py index d943f48b20..cdfe5c4673 100644 --- a/llama_stack/apis/post_training/post_training.py +++ b/llama_stack/apis/post_training/post_training.py @@ -14,7 +14,7 @@ from pydantic import BaseModel, Field from llama_models.llama3.api.datatypes import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 from llama_stack.apis.common.training_types import * # noqa: F403 diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 0044de09ee..ce7f5a8e50 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -73,6 +73,16 @@ class RoutingTableProviderSpec(ProviderSpec): pip_packages: List[str] = Field(default_factory=list) +# Example: /datasets +class RegistryProviderSpec(ProviderSpec): + provider_type: str = "registry" + config_class: str = "" + docker_image: Optional[str] = None + + module: str + pip_packages: List[str] = Field(default_factory=list) + + class DistributionSpec(BaseModel): description: Optional[str] = Field( default="", diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index 999646cc06..d96db23b46 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -21,6 +21,19 @@ class AutoRoutedApiInfo(BaseModel): router_api: Api +class RegistryApiInfo(BaseModel): + registry_api: Api + # registry: Registry + + +def builtin_registry_apis() -> List[RegistryApiInfo]: + return [ + RegistryApiInfo( + registry_api=Api.datasets, + ) + ] + + def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: return [ AutoRoutedApiInfo( @@ -42,7 +55,12 @@ def providable_apis() -> List[Api]: routing_table_apis = set( x.routing_table_api for x in builtin_automatically_routed_apis() ) - return [api for api in Api if api not in routing_table_apis and api != Api.inspect] + registry_apis = set( + x.registry_api for x in builtin_registry_apis() if x.registry_api + ) + non_providable_apis = routing_table_apis | registry_apis | {Api.inspect} + + return [api for api in Api if api not in non_providable_apis] def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]: diff --git a/llama_stack/distribution/registry/__init__.py b/llama_stack/distribution/registry/__init__.py 
new file mode 100644 index 0000000000..6e68333280 --- /dev/null +++ b/llama_stack/distribution/registry/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import Any + +from llama_stack.providers.datatypes import Api +from .datasets.dataset import DatasetRegistryImpl + + +async def get_registry_impl(api: Api, _deps) -> Any: + api_to_registry = { + "datasets": DatasetRegistryImpl, + } + + if api.value not in api_to_registry: + raise ValueError(f"API {api.value} not found in registry map") + + impl = api_to_registry[api.value]() + await impl.initialize() + return impl diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py new file mode 100644 index 0000000000..4474c8d7d8 --- /dev/null +++ b/llama_stack/distribution/registry/datasets/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from llama_stack.apis.datasets import * # noqa: F403 +from ..registry import Registry + + +DatasetRegistry = Registry[BaseDataset]() diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py new file mode 100644 index 0000000000..838e8c65fa --- /dev/null +++ b/llama_stack/distribution/registry/datasets/dataset.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
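+# Rough usage sketch for the registry impl below, which backs the /datasets/*
+# routes (illustrative only; the identifier and URL are made up):
+#
+#   impl = DatasetRegistryImpl()
+#   await impl.initialize()
+#   resp = await impl.create_dataset(
+#       dataset_def=CustomDatasetDef(identifier="demo", url="https://example.com/demo.csv")
+#   )
+#   assert resp.status == DatasetsResponseStatus.success
+#   assert (await impl.get_dataset("demo")).identifier == "demo"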
+ +from llama_stack.apis.datasets import * # noqa: F403 +from llama_stack.distribution.registry.datasets import DatasetRegistry +from llama_stack.distribution.registry.datasets.dataset_wrappers import ( + CustomDataset, + HuggingfaceDataset, +) + + +class DatasetRegistryImpl(Datasets): + """API Impl to interact with underlying dataset registry""" + + def __init__( + self, + ) -> None: + pass + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def create_dataset( + self, + dataset_def: DatasetDef, + ) -> CreateDatasetResponse: + if dataset_def.type == DatasetType.huggingface.value: + dataset_cls = HuggingfaceDataset(dataset_def) + else: + dataset_cls = CustomDataset(dataset_def) + + try: + DatasetRegistry.register( + dataset_def.identifier, + dataset_cls, + ) + except ValueError as e: + return CreateDatasetResponse( + status=DatasetsResponseStatus.fail, + msg=str(e), + ) + + return CreateDatasetResponse( + status=DatasetsResponseStatus.success, + msg=f"Dataset '{dataset_def.identifier}' registered", + ) + + async def get_dataset( + self, + dataset_identifier: str, + ) -> Optional[DatasetDef]: + try: + dataset_ref = DatasetRegistry.get(dataset_identifier).config + except ValueError as e: + return None + + return dataset_ref + + async def delete_dataset(self, dataset_identifier: str) -> DeleteDatasetResponse: + try: + DatasetRegistry.delete(dataset_identifier) + except ValueError as e: + return DeleteDatasetResponse( + status=DatasetsResponseStatus.fail, + msg=str(e), + ) + + return DeleteDatasetResponse( + status=DatasetsResponseStatus.success, + msg=f"Dataset '{dataset_identifier}' deleted", + ) + + async def list_datasets(self) -> List[DatasetDef]: + return [ + DatasetRegistry.get(dataset_identifier).config + for dataset_identifier in DatasetRegistry.names() + ] diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py new file mode 100644 index 0000000000..6c9af5887c --- /dev/null +++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py @@ -0,0 +1,115 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
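+# Rough sketch of how these wrappers are consumed (illustrative; the identifier,
+# URL and column name are made up). Loading is lazy: iterating triggers load()
+# on first use and yields one DictSample per row:
+#
+#   ds = CustomDataset(CustomDatasetDef(identifier="demo", url="https://example.com/demo.csv"))
+#   for sample in ds:
+#       print(sample.data["question"])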
+import base64
+import io
+from urllib.parse import unquote
+
+import pandas
+from datasets import Dataset, load_dataset
+
+from llama_stack.apis.datasets import *  # noqa: F403
+from llama_stack.providers.utils.memory.vector_store import parse_data_url
+
+
+class CustomDataset(BaseDataset[DictSample]):
+    def __init__(self, config: CustomDatasetDef) -> None:
+        super().__init__()
+        self.config = config
+        self.dataset = None
+        self.index = 0
+
+    @property
+    def dataset_id(self) -> str:
+        return self.config.identifier
+
+    def __iter__(self) -> Iterator[DictSample]:
+        if not self.dataset:
+            self.load()
+        return (DictSample(data=x) for x in self.dataset)
+
+    def __str__(self) -> str:
+        return f"CustomDataset({self.config})"
+
+    def __len__(self) -> int:
+        if not self.dataset:
+            self.load()
+        return len(self.dataset)
+
+    def load(self, n_samples: Optional[int] = None) -> None:
+        if self.dataset:
+            return
+
+        # TODO: more robust support w/ data url
+        if self.config.url.endswith(".csv"):
+            df = pandas.read_csv(self.config.url)
+        elif self.config.url.endswith(".xlsx"):
+            df = pandas.read_excel(self.config.url)
+        elif self.config.url.startswith("data:"):
+            # decode inline data: URLs in-memory before parsing
+            parts = parse_data_url(self.config.url)
+            data = parts["data"]
+            if parts["is_base64"]:
+                data = base64.b64decode(data)
+            else:
+                data = unquote(data)
+                encoding = parts["encoding"] or "utf-8"
+                data = data.encode(encoding)
+
+            mime_type = parts["mimetype"]
+            mime_category = mime_type.split("/")[0]
+            data_bytes = io.BytesIO(data)
+
+            if mime_category == "text":
+                df = pandas.read_csv(data_bytes)
+            else:
+                df = pandas.read_excel(data_bytes)
+        else:
+            raise ValueError(f"Unsupported file type: {self.config.url}")
+
+        if n_samples is not None:
+            df = df.sample(n=min(n_samples, len(df)))
+
+        self.dataset = Dataset.from_pandas(df)
+        if self.config.rename_columns_map:
+            for k, v in self.config.rename_columns_map.items():
+                self.dataset = self.dataset.rename_column(k, v)
+
+
+class HuggingfaceDataset(BaseDataset[DictSample]):
+    def __init__(self, config: HuggingfaceDatasetDef):
+        super().__init__()
+        self.config = config
+        self.dataset = None
+
+    @property
+    def dataset_id(self) -> str:
+        return self.config.identifier
+
+    def __iter__(self) -> Iterator[DictSample]:
+        if not self.dataset:
+            self.load()
+        return (DictSample(data=x) for x in self.dataset)
+
+    def __str__(self):
+        return f"HuggingfaceDataset({self.config})"
+
+    def __len__(self):
+        if not self.dataset:
+            self.load()
+        return len(self.dataset)
+
+    def load(self, n_samples: Optional[int] = None):
+        if self.dataset:
+            return
+
+        if self.config.dataset_name:
+            self.config.kwargs["name"] = self.config.dataset_name
+
+        self.dataset = load_dataset(self.config.dataset_path, **self.config.kwargs)
+
+        if n_samples:
+            self.dataset = self.dataset.select(range(n_samples))
+
+        if self.config.rename_columns_map:
+            for k, v in self.config.rename_columns_map.items():
+                self.dataset = self.dataset.rename_column(k, v)
diff --git a/llama_stack/distribution/registry/generator_processors/__init__.py b/llama_stack/distribution/registry/generator_processors/__init__.py
new file mode 100644
index 0000000000..862984f548
--- /dev/null
+++ b/llama_stack/distribution/registry/generator_processors/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
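+# Processors are looked up here by name and instantiated at the point of use,
+# e.g. (illustrative sketch, mirroring how the eval tasks and the LLM-judge
+# scorer resolve them):
+#
+#   processor = GeneratorProcessorRegistry.get("mmlu")()
+#   preprocessed = processor.preprocess(dataset)                   # rows -> generation inputs
+#   scorer_inputs = processor.postprocess(generations, dataset)    # generations -> scorer inputs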
+from llama_stack.apis.evals import *  # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.processor import *  # noqa: F403
+
+from ..registry import Registry
+
+# TODO: decide whether we should group dataset+processor together via Tasks
+GeneratorProcessorRegistry = Registry[BaseGeneratorProcessor]()
+
+PROCESSOR_REGISTRY = {
+    "mmlu": MMLUProcessor,
+    "judge": JudgeProcessor,
+}
+
+for k, v in PROCESSOR_REGISTRY.items():
+    GeneratorProcessorRegistry.register(k, v)
diff --git a/llama_stack/distribution/registry/registry.py b/llama_stack/distribution/registry/registry.py
new file mode 100644
index 0000000000..702ed7d869
--- /dev/null
+++ b/llama_stack/distribution/registry/registry.py
@@ -0,0 +1,36 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import AbstractSet, Generic, TypeVar
+
+TRegistry = TypeVar("TRegistry")
+
+
+class Registry(Generic[TRegistry]):
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.registry = {}
+
+    def names(self) -> AbstractSet[str]:
+        return self.registry.keys()
+
+    def register(self, name: str, task: TRegistry) -> None:
+        if name in self.registry:
+            raise ValueError(f"{name} is already registered.")
+        self.registry[name] = task
+
+    def get(self, name: str) -> TRegistry:
+        if name not in self.registry:
+            raise ValueError(f"{name} not found in registry.")
+        return self.registry[name]
+
+    def delete(self, name: str) -> None:
+        if name not in self.registry:
+            raise ValueError(f"{name} not found in registry.")
+        del self.registry[name]
+
+    def reset(self) -> None:
+        self.registry = {}
diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py
new file mode 100644
index 0000000000..dda71d4e00
--- /dev/null
+++ b/llama_stack/distribution/registry/scorers/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
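+# Scorers are resolved by name in the same way; multiple scorers are typically
+# wrapped in an AggregateScorer (defined later in this change). Illustrative
+# sketch:
+#
+#   scorers = [ScorerRegistry.get(name)() for name in ("accuracy", "random")]
+#   aggregate = AggregateScorer(scorers=scorers)
+#   eval_result = aggregate.aggregate_results(aggregate.score(scorer_input_samples))
+#   print(eval_result.metrics)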
+# TODO: make these import config based +from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.braintrust_scorer import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.llm_judge_scorer import * # noqa: F403 + +from ..registry import Registry + +# TODO: make these import config based +ScorerRegistry = Registry[BaseScorer]() + +SCORER_REGISTRY = { + "accuracy": AccuracyScorer, + "random": RandomScorer, + "llamastack-llm-judge": LlamaStackLLMJudgeScorer, + "braintrust::factuality": BraintrustFactualityScorer, + "braintrust::answer-correctness": BraintrustAnswerCorrectnessScorer, +} + +for k, v in SCORER_REGISTRY.items(): + ScorerRegistry.register(k, v) diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index a05e08cd7c..e71c3fd8ce 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -12,6 +12,8 @@ from llama_stack.distribution.datatypes import * # noqa: F403 from llama_stack.apis.agents import Agents +from llama_stack.apis.datasets import Datasets +from llama_stack.apis.evals import Evals from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.memory import Memory @@ -22,6 +24,7 @@ from llama_stack.apis.telemetry import Telemetry from llama_stack.distribution.distribution import ( builtin_automatically_routed_apis, + builtin_registry_apis, get_provider_registry, ) from llama_stack.distribution.utils.dynamic import instantiate_class_type @@ -38,6 +41,8 @@ def api_protocol_map() -> Dict[Api, Any]: Api.safety: Safety, Api.shields: Shields, Api.telemetry: Telemetry, + Api.evals: Evals, + Api.datasets: Datasets, } @@ -137,6 +142,20 @@ async def resolve_impls_with_routing(run_config: StackRunConfig) -> Dict[Api, An ) } + for info in builtin_registry_apis(): + providers_with_specs[info.registry_api.value] = { + "__builtin__": ProviderWithSpec( + provider_id="__registry__", + provider_type="__registry__", + config={}, + spec=RegistryProviderSpec( + api=info.registry_api, + module="llama_stack.distribution.registry", + deps__=[], + ), + ) + } + sorted_providers = topological_sort( {k: v.values() for k, v in providers_with_specs.items()} ) @@ -257,6 +276,12 @@ async def instantiate_provider( config = None args = [provider_spec.api, inner_impls, deps] + elif isinstance(provider_spec, RegistryProviderSpec): + print("ROUTER PROVIDER SPEC") + method = "get_registry_impl" + + config = None + args = [provider_spec.api, deps] else: method = "get_provider_impl" diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index 777cd855b7..1d397c9e73 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -32,6 +32,9 @@ class Api(Enum): # built-in API inspect = "inspect" + evals = "evals" + datasets = "datasets" + class ModelsProtocolPrivate(Protocol): async def list_models(self) -> List[ModelDef]: ... diff --git a/llama_stack/providers/impls/meta_reference/evals/__init__.py b/llama_stack/providers/impls/meta_reference/evals/__init__.py new file mode 100644 index 0000000000..f4dd4b79d6 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .config import MetaReferenceEvalsImplConfig # noqa +from llama_stack.apis.inference import * # noqa: F403 +from llama_stack.distribution.datatypes import Api, ProviderSpec + + +async def get_provider_impl( + config: MetaReferenceEvalsImplConfig, deps: Dict[Api, ProviderSpec] +): + from .evals import MetaReferenceEvalsImpl + + impl = MetaReferenceEvalsImpl(config, deps[Api.inference]) + await impl.initialize() + return impl diff --git a/llama_stack/providers/impls/meta_reference/evals/config.py b/llama_stack/providers/impls/meta_reference/evals/config.py new file mode 100644 index 0000000000..05dee366ed --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/config.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pydantic import BaseModel + + +class MetaReferenceEvalsImplConfig(BaseModel): ... diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py new file mode 100644 index 0000000000..7d3eaa85d8 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -0,0 +1,58 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import json + +from termcolor import cprint + +from llama_stack.apis.inference import * # noqa: F403 +from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 + +from .config import MetaReferenceEvalsImplConfig +from .tasks.run_eval_task import RunEvalTask +from .tasks.run_scoring_task import RunScoringTask + + +class MetaReferenceEvalsImpl(Evals): + def __init__(self, config: MetaReferenceEvalsImplConfig, inference_api: Inference): + self.inference_api = inference_api + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def run_eval_task( + self, + eval_task_config: EvaluateTaskConfig, + ) -> EvaluateResponse: + cprint(f"run_eval_task: on {eval_task_config}", "green") + + run_task = RunEvalTask() + eval_result = await run_task.run(eval_task_config, self.inference_api) + + return EvaluateResponse( + eval_result=eval_result, + formatted_report=json.dumps(eval_result.json(), indent=4), + ) + + async def run_scorer( + self, + dataset_config: EvaluateDatasetConfig, + eval_scoring_config: EvaluateScoringConfig, + ) -> EvaluateResponse: + cprint(f"run_scorer: on {dataset_config} with {eval_scoring_config}", "green") + + run_task = RunScoringTask() + eval_result = await run_task.run( + dataset_config, eval_scoring_config, self.inference_api + ) + + return EvaluateResponse( + eval_result=eval_result, + formatted_report=json.dumps(eval_result.json(), indent=4), + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py b/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py new file mode 100644 index 0000000000..dafbb16f5b --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py @@ -0,0 +1,47 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from termcolor import cprint + +from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.inference import * # noqa: F403 + + +class InferenceGenerator(BaseGenerator[PreprocessedSample, GenerationResponseSample]): + """ + InferenceGenerator for LlamaStack + """ + + def __init__( + self, + model, + inference_api, + *args, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + self.model = model + self.inference_api = inference_api + + async def generate( + self, preprocessed_dataset: List[PreprocessedSample] + ) -> List[GenerationResponseSample]: + generation_outputs = [] + for sample in preprocessed_dataset: + response = await self.inference_api.chat_completion( + model=self.model, + messages=sample.generation_input.messages, + stream=False, + ) + cprint(f"response: {response}", "cyan") + + generation_outputs.append( + GenerationResponseSample( + generation_output=GenerationOutput( + completion_message=response.completion_message.content + ) + ) + ) + return generation_outputs diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py new file mode 100644 index 0000000000..5a7ca27958 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from .judge_processor import JudgeProcessor # noqa: F401 +from .mmlu_processor import MMLUProcessor # noqa: F401 diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py b/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py new file mode 100644 index 0000000000..d7d6ae3eb2 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import re + +from llama_stack.apis.evals import * # noqa: F403 + +JUDGE_PROMPT = """ +You will be given a question, a expected_answer, and a system_answer. +Your task is to provide a 'total rating' scoring how well the system_answer answers compared with ground truth in expected_answer in terms of factual correctness to the question. +Give your answer as a integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question. 
+ +Provide your feedback as follows: + +Feedback::: +Total rating: (your rating, as a int between 0 and 5) + +Now here are the question, expected_answer, system_answer. + +Question: {question} +Expected Answer: {expected_answer} +System Answer: {answer} + +Feedback::: +Total rating: +""" + + +class JudgeProcessor( + BaseGeneratorProcessor[ + DictSample, PreprocessedSample, GenerationResponseSample, ScorerInputSample + ] +): + """ + Generator processor for LLM Judge + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def preprocess_sample(self, sample: DictSample) -> PreprocessedSample: + content = JUDGE_PROMPT.format( + question=sample.data["input_query"], + expected_answer=sample.data["expected_answer"], + answer=sample.data["generated_answer"], + ) + preprocessed_msgs = [ + { + "role": "user", + "content": content, + } + ] + processed_sample = PreprocessedSample( + generation_input=GenerationInput( + messages=preprocessed_msgs, + ) + ) + return processed_sample + + def postprocess_sample( + self, generation_sample: GenerationResponseSample, dataset_sample: DictSample + ) -> ScorerInputSample: + response_text = generation_sample.generation_output.completion_message + match = re.search(r"Total rating: (\d+)", response_text) + judge_rating = int(match.group(1)) + + return ScorerInputSample( + generated_answer=str(judge_rating), + expected_answer=dataset_sample.data["expected_answer"], + generation_output=PostprocessedGeneration( + completion_message=response_text, + ), + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py new file mode 100644 index 0000000000..fc2d9eb642 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py @@ -0,0 +1,161 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import re + +from llama_stack.apis.evals import * # noqa: F403 + +QUERY_TEMPLATE_MULTICHOICE = """ +Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. + +{Question} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + +MULTILINGUAL_ANSWER_REGEXES = [ + r"Answer\s*:", + r"Answer\s*:​​​​​​", # Korean invisible character + r"উত্তর\s*:", + r"उत्तर\s*:", + r"উত্তরঃ", + r"উত্তর\s*:", + r"Antwort\s*:", + r"답변\s*:", + r"정답\s*:", + r"답\s*:", + r"答案\s*:", + r"答案\s*:", + r"答\s*:", + r"答\s*:", + r"答复\s*:", + r"答曰\s*:", + r"الإجابة:", + r"الجواب:", + r"إجابة:", + r"الإجابة النهائية:", + r"الإجابة الصحيحة:", + r"الإجابة الصحيحة هي:", + r"الإجابة هي:", + r"Respuesta\s*:", + r"Risposta\s*:", + r"答え\s*:", + r"答え\s*:", + r"回答\s*:", + r"回答\s*:", + r"解答\s*:", + r"Jawaban\s*:", + r"Réponse\s*:", + r"Resposta\s*:", + r"Jibu\s*:", + r"Idahun\s*:", + r"Ìdáhùn\s*:", + r"Idáhùn\s*:", + r"Àmọ̀nà\s*:", + r"Àdáhùn\s*:", + r"Ànúgọ\s*:", + r"Àṣàyàn\s*:", +] + +MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = ( + r"(?i){}\s*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[A]|[B]|[C]|[D])" +) + + +def normalize_response(response: str) -> str: + """ + Normalize the response by removing markdown and LaTeX formatting that may prevent a match. 
+ """ + + return ( + response.replace("**", "") + .replace("$\\boxed{", "") + .replace("}$", "") + .replace("\\$", "") + .replace("$\\text{", "") + .replace("$", "") + .replace("\\mathrm{", "") + .replace("\\{", "") + .replace("\\text", "") + .replace("\\(", "") + .replace("\\mathbf{", "") + .replace("{", "") + .replace("\\boxed", "") + ) + + +def normalize_extracted_answer(extracted_answer: str) -> str: + return ( + # In arabic these are the letters used for A-D in multiple choice questions + extracted_answer.replace("أ", " A") + .replace("ب", " B") + .replace("ج", " C") + .replace("د", " D") + # In Bengali these are the letters used for A-D in multiple choice questions + .replace("অ", " A") + .replace("ব", " B") + .replace("ড", " C") + .replace("ঢ", " D") + # In Japanese these are the letters sometimes used for A-D in multiple choice questions + .replace("A", " A") + .replace("B", " B") + .replace("C", " C") + .replace("D", " D") + .strip() + ) + + +class MMLUProcessor( + BaseGeneratorProcessor[ + DictSample, PreprocessedSample, GenerationResponseSample, ScorerInputSample + ] +): + """ + Generator processor for MMLU + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def preprocess_sample(self, sample: DictSample) -> PreprocessedSample: + content = QUERY_TEMPLATE_MULTICHOICE.format(**sample.data) + preprocessed_msgs = [ + { + "role": "user", + "content": content, + } + ] + processed_sample = PreprocessedSample( + generation_input=GenerationInput( + messages=preprocessed_msgs, + ) + ) + return processed_sample + + def postprocess_sample( + self, generation_sample: GenerationResponseSample, dataset_sample: DictSample + ) -> ScorerInputSample: + response_text = generation_sample.generation_output.completion_message + normalized_response = normalize_response(response_text) + + # extract answer + extracted_answer = "" + for answer_regex in MULTILINGUAL_ANSWER_REGEXES: + regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) + match = re.search(regex, normalized_response) + if match: + extracted_answer = normalize_extracted_answer(match.group(1)) + break + + return ScorerInputSample( + generated_answer=extracted_answer, + expected_answer=dataset_sample.data["Answer"], + generation_output=PostprocessedGeneration( + completion_message=response_text, + ), + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py new file mode 100644 index 0000000000..6424963f87 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from .basic_scorers import * # noqa: F401 F403 +from .aggregate_scorer import * # noqa: F401 F403 diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py new file mode 100644 index 0000000000..1a0621960e --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult +from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 + + +class AggregateScorer(BaseScorer[ScorerInputSample]): + def __init__(self, scorers: List[BaseScorer[ScorerInputSample]]): + self.scorers = scorers + + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + all_score_data = {} + for scorer in self.scorers: + score_data = scorer.score_sample(scorer_input_sample).score_data + for k, v in score_data.items(): + all_score_data[k] = v + + return SingleEvalResult( + score_data=all_score_data, + ) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + all_metrics = {} + + for scorer in self.scorers: + metrics = scorer.aggregate_results(eval_results).metrics + for k, v in metrics.items(): + all_metrics[f"{scorer.__class__.__name__}:{k}"] = v + + return EvalResult( + metrics=all_metrics, + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py new file mode 100644 index 0000000000..748f9fc1f8 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py @@ -0,0 +1,55 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import random + +from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult +from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 + + +class RandomScorer(BaseScorer[ScorerInputSample]): + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + return SingleEvalResult(score_data={"random": random.random()}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + avg_random = sum( + [result.score_data["random"] for result in eval_results] + ) / len(eval_results) + max_random = max([result.score_data["random"] for result in eval_results]) + return EvalResult( + metrics={ + "avg_random": avg_random, + "max_random": max_random, + } + ) + + +class AccuracyScorer(BaseScorer[ScorerInputSample]): + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + extracted_answer = scorer_input_sample.generated_answer + expected_answer = scorer_input_sample.expected_answer + + if isinstance(expected_answer, list): + accuracy = ( + 1.0 if extracted_answer and extracted_answer in expected_answer else 0.0 + ) + else: + accuracy = ( + 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0 + ) + + return SingleEvalResult(score_data={"accuracy": accuracy}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + num_correct = sum([result.score_data["accuracy"] for result in eval_results]) + num_total = len(eval_results) + + return EvalResult( + metrics={ + "avg_accuracy": num_correct / num_total, + "num_correct": num_correct, + "num_total": num_total, + } + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py new file mode 100644 index 0000000000..c124aaad6a --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py @@ -0,0 +1,57 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import numpy as np + +from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult +from llama_stack.apis.datasets.datasets import * # noqa: F401 F403 +from autoevals.llm import * # noqa: F403 +from autoevals.ragas import * # noqa: F403 + + +class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]): + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + input_query = scorer_input_sample.input_query + generated_answer = scorer_input_sample.generated_answer + expected_answer = scorer_input_sample.expected_answer + + evaluator = Factuality() + result = evaluator(generated_answer, expected_answer, input=input_query) + factuality = result.score + return SingleEvalResult(score_data={"factuality": factuality}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + avg_score = np.average( + [result.score_data["factuality"] for result in eval_results] + ) + + return EvalResult( + metrics={ + "avg_factuality_score": avg_score, + } + ) + + +class BraintrustAnswerCorrectnessScorer(BaseScorer[ScorerInputSample]): + def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult: + input_query = scorer_input_sample.input_query + generated_answer = scorer_input_sample.generated_answer + expected_answer = scorer_input_sample.expected_answer + + evaluator = AnswerCorrectness() + result = evaluator(generated_answer, expected_answer, input=input_query) + correctness = result.score + return SingleEvalResult(score_data={"answer_correctness": correctness}) + + def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult: + avg_score = np.average( + [result.score_data["answer_correctness"] for result in eval_results] + ) + + return EvalResult( + metrics={ + "avg_correctness_score": avg_score, + } + ) diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py new file mode 100644 index 0000000000..f5f56b435f --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
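+# LLM-judge scorer: for each sample, the configured judge processor formats the
+# (input_query, generated_answer, expected_answer) triple into a judge prompt, the
+# judge model is called through the Inference API on a background event loop, and
+# the judge's completion is parsed into a numeric "judge_score".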
+import asyncio
+import threading
+
+import numpy as np
+
+from llama_stack.distribution.registry.generator_processors import (
+    GeneratorProcessorRegistry,
+)
+from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import (
+    InferenceGenerator,
+)
+
+from llama_stack.apis.evals.evals import *  # noqa: F401 F403
+from llama_stack.apis.datasets.datasets import *  # noqa: F401 F403
+from llama_stack.apis.inference import *  # noqa: F403
+
+
+class LlamaStackLLMJudgeScorer(BaseScorer[ScorerInputSample]):
+    def __init__(self, llm_judge_config: LLMJudgeConfig, inference_api: Inference):
+        self.llm_judge_config = llm_judge_config
+        self.inference_api = inference_api
+        # https://stackoverflow.com/questions/74703727/how-to-call-async-function-from-sync-funcion-and-get-result-while-a-loop-is-alr
+        # We will use another thread with its own event loop to run the async API within this sync function
+        self._loop = asyncio.new_event_loop()
+        self._thr = threading.Thread(
+            target=self._loop.run_forever, name="Async Runner", daemon=True
+        )
+        if not self._thr.is_alive():
+            self._thr.start()
+
+    def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+        input_query = scorer_input_sample.input_query
+        generated_answer = scorer_input_sample.generated_answer
+        expected_answer = scorer_input_sample.expected_answer
+
+        # Judge F1 (preprocess): format the judge prompt with the configured processor
+        processor = GeneratorProcessorRegistry.get(
+            self.llm_judge_config.judge_processor_config.processor_identifier
+        )()
+        data_sample = DictSample(
+            data={
+                "input_query": input_query,
+                "generated_answer": generated_answer,
+                "expected_answer": expected_answer,
+            }
+        )
+        preprocessed_sample = processor.preprocess_sample(data_sample)
+
+        # Judge generation: run the judge model through the Inference API
+        generator = InferenceGenerator(
+            model=self.llm_judge_config.judge_model_generation_config.model,
+            inference_api=self.inference_api,
+        )
+
+        future = asyncio.run_coroutine_threadsafe(
+            generator.generate([preprocessed_sample]), self._loop
+        )
+        generation_outputs = future.result()
+        # Judge F2 (postprocess): extract the judge's answer from the generation
+        postprocessed_sample = processor.postprocess_sample(
+            generation_outputs[0], data_sample
+        )
+
+        # Judge F3 (score): the postprocessed judge answer is expected to be numeric
+        score = float(postprocessed_sample.generated_answer)
+
+        return SingleEvalResult(score_data={"judge_score": score})
+
+    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+        avg_score = np.average(
+            [result.score_data["judge_score"] for result in eval_results]
+        )
+
+        return EvalResult(
+            metrics={
+                "avg_judge_score": avg_score,
+            }
+        )
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py b/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py
new file mode 100644
index 0000000000..756f351d88
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py
new file mode 100644
index 0000000000..fbd98128f1
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
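+# RunEvalTask: end-to-end eval pipeline — load the registered dataset, preprocess it
+# into generation inputs (F1), generate answers via the Inference API, postprocess the
+# generations (F2), then score and aggregate with the configured scorers (F3).
+#
+# Rough usage sketch (the config values are illustrative, not part of this change):
+#
+#     task = RunEvalTask()
+#     eval_result = await task.run(eval_task_config, inference_api)
+#     print(eval_result.metrics)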
+from llama_stack.distribution.registry.datasets import DatasetRegistry +from llama_stack.distribution.registry.generator_processors import ( + GeneratorProcessorRegistry, +) +from llama_stack.distribution.registry.scorers import ScorerRegistry + +from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import ( + InferenceGenerator, +) + + +from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.inference import * # noqa: F403 +from termcolor import cprint + + +class RunEvalTask(BaseTask): + """ + RunEvalTask for LlamaStack + """ + + def __init__( + self, + *args, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + + async def run( + self, + eval_task_config: EvaluateTaskConfig, + inference_api: Inference, + *args, + **kwargs, + ) -> EvalResult: + print(f"Running eval task w/ {eval_task_config}") + + print(DatasetRegistry.names()) + dataset = DatasetRegistry.get( + eval_task_config.dataset_config.dataset_identifier + ) + dataset.load(n_samples=eval_task_config.dataset_config.row_limit) + print(f"Running on {len(dataset)} samples") + + # F1 + print(GeneratorProcessorRegistry.names()) + processor = GeneratorProcessorRegistry.get( + eval_task_config.processor_config.processor_identifier + )() + preprocessed = processor.preprocess(dataset) + + # Generation + generator = InferenceGenerator( + model=eval_task_config.generation_config.model, + inference_api=inference_api, + ) + generation_outputs = await generator.generate(preprocessed) + + # F2 + postprocessed = processor.postprocess(generation_outputs, dataset) + cprint(postprocessed, "blue") + + # F3 - scorer + scorer_config_list = eval_task_config.scoring_config.scorer_config_list + scorer_list = [] + for s_conf in scorer_config_list: + scorer = ScorerRegistry.get(s_conf.scorer_name) + if s_conf.llm_judge_config: + scorer_list.append( + scorer( + llm_judge_config=s_conf.llm_judge_config, + inference_api=inference_api, + ) + ) + else: + scorer_list.append(scorer()) + + scorer = AggregateScorer( + scorers=scorer_list, + ) + + scorer_results = scorer.score(postprocessed) + cprint(scorer_results, "magenta") + eval_result = scorer.aggregate_results(scorer_results) + + return eval_result diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py new file mode 100644 index 0000000000..6b11191f1e --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py @@ -0,0 +1,90 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
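+# RunScoringTask: scoring-only pipeline — the dataset is expected to already contain
+# "generated_answer" and "expected_answer" (and optionally "input_query") columns,
+# which are converted to ScorerInputSamples and passed straight to the scorers (F3);
+# no generation is performed.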
+from llama_stack.distribution.registry.datasets import DatasetRegistry +from llama_stack.distribution.registry.scorers import ScorerRegistry + +from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 + +from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.inference import * # noqa: F403 + + +class RunScoringTask(BaseTask): + """ + RunScoringTask - only run scoring (F3) based on dataset and scoring config + """ + + def __init__( + self, + *args, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + + def transform_score_input_sample( + self, dataset: BaseDataset + ) -> List[ScorerInputSample]: + scorer_inputs = [] + for x in dataset: + expected_answer = x.data["expected_answer"] + generated_answer = x.data["generated_answer"] + input_query = None + if "input_query" in x.data: + input_query = x.data["input_query"] + + scorer_inputs.append( + ScorerInputSample( + expected_answer=expected_answer, + generated_answer=generated_answer, + input_query=input_query, + ) + ) + + return scorer_inputs + + async def run( + self, + dataset_config: EvaluateDatasetConfig, + eval_scoring_config: EvaluateScoringConfig, + inference_api: Inference, + *args, + **kwargs, + ) -> EvalResult: + print( + f"Running scoring task w/ dataset={dataset_config} scoring={eval_scoring_config}" + ) + + dataset = DatasetRegistry.get(dataset_config.dataset_identifier) + dataset.load(n_samples=dataset_config.row_limit) + print(f"Running on {len(dataset)} samples") + + # transform dataset into List[ScorerInputSample] + postprocessed = self.transform_score_input_sample(dataset) + + # F3 - scorer + scorer_config_list = eval_scoring_config.scorer_config_list + scorer_list = [] + for s_conf in scorer_config_list: + scorer = ScorerRegistry.get(s_conf.scorer_name) + if s_conf.llm_judge_config: + scorer_list.append( + scorer( + llm_judge_config=s_conf.llm_judge_config, + inference_api=inference_api, + ) + ) + else: + scorer_list.append(scorer()) + + scorer = AggregateScorer( + scorers=scorer_list, + ) + + scorer_results = scorer.score(postprocessed) + eval_result = scorer.aggregate_results(scorer_results) + + return eval_result diff --git a/llama_stack/providers/impls/third_party/evals/__init__.py b/llama_stack/providers/impls/third_party/evals/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py b/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py new file mode 100644 index 0000000000..9886ed6d6c --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
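+# Provider entry point: constructs the EleutherEvalsAdapter with its Inference API
+# dependency and initializes it.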
+ +from .config import EleutherEvalsImplConfig # noqa +from llama_stack.apis.inference import * # noqa: F403 +from llama_stack.distribution.datatypes import Api, ProviderSpec + + +async def get_provider_impl( + config: EleutherEvalsImplConfig, deps: Dict[Api, ProviderSpec] +): + from .eleuther import EleutherEvalsAdapter + + impl = EleutherEvalsAdapter(config, deps[Api.inference]) + await impl.initialize() + return impl diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/config.py b/llama_stack/providers/impls/third_party/evals/eleuther/config.py new file mode 100644 index 0000000000..a9ab297b42 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/config.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pydantic import BaseModel + + +class EleutherEvalsImplConfig(BaseModel): ... diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py new file mode 100644 index 0000000000..e4b32a45e0 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py @@ -0,0 +1,170 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import asyncio +from llama_stack.apis.inference import * # noqa: F403 +from llama_stack.apis.evals import * # noqa: F403 +import os +import random +import threading +from pathlib import Path + +import lm_eval +import tqdm +from lm_eval.api.model import LM +from lm_eval.evaluator import evaluate, get_task_list +from lm_eval.tasks import get_task_dict, TaskManager +from termcolor import cprint + +from .config import EleutherEvalsImplConfig + + +# https://stackoverflow.com/questions/74703727/how-to-call-async-function-from-sync-funcion-and-get-result-while-a-loop-is-alr +# We will use another thread wih its own event loop to run the async api within sync function +_loop = asyncio.new_event_loop() +_thr = threading.Thread(target=_loop.run_forever, name="Async Runner", daemon=True) + + +class EleutherEvalsWrapper(LM): + def __init__( + self, + inference_api: Inference, + model: str, + **kwargs, + ): + super().__init__(**kwargs) + self.inference_api = inference_api + self.model = model + self.tokenizer = None + self.tokenized_requests = False + self.kwargs = kwargs + + @property + def eot_token_id(self): + raise NotImplementedError("Not implemented") + + @property + def max_length(self) -> int: + return NotImplementedError("Not implemented") + + @property + def max_gen_toks(self) -> int: + return NotImplementedError("Not implemented") + + @property + def batch_size(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError("No support for logits.") + + @property + def device(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError("No support for logits.") + + @property + def world_size(self): + return 1 + + def tok_encode(self, string: str) -> List[int]: + return NotImplementedError("Not implemented") + + def tok_decode(self, tokens: List[int]) -> str: + return NotImplementedError("Not implemented") + + def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("No support for logits.") + + def 
_model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override generate_until + raise NotImplementedError() + + def loglikelihood(self, requests, disable_tqdm: bool = False): + # TODO: implement inference completion with loglikelihood + res = [] + for req in requests: + res.append((-random.random(), False)) + + return res + + def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("No support for logits.") + + def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: + res = [] + if not _thr.is_alive(): + _thr.start() + for req in tqdm.tqdm(requests): + chat_completion_coro_fn = self.inference_api.chat_completion( + model=self.model, + messages=[ + { + "role": "user", + "content": req.args[0], + } + ], + stream=False, + ) + future = asyncio.run_coroutine_threadsafe(chat_completion_coro_fn, _loop) + response = future.result() + res.append(response.completion_message.content) + + return res + + +class EleutherEvalsAdapter(Evals): + def __init__(self, config: EleutherEvalsImplConfig, inference_api: Inference): + self.inference_api = inference_api + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def run_evals( + self, + model: str, + task: str, + dataset: Optional[str] = None, + eval_task_config: Optional[EvaluateTaskConfig] = None, + ) -> EvaluateResponse: + cprint(f"Eleuther Evals: {model} {dataset} {task}", "red") + + eluther_wrapper = EleutherEvalsWrapper(self.inference_api, model) + current_dir = Path(os.path.dirname(os.path.abspath(__file__))) + + # custom registry of harness tasks + task_manager = TaskManager( + include_path=str(current_dir / "tasks"), + ) + + task_dict = get_task_dict(task, task_manager) + cprint(task_dict, "blue") + + task_types = set([t.task.OUTPUT_TYPE for t in get_task_list(task_dict)]) + cprint(task_types, "cyan") + + output = evaluate( + eluther_wrapper, + task_dict, + limit=eval_task_config.n_samples, + ) + + eval_result = EvalResult( + metrics={}, + ) + formatted_output = lm_eval.utils.make_table(output) + + cprint(formatted_output, "green") + + return EvaluateResponse( + eval_result=eval_result, + formatted_report=formatted_output, + ) diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml new file mode 100644 index 0000000000..e10277a314 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml @@ -0,0 +1,32 @@ +task: meta_ifeval +dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals +dataset_name: Llama-3.1-8B-Instruct-evals__ifeval__strict__details +output_type: generate_until +test_split: latest +process_docs: !function utils.process_docs +num_fewshot: 0 +doc_to_text: prompt +doc_to_target: 0 +generation_kwargs: + until: [] + do_sample: false + temperature: 0.0 + max_gen_toks: 1280 +process_results: !function utils.process_results +metric_list: + - metric: prompt_level_strict_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_strict_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true + - metric: prompt_level_loose_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_loose_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true 
+metadata: + version: 2.0 +fewshot_config: + sampler: first_n diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py new file mode 100644 index 0000000000..aa171343fd --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py @@ -0,0 +1,191 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import dataclasses +from typing import Dict, Optional, Union + +import datasets + +from lm_eval.tasks.ifeval import instructions_registry + + +@dataclasses.dataclass +class InputExample: + key: int + instruction_id_list: list[str] + prompt: str + kwargs: list[Dict[str, Optional[Union[str, int]]]] + + +@dataclasses.dataclass +class OutputExample: + instruction_id_list: list[str] + prompt: str + response: str + follow_all_instructions: bool + follow_instruction_list: list[bool] + + +def test_instruction_following_strict( + inp, + response, +): + """Tests response to see if instructions are followed.""" + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. + kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + if response.strip() and instruction.check_following(response): + is_following_list.append(True) + else: + is_following_list.append(False) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def test_instruction_following_loose( + inp, + response, +): + """Tests response for an upper bound for following instructions.""" + r = response.split("\n") + response_remove_first = "\n".join(r[1:]).strip() + response_remove_last = "\n".join(r[:-1]).strip() + response_remove_both = "\n".join(r[1:-1]).strip() + revised_response = response.replace("*", "") + revised_response_remove_first = response_remove_first.replace("*", "") + revised_response_remove_last = response_remove_last.replace("*", "") + revised_response_remove_both = response_remove_both.replace("*", "") + all_responses = [ + response, + revised_response, + response_remove_first, + response_remove_last, + response_remove_both, + revised_response_remove_first, + revised_response_remove_last, + revised_response_remove_both, + ] + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. 
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and "prompt" in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        is_following = False
+        for r in all_responses:
+            if r.strip() and instruction.check_following(r):
+                is_following = True
+                break
+
+        is_following_list.append(is_following)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def process_results(doc, results):
+    new_kwargs = []
+    for item in doc["kwargs"]:
+        if item["nth_paragraph"]:
+            item["nth_paragraph"] = int(item["nth_paragraph"])
+        new_kwargs.append(item)
+    inp = InputExample(
+        key=doc["key"],
+        instruction_id_list=doc["instruction_id_list"],
+        prompt=doc["prompt"],
+        kwargs=new_kwargs,
+    )
+    response = results[0]
+
+    out_strict = test_instruction_following_strict(inp, response)
+    out_loose = test_instruction_following_loose(inp, response)
+
+    return {
+        "prompt_level_strict_acc": out_strict.follow_all_instructions,
+        "inst_level_strict_acc": out_strict.follow_instruction_list,
+        "prompt_level_loose_acc": out_loose.follow_all_instructions,
+        "inst_level_loose_acc": out_loose.follow_instruction_list,
+    }
+
+
+def agg_inst_level_acc(items):
+    flat_items = [item for sublist in items for item in sublist]
+    inst_level_acc = sum(flat_items) / len(flat_items)
+    return inst_level_acc
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _get_question(example: dict) -> dict:
+        # get the question from the ifeval dataset
+        example["input_question"] = (
+            eval(
+                example["input_question"]
+                .replace("null", "None")
+                .replace("true", "True")
+                .replace("false", "False")
+            )["dialog"][0]["body"]
+            .replace("Is it True that the first song", "Is it true that the first song")
+            .replace("Is the following True", "Is the following true")
+        )
+        example["input_final_prompts"] = example["input_final_prompts"][0]
+        return example
+
+    original_dataset_name = "wis-k/instruction-following-eval"
+    ifeval_data = datasets.load_dataset(original_dataset_name, split="train")
+    ifeval_df = ifeval_data.to_pandas()
+    ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
+
+    meta_dataset = dataset.map(_get_question)
+    meta_df = meta_dataset.to_pandas()
+
+    # join the two datasets on the input_question column
+    joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question")
+    joined = joined.rename(columns={"input_final_prompts": "prompt"})
+    joined = joined.rename(columns={"is_correct": "previous_is_correct"})
+    joined = datasets.Dataset.from_pandas(joined)
+    joined = joined.select_columns(
+        [
+            "input_question",
+            "prompt",
+            "previous_is_correct",
+            "instruction_id_list",
+            "kwargs",
+            "output_prediction_text",
+            "key",
+        ]
+    )
+    joined = joined.rename_column(
+        "output_prediction_text", "previous_output_prediction_text"
+    )
+    return joined
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml
new file mode 100644
index 0000000000..1ec3c107d8
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml
@@ -0,0 +1,29 @@
+task: meta_mmlu_pro_instruct
+dataset_path: 
meta-llama/Llama-3.1-8B-Instruct-evals +dataset_name: Llama-3.1-8B-Instruct-evals__mmlu_pro__details +test_split: latest +output_type: generate_until +process_docs: !function utils.process_docs +doc_to_text: !function utils.doc_to_text +doc_to_target: gold +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: 'best answer is ([A-Z])' + - function: "take_first" +generation_kwargs: + until: [] + do_sample: false + temperature: 0 + max_gen_toks: 1024 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py new file mode 100644 index 0000000000..6b8bc3e5b2 --- /dev/null +++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import datasets + + +def doc_to_text(doc: dict) -> str: + return doc["input_final_prompts"][0] + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "gold": doc["input_correct_responses"][0], + } + return out_doc + + dataset = dataset.select_columns( + [ + "input_question", + "input_correct_responses", + "input_final_prompts", + "is_correct", + "input_question_hash", + "input_choice_list", + "output_prediction_text", + ], + ) + dataset = dataset.rename_column("is_correct", "previously_is_correct") + dataset = dataset.map(_process_doc) + return dataset diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py new file mode 100644 index 0000000000..a8a7e735ff --- /dev/null +++ b/llama_stack/providers/registry/evals.py @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
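+# Provider registry for the evals API: the inline meta-reference implementation and
+# the third-party EleutherAI lm-eval adapter, both of which depend on the Inference API.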
+ +from typing import List + +from llama_stack.distribution.datatypes import * # noqa: F403 + + +def available_providers() -> List[ProviderSpec]: + return [ + InlineProviderSpec( + api=Api.evals, + provider_type="meta-reference", + pip_packages=[ + "matplotlib", + "pillow", + "pandas", + "scikit-learn", + "datasets", + "numpy", + "autoevals", + "openpyxl", + ], + module="llama_stack.providers.impls.meta_reference.evals", + config_class="llama_stack.providers.impls.meta_reference.evals.MetaReferenceEvalsImplConfig", + api_dependencies=[ + Api.inference, + ], + ), + InlineProviderSpec( + api=Api.evals, + provider_type="eleuther", + pip_packages=[ + "lm-eval", + ], + module="llama_stack.providers.impls.third_party.evals.eleuther", + config_class="llama_stack.providers.impls.third_party.evals.eleuther.EleutherEvalsImplConfig", + api_dependencies=[ + Api.inference, + ], + ), + ] diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index 9fffc0f99a..2070649043 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -152,7 +152,7 @@ def severity(levelname: str) -> LogSeverity: elif levelname == "INFO": return LogSeverity.INFO elif levelname == "WARNING": - return LogSeverity.WARNING + return LogSeverity.WARN elif levelname == "ERROR": return LogSeverity.ERROR elif levelname == "CRITICAL": diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml index e12f6e8528..31fb726708 100644 --- a/tests/examples/local-run.yaml +++ b/tests/examples/local-run.yaml @@ -11,16 +11,26 @@ apis: - memory_banks - inference - safety +- evals +- datasets providers: - inference: + evals: - provider_id: meta-reference provider_type: meta-reference + config: {} + inference: + - provider_id: remote::tgi + provider_type: remote::tgi config: - model: Llama3.1-8B-Instruct - quantization: null - torch_seed: null - max_seq_len: 4096 - max_batch_size: 1 + url: http://127.0.0.1:5009 + # - provider_id: meta-reference + # provider_type: meta-reference + # config: + # model: Llama3.1-8B-Instruct + # quantization: null + # torch_seed: null + # max_seq_len: 4096 + # max_batch_size: 1 safety: - provider_id: meta-reference provider_type: meta-reference