diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py
index 871c01a80f..994b06e583 100644
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@@ -33,7 +33,7 @@
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.agents import * # noqa: F403
-from llama_stack.apis.dataset import * # noqa: F403
+from llama_stack.apis.datasets import * # noqa: F403
from llama_stack.apis.evals import * # noqa: F403
from llama_stack.apis.inference import * # noqa: F403
from llama_stack.apis.batch_inference import * # noqa: F403
@@ -61,7 +61,7 @@ class LlamaStack(
Telemetry,
PostTraining,
Memory,
- Evaluations,
+ Evals,
Models,
Shields,
Inspect,
diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html
index a2f92b6e42..7ce99db3a7 100644
--- a/docs/resources/llama-stack-spec.html
+++ b/docs/resources/llama-stack-spec.html
@@ -21,7 +21,7 @@
"info": {
"title": "[DRAFT] Llama Stack Specification",
"version": "0.0.1",
- "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-10 15:29:56.831109"
+ "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-15 10:20:19.984531"
},
"servers": [
{
@@ -109,39 +109,6 @@
}
}
},
- "/evaluate/job/cancel": {
- "post": {
- "responses": {
- "200": {
- "description": "OK"
- }
- },
- "tags": [
- "Evaluations"
- ],
- "parameters": [
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/CancelEvaluationJobRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/post_training/job/cancel": {
"post": {
"responses": {
@@ -393,7 +360,14 @@
"post": {
"responses": {
"200": {
- "description": "OK"
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/CreateDatasetResponse"
+ }
+ }
+ }
}
},
"tags": [
@@ -489,119 +463,6 @@
}
},
"/datasets/delete": {
- "post": {
- "responses": {
- "200": {
- "description": "OK"
- }
- },
- "tags": [
- "Datasets"
- ],
- "parameters": [
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/DeleteDatasetRequest"
- }
- }
- },
- "required": true
- }
- }
- },
- "/inference/embeddings": {
- "post": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EmbeddingsResponse"
- }
- }
- }
- }
- },
- "tags": [
- "Inference"
- ],
- "parameters": [
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EmbeddingsRequest"
- }
- }
- },
- "required": true
- }
- }
- },
- "/evaluate/question_answering/": {
- "post": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EvaluationJob"
- }
- }
- }
- }
- },
- "tags": [
- "Evaluations"
- ],
- "parameters": [
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EvaluateQuestionAnsweringRequest"
- }
- }
- },
- "required": true
- }
- }
- },
- "/evaluate/summarization/": {
"post": {
"responses": {
"200": {
@@ -609,14 +470,14 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluationJob"
+ "$ref": "#/components/schemas/DeleteDatasetResponse"
}
}
}
}
},
"tags": [
- "Evaluations"
+ "Datasets"
],
"parameters": [
{
@@ -633,7 +494,7 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluateSummarizationRequest"
+ "$ref": "#/components/schemas/DeleteDatasetRequest"
}
}
},
@@ -641,7 +502,7 @@
}
}
},
- "/evaluate/text_generation/": {
+ "/inference/embeddings": {
"post": {
"responses": {
"200": {
@@ -649,14 +510,14 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluationJob"
+ "$ref": "#/components/schemas/EmbeddingsResponse"
}
}
}
}
},
"tags": [
- "Evaluations"
+ "Inference"
],
"parameters": [
{
@@ -673,7 +534,7 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluateTextGenerationRequest"
+ "$ref": "#/components/schemas/EmbeddingsRequest"
}
}
},
@@ -845,7 +706,21 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/TrainEvalDataset"
+ "oneOf": [
+ {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/HuggingfaceDatasetDef"
+ },
+ {
+ "$ref": "#/components/schemas/CustomDatasetDef"
+ }
+ ]
+ },
+ {
+ "type": "null"
+ }
+ ]
}
}
}
@@ -856,7 +731,7 @@
],
"parameters": [
{
- "name": "dataset_uuid",
+ "name": "dataset_identifier",
"in": "query",
"required": true,
"schema": {
@@ -875,7 +750,7 @@
]
}
},
- "/evaluate/job/artifacts": {
+ "/memory_banks/get": {
"get": {
"responses": {
"200": {
@@ -883,18 +758,38 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluationJobArtifactsResponse"
+ "oneOf": [
+ {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/VectorMemoryBankDef"
+ },
+ {
+ "$ref": "#/components/schemas/KeyValueMemoryBankDef"
+ },
+ {
+ "$ref": "#/components/schemas/KeywordMemoryBankDef"
+ },
+ {
+ "$ref": "#/components/schemas/GraphMemoryBankDef"
+ }
+ ]
+ },
+ {
+ "type": "null"
+ }
+ ]
}
}
}
}
},
"tags": [
- "Evaluations"
+ "MemoryBanks"
],
"parameters": [
{
- "name": "job_uuid",
+ "name": "identifier",
"in": "query",
"required": true,
"schema": {
@@ -913,7 +808,7 @@
]
}
},
- "/evaluate/job/logs": {
+ "/models/get": {
"get": {
"responses": {
"200": {
@@ -921,18 +816,25 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluationJobLogStream"
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/ModelDefWithProvider"
+ },
+ {
+ "type": "null"
+ }
+ ]
}
}
}
}
},
"tags": [
- "Evaluations"
+ "Models"
],
"parameters": [
{
- "name": "job_uuid",
+ "name": "identifier",
"in": "query",
"required": true,
"schema": {
@@ -951,7 +853,7 @@
]
}
},
- "/evaluate/job/status": {
+ "/shields/get": {
"get": {
"responses": {
"200": {
@@ -959,18 +861,25 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluationJobStatusResponse"
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/ShieldDefWithProvider"
+ },
+ {
+ "type": "null"
+ }
+ ]
}
}
}
}
},
"tags": [
- "Evaluations"
+ "Shields"
],
"parameters": [
{
- "name": "job_uuid",
+ "name": "shield_type",
"in": "query",
"required": true,
"schema": {
@@ -989,24 +898,32 @@
]
}
},
- "/evaluate/jobs": {
+ "/telemetry/get_trace": {
"get": {
"responses": {
"200": {
"description": "OK",
"content": {
- "application/jsonl": {
+ "application/json": {
"schema": {
- "$ref": "#/components/schemas/EvaluationJob"
+ "$ref": "#/components/schemas/Trace"
}
}
}
}
},
"tags": [
- "Evaluations"
+ "Telemetry"
],
"parameters": [
+ {
+ "name": "trace_id",
+ "in": "query",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
{
"name": "X-LlamaStack-ProviderData",
"in": "header",
@@ -1019,7 +936,7 @@
]
}
},
- "/memory_banks/get": {
+ "/post_training/job/artifacts": {
"get": {
"responses": {
"200": {
@@ -1027,204 +944,18 @@
"content": {
"application/json": {
"schema": {
- "oneOf": [
- {
- "oneOf": [
- {
- "$ref": "#/components/schemas/VectorMemoryBankDef"
- },
- {
- "$ref": "#/components/schemas/KeyValueMemoryBankDef"
- },
- {
- "$ref": "#/components/schemas/KeywordMemoryBankDef"
- },
- {
- "$ref": "#/components/schemas/GraphMemoryBankDef"
- }
- ]
- },
- {
- "type": "null"
- }
- ]
+ "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse"
}
}
}
}
},
"tags": [
- "MemoryBanks"
+ "PostTraining"
],
"parameters": [
{
- "name": "identifier",
- "in": "query",
- "required": true,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
- "/models/get": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ModelDefWithProvider"
- },
- {
- "type": "null"
- }
- ]
- }
- }
- }
- }
- },
- "tags": [
- "Models"
- ],
- "parameters": [
- {
- "name": "identifier",
- "in": "query",
- "required": true,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
- "/shields/get": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ShieldDefWithProvider"
- },
- {
- "type": "null"
- }
- ]
- }
- }
- }
- }
- },
- "tags": [
- "Shields"
- ],
- "parameters": [
- {
- "name": "shield_type",
- "in": "query",
- "required": true,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
- "/telemetry/get_trace": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/Trace"
- }
- }
- }
- }
- },
- "tags": [
- "Telemetry"
- ],
- "parameters": [
- {
- "name": "trace_id",
- "in": "query",
- "required": true,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "X-LlamaStack-ProviderData",
- "in": "header",
- "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
- "required": false,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
- "/post_training/job/artifacts": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse"
- }
- }
- }
- }
- },
- "tags": [
- "PostTraining"
- ],
- "parameters": [
- {
- "name": "job_uuid",
+ "name": "job_uuid",
"in": "query",
"required": true,
"schema": {
@@ -1412,6 +1143,43 @@
}
}
},
+ "/datasets/list": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/jsonl": {
+ "schema": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/HuggingfaceDatasetDef"
+ },
+ {
+ "$ref": "#/components/schemas/CustomDatasetDef"
+ }
+ ]
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Datasets"
+ ],
+ "parameters": [
+ {
+ "name": "X-LlamaStack-ProviderData",
+ "in": "header",
+ "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
+ "required": false,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ },
"/memory_banks/list": {
"get": {
"responses": {
@@ -1836,7 +1604,7 @@
}
}
},
- "/safety/run_shield": {
+ "/evals/run_eval_task": {
"post": {
"responses": {
"200": {
@@ -1844,14 +1612,14 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/RunShieldResponse"
+ "$ref": "#/components/schemas/EvaluateResponse"
}
}
}
}
},
"tags": [
- "Safety"
+ "Evals"
],
"parameters": [
{
@@ -1868,7 +1636,7 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/RunShieldRequest"
+ "$ref": "#/components/schemas/RunEvalTaskRequest"
}
}
},
@@ -1876,7 +1644,7 @@
}
}
},
- "/post_training/supervised_fine_tune": {
+ "/evals/run_scorer": {
"post": {
"responses": {
"200": {
@@ -1884,14 +1652,14 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/PostTrainingJob"
+ "$ref": "#/components/schemas/EvaluateResponse"
}
}
}
}
},
"tags": [
- "PostTraining"
+ "Evals"
],
"parameters": [
{
@@ -1908,7 +1676,7 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/SupervisedFineTuneRequest"
+ "$ref": "#/components/schemas/RunScorerRequest"
}
}
},
@@ -1916,7 +1684,7 @@
}
}
},
- "/synthetic_data_generation/generate": {
+ "/safety/run_shield": {
"post": {
"responses": {
"200": {
@@ -1924,14 +1692,14 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/SyntheticDataGenerationResponse"
+ "$ref": "#/components/schemas/RunShieldResponse"
}
}
}
}
},
"tags": [
- "SyntheticDataGeneration"
+ "Safety"
],
"parameters": [
{
@@ -1948,54 +1716,134 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/SyntheticDataGenerateRequest"
+ "$ref": "#/components/schemas/RunShieldRequest"
}
}
},
"required": true
}
}
- }
- },
- "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema",
- "components": {
- "schemas": {
- "BuiltinTool": {
- "type": "string",
- "enum": [
- "brave_search",
- "wolfram_alpha",
- "photogen",
- "code_interpreter"
- ]
- },
- "CompletionMessage": {
- "type": "object",
- "properties": {
- "role": {
- "type": "string",
- "const": "assistant",
- "default": "assistant"
- },
- "content": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "$ref": "#/components/schemas/ImageMedia"
- },
- {
- "type": "array",
- "items": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "$ref": "#/components/schemas/ImageMedia"
- }
- ]
+ },
+ "/post_training/supervised_fine_tune": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/PostTrainingJob"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "PostTraining"
+ ],
+ "parameters": [
+ {
+ "name": "X-LlamaStack-ProviderData",
+ "in": "header",
+ "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
+ "required": false,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/SupervisedFineTuneRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
+ "/synthetic_data_generation/generate": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/SyntheticDataGenerationResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "SyntheticDataGeneration"
+ ],
+ "parameters": [
+ {
+ "name": "X-LlamaStack-ProviderData",
+ "in": "header",
+ "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
+ "required": false,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/SyntheticDataGenerateRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ }
+ },
+ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema",
+ "components": {
+ "schemas": {
+ "BuiltinTool": {
+ "type": "string",
+ "enum": [
+ "brave_search",
+ "wolfram_alpha",
+ "photogen",
+ "code_interpreter"
+ ]
+ },
+ "CompletionMessage": {
+ "type": "object",
+ "properties": {
+ "role": {
+ "type": "string",
+ "const": "assistant",
+ "default": "assistant"
+ },
+ "content": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "$ref": "#/components/schemas/ImageMedia"
+ },
+ {
+ "type": "array",
+ "items": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "$ref": "#/components/schemas/ImageMedia"
+ }
+ ]
}
}
]
@@ -2571,18 +2419,6 @@
"completion_message_batch"
]
},
- "CancelEvaluationJobRequest": {
- "type": "object",
- "properties": {
- "job_uuid": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "job_uuid"
- ]
- },
"CancelTrainingJobRequest": {
"type": "object",
"properties": {
@@ -4090,19 +3926,58 @@
"error"
]
},
- "TrainEvalDataset": {
+ "CustomDatasetDef": {
"type": "object",
"properties": {
- "columns": {
+ "type": {
+ "type": "string",
+ "const": "custom",
+ "default": "custom"
+ },
+ "identifier": {
+ "type": "string"
+ },
+ "url": {
+ "type": "string"
+ },
+ "rename_columns_map": {
"type": "object",
"additionalProperties": {
- "$ref": "#/components/schemas/TrainEvalDatasetColumnType"
+ "type": "string"
}
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "identifier",
+ "url"
+ ]
+ },
+ "HuggingfaceDatasetDef": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "huggingface",
+ "default": "huggingface"
},
- "content_url": {
- "$ref": "#/components/schemas/URL"
+ "identifier": {
+ "type": "string"
},
- "metadata": {
+ "dataset_path": {
+ "type": "string"
+ },
+ "dataset_name": {
+ "type": "string"
+ },
+ "rename_columns_map": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "string"
+ }
+ },
+ "kwargs": {
"type": "object",
"additionalProperties": {
"oneOf": [
@@ -4130,35 +4005,48 @@
},
"additionalProperties": false,
"required": [
- "columns",
- "content_url"
- ],
- "title": "Dataset to be used for training or evaluating language models."
- },
- "TrainEvalDatasetColumnType": {
- "type": "string",
- "enum": [
- "dialog",
- "text",
- "media",
- "number",
- "json"
+ "type",
+ "identifier",
+ "dataset_path",
+ "kwargs"
]
},
"CreateDatasetRequest": {
"type": "object",
"properties": {
- "uuid": {
- "type": "string"
+ "dataset_def": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/HuggingfaceDatasetDef"
+ },
+ {
+ "$ref": "#/components/schemas/CustomDatasetDef"
+ }
+ ]
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dataset_def"
+ ]
+ },
+ "CreateDatasetResponse": {
+ "type": "object",
+ "properties": {
+ "status": {
+ "type": "string",
+ "enum": [
+ "success",
+ "fail"
+ ]
},
- "dataset": {
- "$ref": "#/components/schemas/TrainEvalDataset"
+ "msg": {
+ "type": "string"
}
},
"additionalProperties": false,
"required": [
- "uuid",
- "dataset"
+ "status"
]
},
"DeleteAgentsRequest": {
@@ -4192,13 +4080,32 @@
"DeleteDatasetRequest": {
"type": "object",
"properties": {
- "dataset_uuid": {
+ "dataset_identifier": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dataset_identifier"
+ ]
+ },
+ "DeleteDatasetResponse": {
+ "type": "object",
+ "properties": {
+ "status": {
+ "type": "string",
+ "enum": [
+ "success",
+ "fail"
+ ]
+ },
+ "msg": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
- "dataset_uuid"
+ "status"
]
},
"EmbeddingsRequest": {
@@ -4258,112 +4165,42 @@
"embeddings"
]
},
- "EvaluateQuestionAnsweringRequest": {
+ "GetAgentsSessionRequest": {
"type": "object",
"properties": {
- "metrics": {
+ "turn_ids": {
"type": "array",
"items": {
- "type": "string",
- "enum": [
- "em",
- "f1"
- ]
+ "type": "string"
}
}
},
- "additionalProperties": false,
- "required": [
- "metrics"
- ]
+ "additionalProperties": false
},
- "EvaluationJob": {
+ "GraphMemoryBankDef": {
"type": "object",
"properties": {
- "job_uuid": {
+ "identifier": {
"type": "string"
+ },
+ "provider_id": {
+ "type": "string",
+ "default": ""
+ },
+ "type": {
+ "type": "string",
+ "const": "graph",
+ "default": "graph"
}
},
"additionalProperties": false,
"required": [
- "job_uuid"
+ "identifier",
+ "provider_id",
+ "type"
]
},
- "EvaluateSummarizationRequest": {
- "type": "object",
- "properties": {
- "metrics": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "rouge",
- "bleu"
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "metrics"
- ]
- },
- "EvaluateTextGenerationRequest": {
- "type": "object",
- "properties": {
- "metrics": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "perplexity",
- "rouge",
- "bleu"
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "metrics"
- ]
- },
- "GetAgentsSessionRequest": {
- "type": "object",
- "properties": {
- "turn_ids": {
- "type": "array",
- "items": {
- "type": "string"
- }
- }
- },
- "additionalProperties": false
- },
- "GraphMemoryBankDef": {
- "type": "object",
- "properties": {
- "identifier": {
- "type": "string"
- },
- "provider_id": {
- "type": "string",
- "default": ""
- },
- "type": {
- "type": "string",
- "const": "graph",
- "default": "graph"
- }
- },
- "additionalProperties": false,
- "required": [
- "identifier",
- "provider_id",
- "type"
- ]
- },
- "KeyValueMemoryBankDef": {
+ "KeyValueMemoryBankDef": {
"type": "object",
"properties": {
"identifier": {
@@ -4513,43 +4350,6 @@
"step"
]
},
- "EvaluationJobArtifactsResponse": {
- "type": "object",
- "properties": {
- "job_uuid": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "job_uuid"
- ],
- "title": "Artifacts of a evaluation job."
- },
- "EvaluationJobLogStream": {
- "type": "object",
- "properties": {
- "job_uuid": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "job_uuid"
- ]
- },
- "EvaluationJobStatusResponse": {
- "type": "object",
- "properties": {
- "job_uuid": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "job_uuid"
- ]
- },
"ModelDefWithProvider": {
"type": "object",
"properties": {
@@ -5265,6 +5065,61 @@
"dpo"
]
},
+ "TrainEvalDataset": {
+ "type": "object",
+ "properties": {
+ "columns": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/TrainEvalDatasetColumnType"
+ }
+ },
+ "content_url": {
+ "$ref": "#/components/schemas/URL"
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "columns",
+ "content_url"
+ ],
+ "title": "Dataset to be used for training or evaluating language models."
+ },
+ "TrainEvalDatasetColumnType": {
+ "type": "string",
+ "enum": [
+ "dialog",
+ "text",
+ "media",
+ "number",
+ "json"
+ ]
+ },
"TrainingConfig": {
"type": "object",
"properties": {
@@ -5491,222 +5346,520 @@
"document_id": {
"type": "string"
}
- },
- "additionalProperties": false,
- "required": [
- "content",
- "token_count",
- "document_id"
+ },
+ "additionalProperties": false,
+ "required": [
+ "content",
+ "token_count",
+ "document_id"
+ ]
+ }
+ },
+ "scores": {
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "chunks",
+ "scores"
+ ]
+ },
+ "RegisterMemoryBankRequest": {
+ "type": "object",
+ "properties": {
+ "memory_bank": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/VectorMemoryBankDef"
+ },
+ {
+ "$ref": "#/components/schemas/KeyValueMemoryBankDef"
+ },
+ {
+ "$ref": "#/components/schemas/KeywordMemoryBankDef"
+ },
+ {
+ "$ref": "#/components/schemas/GraphMemoryBankDef"
+ }
+ ]
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "memory_bank"
+ ]
+ },
+ "RegisterModelRequest": {
+ "type": "object",
+ "properties": {
+ "model": {
+ "$ref": "#/components/schemas/ModelDefWithProvider"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "model"
+ ]
+ },
+ "RegisterShieldRequest": {
+ "type": "object",
+ "properties": {
+ "shield": {
+ "$ref": "#/components/schemas/ShieldDefWithProvider"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "shield"
+ ]
+ },
+ "DialogGenerations": {
+ "type": "object",
+ "properties": {
+ "dialog": {
+ "type": "array",
+ "items": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/UserMessage"
+ },
+ {
+ "$ref": "#/components/schemas/SystemMessage"
+ },
+ {
+ "$ref": "#/components/schemas/ToolResponseMessage"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionMessage"
+ }
+ ]
+ }
+ },
+ "sampled_generations": {
+ "type": "array",
+ "items": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/UserMessage"
+ },
+ {
+ "$ref": "#/components/schemas/SystemMessage"
+ },
+ {
+ "$ref": "#/components/schemas/ToolResponseMessage"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionMessage"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dialog",
+ "sampled_generations"
+ ]
+ },
+ "RewardScoreRequest": {
+ "type": "object",
+ "properties": {
+ "dialog_generations": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/DialogGenerations"
+ }
+ },
+ "model": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dialog_generations",
+ "model"
+ ]
+ },
+ "RewardScoringResponse": {
+ "type": "object",
+ "properties": {
+ "scored_generations": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ScoredDialogGenerations"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "scored_generations"
+ ],
+ "title": "Response from the reward scoring. Batch of (prompt, response, score) tuples that pass the threshold."
+ },
+ "ScoredDialogGenerations": {
+ "type": "object",
+ "properties": {
+ "dialog": {
+ "type": "array",
+ "items": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/UserMessage"
+ },
+ {
+ "$ref": "#/components/schemas/SystemMessage"
+ },
+ {
+ "$ref": "#/components/schemas/ToolResponseMessage"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionMessage"
+ }
+ ]
+ }
+ },
+ "scored_generations": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ScoredMessage"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dialog",
+ "scored_generations"
+ ]
+ },
+ "ScoredMessage": {
+ "type": "object",
+ "properties": {
+ "message": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/UserMessage"
+ },
+ {
+ "$ref": "#/components/schemas/SystemMessage"
+ },
+ {
+ "$ref": "#/components/schemas/ToolResponseMessage"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionMessage"
+ }
+ ]
+ },
+ "score": {
+ "type": "number"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "message",
+ "score"
+ ]
+ },
+ "EvaluateDatasetConfig": {
+ "type": "object",
+ "properties": {
+ "dataset_identifier": {
+ "type": "string"
+ },
+ "row_limit": {
+ "type": "integer"
+ },
+ "kwargs": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dataset_identifier"
+ ]
+ },
+ "EvaluateJudgeScoringConfig": {
+ "type": "object"
+ },
+ "EvaluateModelGenerationConfig": {
+ "type": "object",
+ "properties": {
+ "model": {
+ "type": "string"
+ },
+ "sampling_params": {
+ "$ref": "#/components/schemas/SamplingParams"
+ },
+ "kwargs": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "model",
+ "sampling_params"
+ ]
+ },
+ "EvaluatePostprocessConfig": {
+ "type": "object",
+ "properties": {
+ "kwargs": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false
+ },
+ "EvaluatePreprocessConfig": {
+ "type": "object",
+ "properties": {
+ "kwargs": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
]
}
+ }
+ },
+ "additionalProperties": false
+ },
+ "EvaluateProcessorConfig": {
+ "type": "object",
+ "properties": {
+ "processor_identifier": {
+ "type": "string"
},
- "scores": {
- "type": "array",
- "items": {
- "type": "number"
- }
+ "preprocess_config": {
+ "$ref": "#/components/schemas/EvaluatePreprocessConfig"
+ },
+ "postprocess_config": {
+ "$ref": "#/components/schemas/EvaluatePostprocessConfig"
}
},
"additionalProperties": false,
"required": [
- "chunks",
- "scores"
+ "processor_identifier"
]
},
- "RegisterMemoryBankRequest": {
+ "EvaluateScoringConfig": {
"type": "object",
"properties": {
- "memory_bank": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/VectorMemoryBankDef"
- },
- {
- "$ref": "#/components/schemas/KeyValueMemoryBankDef"
- },
- {
- "$ref": "#/components/schemas/KeywordMemoryBankDef"
- },
- {
- "$ref": "#/components/schemas/GraphMemoryBankDef"
- }
- ]
+ "scorer_config_list": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/EvaluateSingleScorerConfig"
+ }
}
},
"additionalProperties": false,
"required": [
- "memory_bank"
+ "scorer_config_list"
]
},
- "RegisterModelRequest": {
+ "EvaluateSingleScorerConfig": {
"type": "object",
"properties": {
- "model": {
- "$ref": "#/components/schemas/ModelDefWithProvider"
+ "scorer_name": {
+ "type": "string"
+ },
+ "llm_judge_config": {
+ "$ref": "#/components/schemas/LLMJudgeConfig"
}
},
"additionalProperties": false,
"required": [
- "model"
+ "scorer_name"
]
},
- "RegisterShieldRequest": {
+ "EvaluateTaskConfig": {
"type": "object",
"properties": {
- "shield": {
- "$ref": "#/components/schemas/ShieldDefWithProvider"
+ "dataset_config": {
+ "$ref": "#/components/schemas/EvaluateDatasetConfig"
+ },
+ "processor_config": {
+ "$ref": "#/components/schemas/EvaluateProcessorConfig"
+ },
+ "generation_config": {
+ "$ref": "#/components/schemas/EvaluateModelGenerationConfig"
+ },
+ "scoring_config": {
+ "$ref": "#/components/schemas/EvaluateScoringConfig"
}
},
"additionalProperties": false,
"required": [
- "shield"
+ "dataset_config",
+ "processor_config",
+ "generation_config",
+ "scoring_config"
]
},
- "DialogGenerations": {
+ "LLMJudgeConfig": {
"type": "object",
"properties": {
- "dialog": {
- "type": "array",
- "items": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/UserMessage"
- },
- {
- "$ref": "#/components/schemas/SystemMessage"
- },
- {
- "$ref": "#/components/schemas/ToolResponseMessage"
- },
- {
- "$ref": "#/components/schemas/CompletionMessage"
- }
- ]
- }
+ "judge_processor_config": {
+ "$ref": "#/components/schemas/EvaluateProcessorConfig"
},
- "sampled_generations": {
- "type": "array",
- "items": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/UserMessage"
- },
- {
- "$ref": "#/components/schemas/SystemMessage"
- },
- {
- "$ref": "#/components/schemas/ToolResponseMessage"
- },
- {
- "$ref": "#/components/schemas/CompletionMessage"
- }
- ]
- }
+ "judge_model_generation_config": {
+ "$ref": "#/components/schemas/EvaluateModelGenerationConfig"
+ },
+ "judge_scoring_config": {
+ "$ref": "#/components/schemas/EvaluateJudgeScoringConfig"
}
},
"additionalProperties": false,
"required": [
- "dialog",
- "sampled_generations"
+ "judge_processor_config",
+ "judge_model_generation_config",
+ "judge_scoring_config"
]
},
- "RewardScoreRequest": {
+ "RunEvalTaskRequest": {
"type": "object",
"properties": {
- "dialog_generations": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/DialogGenerations"
- }
- },
- "model": {
- "type": "string"
+ "eval_task_config": {
+ "$ref": "#/components/schemas/EvaluateTaskConfig"
}
},
"additionalProperties": false,
"required": [
- "dialog_generations",
- "model"
+ "eval_task_config"
]
},
- "RewardScoringResponse": {
+ "EvalResult": {
"type": "object",
"properties": {
- "scored_generations": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ScoredDialogGenerations"
+ "metrics": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "number"
}
}
},
"additionalProperties": false,
"required": [
- "scored_generations"
+ "metrics"
],
- "title": "Response from the reward scoring. Batch of (prompt, response, score) tuples that pass the threshold."
+ "title": "Aggregated final evaluation result."
},
- "ScoredDialogGenerations": {
+ "EvaluateResponse": {
"type": "object",
"properties": {
- "dialog": {
- "type": "array",
- "items": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/UserMessage"
- },
- {
- "$ref": "#/components/schemas/SystemMessage"
- },
- {
- "$ref": "#/components/schemas/ToolResponseMessage"
- },
- {
- "$ref": "#/components/schemas/CompletionMessage"
- }
- ]
- }
+ "eval_result": {
+ "$ref": "#/components/schemas/EvalResult"
},
- "scored_generations": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ScoredMessage"
- }
+ "formatted_report": {
+ "type": "string"
}
},
"additionalProperties": false,
"required": [
- "dialog",
- "scored_generations"
- ]
+ "eval_result"
+ ],
+ "title": "Scores for evaluation."
},
- "ScoredMessage": {
+ "RunScorerRequest": {
"type": "object",
"properties": {
- "message": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/UserMessage"
- },
- {
- "$ref": "#/components/schemas/SystemMessage"
- },
- {
- "$ref": "#/components/schemas/ToolResponseMessage"
- },
- {
- "$ref": "#/components/schemas/CompletionMessage"
- }
- ]
+ "dataset_config": {
+ "$ref": "#/components/schemas/EvaluateDatasetConfig"
},
- "score": {
- "type": "number"
+ "eval_scoring_config": {
+ "$ref": "#/components/schemas/EvaluateScoringConfig"
}
},
"additionalProperties": false,
"required": [
- "message",
- "score"
+ "dataset_config",
+ "eval_scoring_config"
]
},
"RunShieldRequest": {
@@ -6075,49 +6228,49 @@
],
"tags": [
{
- "name": "Evaluations"
+ "name": "Agents"
},
{
- "name": "Inspect"
+ "name": "Telemetry"
},
{
- "name": "RewardScoring"
+ "name": "Safety"
},
{
- "name": "Datasets"
+ "name": "MemoryBanks"
},
{
- "name": "Models"
+ "name": "Datasets"
},
{
- "name": "Telemetry"
+ "name": "Shields"
},
{
- "name": "PostTraining"
+ "name": "RewardScoring"
},
{
- "name": "SyntheticDataGeneration"
+ "name": "PostTraining"
},
{
- "name": "BatchInference"
+ "name": "Models"
},
{
- "name": "Inference"
+ "name": "Inspect"
},
{
- "name": "Agents"
+ "name": "Evals"
},
{
- "name": "Memory"
+ "name": "BatchInference"
},
{
- "name": "Safety"
+ "name": "Inference"
},
{
- "name": "Shields"
+ "name": "Memory"
},
{
- "name": "MemoryBanks"
+ "name": "SyntheticDataGeneration"
},
{
"name": "BuiltinTool",
@@ -6195,10 +6348,6 @@
"name": "BatchCompletionResponse",
"description": ""
},
- {
- "name": "CancelEvaluationJobRequest",
- "description": ""
- },
{
"name": "CancelTrainingJobRequest",
"description": ""
@@ -6368,17 +6517,21 @@
"description": ""
},
{
- "name": "TrainEvalDataset",
- "description": "Dataset to be used for training or evaluating language models.\n\n"
+ "name": "CustomDatasetDef",
+ "description": ""
},
{
- "name": "TrainEvalDatasetColumnType",
- "description": ""
+ "name": "HuggingfaceDatasetDef",
+ "description": ""
},
{
"name": "CreateDatasetRequest",
"description": ""
},
+ {
+ "name": "CreateDatasetResponse",
+ "description": ""
+ },
{
"name": "DeleteAgentsRequest",
"description": ""
@@ -6391,6 +6544,10 @@
"name": "DeleteDatasetRequest",
"description": ""
},
+ {
+ "name": "DeleteDatasetResponse",
+ "description": ""
+ },
{
"name": "EmbeddingsRequest",
"description": ""
@@ -6399,22 +6556,6 @@
"name": "EmbeddingsResponse",
"description": ""
},
- {
- "name": "EvaluateQuestionAnsweringRequest",
- "description": ""
- },
- {
- "name": "EvaluationJob",
- "description": ""
- },
- {
- "name": "EvaluateSummarizationRequest",
- "description": ""
- },
- {
- "name": "EvaluateTextGenerationRequest",
- "description": ""
- },
{
"name": "GetAgentsSessionRequest",
"description": ""
@@ -6443,18 +6584,6 @@
"name": "AgentStepResponse",
"description": ""
},
- {
- "name": "EvaluationJobArtifactsResponse",
- "description": "Artifacts of a evaluation job.\n\n"
- },
- {
- "name": "EvaluationJobLogStream",
- "description": ""
- },
- {
- "name": "EvaluationJobStatusResponse",
- "description": ""
- },
{
"name": "ModelDefWithProvider",
"description": ""
@@ -6555,6 +6684,14 @@
"name": "RLHFAlgorithm",
"description": ""
},
+ {
+ "name": "TrainEvalDataset",
+ "description": "Dataset to be used for training or evaluating language models.\n\n"
+ },
+ {
+ "name": "TrainEvalDatasetColumnType",
+ "description": ""
+ },
{
"name": "TrainingConfig",
"description": ""
@@ -6603,6 +6740,62 @@
"name": "ScoredMessage",
"description": ""
},
+ {
+ "name": "EvaluateDatasetConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluateJudgeScoringConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluateModelGenerationConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluatePostprocessConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluatePreprocessConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluateProcessorConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluateScoringConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluateSingleScorerConfig",
+ "description": ""
+ },
+ {
+ "name": "EvaluateTaskConfig",
+ "description": ""
+ },
+ {
+ "name": "LLMJudgeConfig",
+ "description": ""
+ },
+ {
+ "name": "RunEvalTaskRequest",
+ "description": ""
+ },
+ {
+ "name": "EvalResult",
+ "description": "Aggregated final evaluation result.\n\n"
+ },
+ {
+ "name": "EvaluateResponse",
+ "description": "Scores for evaluation.\n\n"
+ },
+ {
+ "name": "RunScorerRequest",
+ "description": ""
+ },
{
"name": "RunShieldRequest",
"description": ""
@@ -6647,7 +6840,7 @@
"Agents",
"BatchInference",
"Datasets",
- "Evaluations",
+ "Evals",
"Inference",
"Inspect",
"Memory",
@@ -6681,7 +6874,6 @@
"BatchCompletionRequest",
"BatchCompletionResponse",
"BuiltinTool",
- "CancelEvaluationJobRequest",
"CancelTrainingJobRequest",
"ChatCompletionRequest",
"ChatCompletionResponse",
@@ -6698,31 +6890,40 @@
"CreateAgentSessionRequest",
"CreateAgentTurnRequest",
"CreateDatasetRequest",
+ "CreateDatasetResponse",
+ "CustomDatasetDef",
"DPOAlignmentConfig",
"DeleteAgentsRequest",
"DeleteAgentsSessionRequest",
"DeleteDatasetRequest",
+ "DeleteDatasetResponse",
"DialogGenerations",
"DoraFinetuningConfig",
"EmbeddingsRequest",
"EmbeddingsResponse",
- "EvaluateQuestionAnsweringRequest",
- "EvaluateSummarizationRequest",
- "EvaluateTextGenerationRequest",
- "EvaluationJob",
- "EvaluationJobArtifactsResponse",
- "EvaluationJobLogStream",
- "EvaluationJobStatusResponse",
+ "EvalResult",
+ "EvaluateDatasetConfig",
+ "EvaluateJudgeScoringConfig",
+ "EvaluateModelGenerationConfig",
+ "EvaluatePostprocessConfig",
+ "EvaluatePreprocessConfig",
+ "EvaluateProcessorConfig",
+ "EvaluateResponse",
+ "EvaluateScoringConfig",
+ "EvaluateSingleScorerConfig",
+ "EvaluateTaskConfig",
"FinetuningAlgorithm",
"FunctionCallToolDefinition",
"GetAgentsSessionRequest",
"GraphMemoryBankDef",
"HealthInfo",
+ "HuggingfaceDatasetDef",
"ImageMedia",
"InferenceStep",
"InsertDocumentsRequest",
"KeyValueMemoryBankDef",
"KeywordMemoryBankDef",
+ "LLMJudgeConfig",
"LogEventRequest",
"LogSeverity",
"LoraFinetuningConfig",
@@ -6752,6 +6953,8 @@
"RewardScoreRequest",
"RewardScoringResponse",
"RouteInfo",
+ "RunEvalTaskRequest",
+ "RunScorerRequest",
"RunShieldRequest",
"RunShieldResponse",
"SafetyViolation",
diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml
index c9822d6ca9..c116742243 100644
--- a/docs/resources/llama-stack-spec.yaml
+++ b/docs/resources/llama-stack-spec.yaml
@@ -315,14 +315,6 @@ components:
- photogen
- code_interpreter
type: string
- CancelEvaluationJobRequest:
- additionalProperties: false
- properties:
- job_uuid:
- type: string
- required:
- - job_uuid
- type: object
CancelTrainingJobRequest:
additionalProperties: false
properties:
@@ -572,13 +564,45 @@ components:
CreateDatasetRequest:
additionalProperties: false
properties:
- dataset:
- $ref: '#/components/schemas/TrainEvalDataset'
- uuid:
+ dataset_def:
+ oneOf:
+ - $ref: '#/components/schemas/HuggingfaceDatasetDef'
+ - $ref: '#/components/schemas/CustomDatasetDef'
+ required:
+ - dataset_def
+ type: object
+ CreateDatasetResponse:
+ additionalProperties: false
+ properties:
+ msg:
+ type: string
+ status:
+ enum:
+ - success
+ - fail
type: string
required:
- - uuid
- - dataset
+ - status
+ type: object
+ CustomDatasetDef:
+ additionalProperties: false
+ properties:
+ identifier:
+ type: string
+ rename_columns_map:
+ additionalProperties:
+ type: string
+ type: object
+ type:
+ const: custom
+ default: custom
+ type: string
+ url:
+ type: string
+ required:
+ - type
+ - identifier
+ - url
type: object
DPOAlignmentConfig:
additionalProperties: false
@@ -619,10 +643,23 @@ components:
DeleteDatasetRequest:
additionalProperties: false
properties:
- dataset_uuid:
+ dataset_identifier:
type: string
required:
- - dataset_uuid
+ - dataset_identifier
+ type: object
+ DeleteDatasetResponse:
+ additionalProperties: false
+ properties:
+ msg:
+ type: string
+ status:
+ enum:
+ - success
+ - fail
+ type: string
+ required:
+ - status
type: object
DialogGenerations:
additionalProperties: false
@@ -701,78 +738,147 @@ components:
required:
- embeddings
type: object
- EvaluateQuestionAnsweringRequest:
+ EvalResult:
additionalProperties: false
properties:
metrics:
- items:
- enum:
- - em
- - f1
- type: string
- type: array
+ additionalProperties:
+ type: number
+ type: object
required:
- metrics
+ title: Aggregated final evaluation result.
type: object
- EvaluateSummarizationRequest:
+ EvaluateDatasetConfig:
additionalProperties: false
properties:
- metrics:
- items:
- enum:
- - rouge
- - bleu
- type: string
- type: array
+ dataset_identifier:
+ type: string
+ kwargs:
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ type: object
+ row_limit:
+ type: integer
required:
- - metrics
+ - dataset_identifier
type: object
- EvaluateTextGenerationRequest:
+ EvaluateJudgeScoringConfig:
+ type: object
+ EvaluateModelGenerationConfig:
additionalProperties: false
properties:
- metrics:
- items:
- enum:
- - perplexity
- - rouge
- - bleu
- type: string
- type: array
+ kwargs:
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ type: object
+ model:
+ type: string
+ sampling_params:
+ $ref: '#/components/schemas/SamplingParams'
required:
- - metrics
+ - model
+ - sampling_params
type: object
- EvaluationJob:
+ EvaluatePostprocessConfig:
additionalProperties: false
properties:
- job_uuid:
+ kwargs:
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ type: object
+ type: object
+ EvaluatePreprocessConfig:
+ additionalProperties: false
+ properties:
+ kwargs:
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ type: object
+ type: object
+ EvaluateProcessorConfig:
+ additionalProperties: false
+ properties:
+ postprocess_config:
+ $ref: '#/components/schemas/EvaluatePostprocessConfig'
+ preprocess_config:
+ $ref: '#/components/schemas/EvaluatePreprocessConfig'
+ processor_identifier:
type: string
required:
- - job_uuid
+ - processor_identifier
type: object
- EvaluationJobArtifactsResponse:
+ EvaluateResponse:
additionalProperties: false
properties:
- job_uuid:
+ eval_result:
+ $ref: '#/components/schemas/EvalResult'
+ formatted_report:
type: string
required:
- - job_uuid
- title: Artifacts of a evaluation job.
+ - eval_result
+ title: Scores for evaluation.
type: object
- EvaluationJobLogStream:
+ EvaluateScoringConfig:
additionalProperties: false
properties:
- job_uuid:
- type: string
+ scorer_config_list:
+ items:
+ $ref: '#/components/schemas/EvaluateSingleScorerConfig'
+ type: array
required:
- - job_uuid
+ - scorer_config_list
type: object
- EvaluationJobStatusResponse:
+ EvaluateSingleScorerConfig:
additionalProperties: false
properties:
- job_uuid:
+ llm_judge_config:
+ $ref: '#/components/schemas/LLMJudgeConfig'
+ scorer_name:
type: string
required:
- - job_uuid
+ - scorer_name
+ type: object
+ EvaluateTaskConfig:
+ additionalProperties: false
+ properties:
+ dataset_config:
+ $ref: '#/components/schemas/EvaluateDatasetConfig'
+ generation_config:
+ $ref: '#/components/schemas/EvaluateModelGenerationConfig'
+ processor_config:
+ $ref: '#/components/schemas/EvaluateProcessorConfig'
+ scoring_config:
+ $ref: '#/components/schemas/EvaluateScoringConfig'
+ required:
+ - dataset_config
+ - processor_config
+ - generation_config
+ - scoring_config
type: object
FinetuningAlgorithm:
enum:
@@ -845,6 +951,39 @@ components:
required:
- status
type: object
+ HuggingfaceDatasetDef:
+ additionalProperties: false
+ properties:
+ dataset_name:
+ type: string
+ dataset_path:
+ type: string
+ identifier:
+ type: string
+ kwargs:
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ type: object
+ rename_columns_map:
+ additionalProperties:
+ type: string
+ type: object
+ type:
+ const: huggingface
+ default: huggingface
+ type: string
+ required:
+ - type
+ - identifier
+ - dataset_path
+ - kwargs
+ type: object
ImageMedia:
additionalProperties: false
properties:
@@ -936,6 +1075,20 @@ components:
- provider_id
- type
type: object
+ LLMJudgeConfig:
+ additionalProperties: false
+ properties:
+ judge_model_generation_config:
+ $ref: '#/components/schemas/EvaluateModelGenerationConfig'
+ judge_processor_config:
+ $ref: '#/components/schemas/EvaluateProcessorConfig'
+ judge_scoring_config:
+ $ref: '#/components/schemas/EvaluateJudgeScoringConfig'
+ required:
+ - judge_processor_config
+ - judge_model_generation_config
+ - judge_scoring_config
+ type: object
LogEventRequest:
additionalProperties: false
properties:
@@ -1629,6 +1782,25 @@ components:
- method
- provider_types
type: object
+ RunEvalTaskRequest:
+ additionalProperties: false
+ properties:
+ eval_task_config:
+ $ref: '#/components/schemas/EvaluateTaskConfig'
+ required:
+ - eval_task_config
+ type: object
+ RunScorerRequest:
+ additionalProperties: false
+ properties:
+ dataset_config:
+ $ref: '#/components/schemas/EvaluateDatasetConfig'
+ eval_scoring_config:
+ $ref: '#/components/schemas/EvaluateScoringConfig'
+ required:
+ - dataset_config
+ - eval_scoring_config
+ type: object
RunShieldRequest:
additionalProperties: false
properties:
@@ -2507,7 +2679,7 @@ info:
description: "This is the specification of the llama stack that provides\n \
\ a set of endpoints and their corresponding interfaces that are tailored\
\ to\n best leverage Llama Models. The specification is still in\
- \ draft and subject to change.\n Generated at 2024-10-10 15:29:56.831109"
+ \ draft and subject to change.\n Generated at 2024-10-15 10:20:19.984531"
title: '[DRAFT] Llama Stack Specification'
version: 0.0.1
jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
@@ -2794,81 +2966,16 @@ paths:
schema:
$ref: '#/components/schemas/CreateDatasetRequest'
required: true
- responses:
- '200':
- description: OK
- tags:
- - Datasets
- /datasets/delete:
- post:
- parameters:
- - description: JSON-encoded provider data which will be made available to the
- adapter servicing the API
- in: header
- name: X-LlamaStack-ProviderData
- required: false
- schema:
- type: string
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/DeleteDatasetRequest'
- required: true
- responses:
- '200':
- description: OK
- tags:
- - Datasets
- /datasets/get:
- get:
- parameters:
- - in: query
- name: dataset_uuid
- required: true
- schema:
- type: string
- - description: JSON-encoded provider data which will be made available to the
- adapter servicing the API
- in: header
- name: X-LlamaStack-ProviderData
- required: false
- schema:
- type: string
responses:
'200':
content:
application/json:
schema:
- $ref: '#/components/schemas/TrainEvalDataset'
+ $ref: '#/components/schemas/CreateDatasetResponse'
description: OK
tags:
- Datasets
- /evaluate/job/artifacts:
- get:
- parameters:
- - in: query
- name: job_uuid
- required: true
- schema:
- type: string
- - description: JSON-encoded provider data which will be made available to the
- adapter servicing the API
- in: header
- name: X-LlamaStack-ProviderData
- required: false
- schema:
- type: string
- responses:
- '200':
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EvaluationJobArtifactsResponse'
- description: OK
- tags:
- - Evaluations
- /evaluate/job/cancel:
+ /datasets/delete:
post:
parameters:
- description: JSON-encoded provider data which will be made available to the
@@ -2882,42 +2989,22 @@ paths:
content:
application/json:
schema:
- $ref: '#/components/schemas/CancelEvaluationJobRequest'
- required: true
- responses:
- '200':
- description: OK
- tags:
- - Evaluations
- /evaluate/job/logs:
- get:
- parameters:
- - in: query
- name: job_uuid
+ $ref: '#/components/schemas/DeleteDatasetRequest'
required: true
- schema:
- type: string
- - description: JSON-encoded provider data which will be made available to the
- adapter servicing the API
- in: header
- name: X-LlamaStack-ProviderData
- required: false
- schema:
- type: string
responses:
'200':
content:
application/json:
schema:
- $ref: '#/components/schemas/EvaluationJobLogStream'
+ $ref: '#/components/schemas/DeleteDatasetResponse'
description: OK
tags:
- - Evaluations
- /evaluate/job/status:
+ - Datasets
+ /datasets/get:
get:
parameters:
- in: query
- name: job_uuid
+ name: dataset_identifier
required: true
schema:
type: string
@@ -2933,11 +3020,15 @@ paths:
content:
application/json:
schema:
- $ref: '#/components/schemas/EvaluationJobStatusResponse'
+ oneOf:
+ - oneOf:
+ - $ref: '#/components/schemas/HuggingfaceDatasetDef'
+ - $ref: '#/components/schemas/CustomDatasetDef'
+ - type: 'null'
description: OK
tags:
- - Evaluations
- /evaluate/jobs:
+ - Datasets
+ /datasets/list:
get:
parameters:
- description: JSON-encoded provider data which will be made available to the
@@ -2952,36 +3043,13 @@ paths:
content:
application/jsonl:
schema:
- $ref: '#/components/schemas/EvaluationJob'
- description: OK
- tags:
- - Evaluations
- /evaluate/question_answering/:
- post:
- parameters:
- - description: JSON-encoded provider data which will be made available to the
- adapter servicing the API
- in: header
- name: X-LlamaStack-ProviderData
- required: false
- schema:
- type: string
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EvaluateQuestionAnsweringRequest'
- required: true
- responses:
- '200':
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EvaluationJob'
+ oneOf:
+ - $ref: '#/components/schemas/HuggingfaceDatasetDef'
+ - $ref: '#/components/schemas/CustomDatasetDef'
description: OK
tags:
- - Evaluations
- /evaluate/summarization/:
+ - Datasets
+ /evals/run_eval_task:
post:
parameters:
- description: JSON-encoded provider data which will be made available to the
@@ -2995,18 +3063,18 @@ paths:
content:
application/json:
schema:
- $ref: '#/components/schemas/EvaluateSummarizationRequest'
+ $ref: '#/components/schemas/RunEvalTaskRequest'
required: true
responses:
'200':
content:
application/json:
schema:
- $ref: '#/components/schemas/EvaluationJob'
+ $ref: '#/components/schemas/EvaluateResponse'
description: OK
tags:
- - Evaluations
- /evaluate/text_generation/:
+ - Evals
+ /evals/run_scorer:
post:
parameters:
- description: JSON-encoded provider data which will be made available to the
@@ -3020,17 +3088,17 @@ paths:
content:
application/json:
schema:
- $ref: '#/components/schemas/EvaluateTextGenerationRequest'
+ $ref: '#/components/schemas/RunScorerRequest'
required: true
responses:
'200':
content:
application/json:
schema:
- $ref: '#/components/schemas/EvaluationJob'
+ $ref: '#/components/schemas/EvaluateResponse'
description: OK
tags:
- - Evaluations
+ - Evals
/health:
get:
parameters:
@@ -3712,21 +3780,21 @@ security:
servers:
- url: http://any-hosted-llama-stack.com
tags:
-- name: Evaluations
-- name: Inspect
-- name: RewardScoring
-- name: Datasets
-- name: Models
+- name: Agents
- name: Telemetry
+- name: Safety
+- name: MemoryBanks
+- name: Datasets
+- name: Shields
+- name: RewardScoring
- name: PostTraining
-- name: SyntheticDataGeneration
+- name: Models
+- name: Inspect
+- name: Evals
- name: BatchInference
- name: Inference
-- name: Agents
- name: Memory
-- name: Safety
-- name: Shields
-- name: MemoryBanks
+- name: SyntheticDataGeneration
- description:
name: BuiltinTool
- description:
name: BatchCompletionResponse
-- description:
- name: CancelEvaluationJobRequest
- description:
name: CancelTrainingJobRequest
@@ -3919,17 +3984,18 @@ tags:
name: Turn
- description:
name: ViolationLevel
-- description: 'Dataset to be used for training or evaluating language models.
-
-
- '
- name: TrainEvalDataset
-- description:
- name: TrainEvalDatasetColumnType
+ name: CustomDatasetDef
+- description:
+ name: HuggingfaceDatasetDef
- description:
name: CreateDatasetRequest
+- description:
+ name: CreateDatasetResponse
- description:
name: DeleteAgentsRequest
@@ -3939,23 +4005,15 @@ tags:
- description:
name: DeleteDatasetRequest
+- description:
+ name: DeleteDatasetResponse
- description:
name: EmbeddingsRequest
- description:
name: EmbeddingsResponse
-- description:
- name: EvaluateQuestionAnsweringRequest
-- description:
- name: EvaluationJob
-- description:
- name: EvaluateSummarizationRequest
-- description:
- name: EvaluateTextGenerationRequest
- description:
name: GetAgentsSessionRequest
@@ -3979,18 +4037,6 @@ tags:
- description:
name: AgentStepResponse
-- description: 'Artifacts of a evaluation job.
-
-
- '
- name: EvaluationJobArtifactsResponse
-- description:
- name: EvaluationJobLogStream
-- description:
- name: EvaluationJobStatusResponse
- description:
name: ModelDefWithProvider
@@ -4067,6 +4113,14 @@ tags:
name: OptimizerConfig
- description:
name: RLHFAlgorithm
+- description: 'Dataset to be used for training or evaluating language models.
+
+
+ '
+ name: TrainEvalDataset
+- description:
+ name: TrainEvalDatasetColumnType
- description:
name: TrainingConfig
- description:
name: ScoredMessage
+- description:
+ name: EvaluateDatasetConfig
+- description:
+ name: EvaluateJudgeScoringConfig
+- description:
+ name: EvaluateModelGenerationConfig
+- description:
+ name: EvaluatePostprocessConfig
+- description:
+ name: EvaluatePreprocessConfig
+- description:
+ name: EvaluateProcessorConfig
+- description:
+ name: EvaluateScoringConfig
+- description:
+ name: EvaluateSingleScorerConfig
+- description:
+ name: EvaluateTaskConfig
+- description:
+ name: LLMJudgeConfig
+- description:
+ name: RunEvalTaskRequest
+- description: 'Aggregated final evaluation result.
+
+
+ '
+ name: EvalResult
+- description: 'Scores for evaluation.
+
+
+ '
+ name: EvaluateResponse
+- description:
+ name: RunScorerRequest
- description:
name: RunShieldRequest
@@ -4141,7 +4240,7 @@ x-tagGroups:
- Agents
- BatchInference
- Datasets
- - Evaluations
+ - Evals
- Inference
- Inspect
- Memory
@@ -4172,7 +4271,6 @@ x-tagGroups:
- BatchCompletionRequest
- BatchCompletionResponse
- BuiltinTool
- - CancelEvaluationJobRequest
- CancelTrainingJobRequest
- ChatCompletionRequest
- ChatCompletionResponse
@@ -4189,31 +4287,40 @@ x-tagGroups:
- CreateAgentSessionRequest
- CreateAgentTurnRequest
- CreateDatasetRequest
+ - CreateDatasetResponse
+ - CustomDatasetDef
- DPOAlignmentConfig
- DeleteAgentsRequest
- DeleteAgentsSessionRequest
- DeleteDatasetRequest
+ - DeleteDatasetResponse
- DialogGenerations
- DoraFinetuningConfig
- EmbeddingsRequest
- EmbeddingsResponse
- - EvaluateQuestionAnsweringRequest
- - EvaluateSummarizationRequest
- - EvaluateTextGenerationRequest
- - EvaluationJob
- - EvaluationJobArtifactsResponse
- - EvaluationJobLogStream
- - EvaluationJobStatusResponse
+ - EvalResult
+ - EvaluateDatasetConfig
+ - EvaluateJudgeScoringConfig
+ - EvaluateModelGenerationConfig
+ - EvaluatePostprocessConfig
+ - EvaluatePreprocessConfig
+ - EvaluateProcessorConfig
+ - EvaluateResponse
+ - EvaluateScoringConfig
+ - EvaluateSingleScorerConfig
+ - EvaluateTaskConfig
- FinetuningAlgorithm
- FunctionCallToolDefinition
- GetAgentsSessionRequest
- GraphMemoryBankDef
- HealthInfo
+ - HuggingfaceDatasetDef
- ImageMedia
- InferenceStep
- InsertDocumentsRequest
- KeyValueMemoryBankDef
- KeywordMemoryBankDef
+ - LLMJudgeConfig
- LogEventRequest
- LogSeverity
- LoraFinetuningConfig
@@ -4243,6 +4350,8 @@ x-tagGroups:
- RewardScoreRequest
- RewardScoringResponse
- RouteInfo
+ - RunEvalTaskRequest
+ - RunScorerRequest
- RunShieldRequest
- RunShieldResponse
- SafetyViolation
diff --git a/llama_stack/apis/dataset/dataset.py b/llama_stack/apis/dataset/dataset.py
deleted file mode 100644
index 2fa8bb4e5e..0000000000
--- a/llama_stack/apis/dataset/dataset.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Any, Dict, Optional, Protocol
-
-from llama_models.llama3.api.datatypes import URL
-
-from llama_models.schema_utils import json_schema_type, webmethod
-
-from pydantic import BaseModel
-
-
-@json_schema_type
-class TrainEvalDatasetColumnType(Enum):
- dialog = "dialog"
- text = "text"
- media = "media"
- number = "number"
- json = "json"
-
-
-@json_schema_type
-class TrainEvalDataset(BaseModel):
- """Dataset to be used for training or evaluating language models."""
-
- # TODO(ashwin): figure out if we need to add an enum for a "dataset type"
-
- columns: Dict[str, TrainEvalDatasetColumnType]
- content_url: URL
- metadata: Optional[Dict[str, Any]] = None
-
-
-@json_schema_type
-class CreateDatasetRequest(BaseModel):
- """Request to create a dataset."""
-
- uuid: str
- dataset: TrainEvalDataset
-
-
-class Datasets(Protocol):
- @webmethod(route="/datasets/create")
- def create_dataset(
- self,
- uuid: str,
- dataset: TrainEvalDataset,
- ) -> None: ...
-
- @webmethod(route="/datasets/get")
- def get_dataset(
- self,
- dataset_uuid: str,
- ) -> TrainEvalDataset: ...
-
- @webmethod(route="/datasets/delete")
- def delete_dataset(
- self,
- dataset_uuid: str,
- ) -> None: ...
diff --git a/llama_stack/apis/dataset/__init__.py b/llama_stack/apis/datasets/__init__.py
similarity index 82%
rename from llama_stack/apis/dataset/__init__.py
rename to llama_stack/apis/datasets/__init__.py
index 33557a0ab1..102b9927f3 100644
--- a/llama_stack/apis/dataset/__init__.py
+++ b/llama_stack/apis/datasets/__init__.py
@@ -4,4 +4,4 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from .dataset import * # noqa: F401 F403
+from .datasets import * # noqa: F401 F403
diff --git a/llama_stack/apis/datasets/client.py b/llama_stack/apis/datasets/client.py
new file mode 100644
index 0000000000..e292b14d8c
--- /dev/null
+++ b/llama_stack/apis/datasets/client.py
@@ -0,0 +1,156 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import asyncio
+import json
+from typing import Optional
+
+import fire
+import httpx
+from termcolor import cprint
+
+from .datasets import * # noqa: F403
+
+
+def deserialize_dataset_def(j: Optional[Dict[str, Any]]) -> Optional[DatasetDef]:
+ if not j:
+ return None
+ if j["type"] == "huggingface":
+ return HuggingfaceDatasetDef(**j)
+ elif j["type"] == "custom":
+ return CustomDatasetDef(**j)
+ else:
+ raise ValueError(f"Unknown dataset type: {j['type']}")
+
+
+class DatasetsClient(Datasets):
+ def __init__(self, base_url: str):
+ self.base_url = base_url
+
+ async def initialize(self) -> None:
+ pass
+
+ async def shutdown(self) -> None:
+ pass
+
+ async def create_dataset(
+ self,
+ dataset_def: DatasetDef,
+ ) -> CreateDatasetResponse:
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ f"{self.base_url}/datasets/create",
+ json={
+ "dataset_def": json.loads(dataset_def.json()),
+ },
+ headers={"Content-Type": "application/json"},
+ timeout=60,
+ )
+ response.raise_for_status()
+ return CreateDatasetResponse(**response.json())
+
+ async def get_dataset(
+ self,
+ dataset_identifier: str,
+ ) -> Optional[DatasetDef]:
+ async with httpx.AsyncClient() as client:
+ response = await client.get(
+ f"{self.base_url}/datasets/get",
+ params={
+ "dataset_identifier": dataset_identifier,
+ },
+ headers={"Content-Type": "application/json"},
+ timeout=60,
+ )
+ response.raise_for_status()
+ if not response.json():
+ return
+
+ return deserialize_dataset_def(response.json())
+
+ async def delete_dataset(
+ self,
+ dataset_identifier: str,
+ ) -> DeleteDatasetResponse:
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ f"{self.base_url}/datasets/delete",
+ json={
+ "dataset_identifier": dataset_identifier,
+ },
+ headers={"Content-Type": "application/json"},
+ timeout=60,
+ )
+ response.raise_for_status()
+ return DeleteDatasetResponse(**response.json())
+
+ async def list_dataset(
+ self,
+ ) -> List[DatasetDef]:
+ async with httpx.AsyncClient() as client:
+ response = await client.get(
+ f"{self.base_url}/datasets/list",
+ headers={"Content-Type": "application/json"},
+ timeout=60,
+ )
+ response.raise_for_status()
+ if not response.json():
+ return
+
+ return [deserialize_dataset_def(x) for x in response.json()]
+
+
+async def run_main(host: str, port: int):
+ client = DatasetsClient(f"http://{host}:{port}")
+
+ # register dataset
+ response = await client.create_dataset(
+ dataset_def=CustomDatasetDef(
+ identifier="test-dataset",
+ url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
+ ),
+ )
+ cprint(response, "green")
+
+ # register HF dataset
+ response = await client.create_dataset(
+ dataset_def=HuggingfaceDatasetDef(
+ identifier="hellaswag",
+ dataset_name="hellaswag",
+ kwargs={"split": "validation", "trust_remote_code": True},
+ )
+ )
+ cprint(response, "green")
+
+ # get dataset
+ get_dataset = await client.get_dataset(
+ dataset_identifier="test-dataset",
+ )
+ cprint(get_dataset, "cyan")
+
+ # delete dataset
+ delete_dataset = await client.delete_dataset(
+ dataset_identifier="test-dataset",
+ )
+ cprint(delete_dataset, "red")
+
+ # get again after deletion
+ get_dataset = await client.get_dataset(
+ dataset_identifier="test-dataset",
+ )
+ cprint(get_dataset, "yellow")
+
+ # list datasets
+ list_dataset = await client.list_dataset()
+ cprint(list_dataset, "blue")
+
+
+def main(host: str, port: int):
+ asyncio.run(run_main(host, port))
+
+
+if __name__ == "__main__":
+ fire.Fire(main)
diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py
new file mode 100644
index 0000000000..f5991c52e1
--- /dev/null
+++ b/llama_stack/apis/datasets/datasets.py
@@ -0,0 +1,225 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Any, Dict, Generic, Iterator, Literal, Protocol, TypeVar, Union
+
+from llama_models.schema_utils import json_schema_type, webmethod
+from llama_models.llama3.api.datatypes import * # noqa: F403
+
+from pydantic import BaseModel, Field
+from typing_extensions import Annotated
+
+
+@json_schema_type
+class TrainEvalDatasetColumnType(Enum):
+ dialog = "dialog"
+ text = "text"
+ media = "media"
+ number = "number"
+ json = "json"
+
+
+@json_schema_type
+class TrainEvalDataset(BaseModel):
+ """Dataset to be used for training or evaluating language models."""
+
+ # TODO(ashwin): figure out if we need to add an enum for a "dataset type"
+
+ columns: Dict[str, TrainEvalDatasetColumnType]
+ content_url: URL
+ metadata: Optional[Dict[str, Any]] = None
+
+
+@json_schema_type
+class GenerationInput(BaseModel):
+ messages: List[Message]
+
+
+@json_schema_type
+class GenerationOutput(BaseModel):
+ completion_message: str
+ logprobs: Optional[List[TokenLogProbs]] = None
+
+
+@json_schema_type
+class PostprocessedGeneration(BaseModel):
+ completion_message: str
+ logprobs: Optional[List[TokenLogProbs]] = None
+
+
+# A sample (row) from dataset
+TDatasetSample = TypeVar("TDatasetSample")
+
+
+@json_schema_type
+class DatasetSample(BaseModel): ...
+
+
+@json_schema_type
+class DictSample(DatasetSample):
+ data: Dict[str, Any]
+
+
+# A sample (row) from evals intermediate dataset after preprocessing
+TPreprocessedSample = TypeVar("TPreprocessedSample")
+
+
+@json_schema_type
+class PreprocessedSample(DatasetSample):
+ generation_input: GenerationInput
+
+
+# A sample (row) from evals intermediate dataset after inference
+TGenerationResponseSample = TypeVar("TGenerationResponseSample")
+
+
+@json_schema_type
+class GenerationResponseSample(DatasetSample):
+ generation_output: GenerationOutput
+
+
+# A sample (row) for prepared evals dataset ready for scoring
+TScorerInputSample = TypeVar("TScorerInputSample")
+
+
+@json_schema_type
+class ScorerInputSample(DatasetSample):
+ """
+ A dataset is required to have the following columns to be used for scoring:
+ - generated_answer: str
+ - expected_answer: Union[str, List[str]]
+ - (optional) input_query: str
+ - (optional) generation_output: PostprocessedGeneration
+ """
+
+ generated_answer: str
+ expected_answer: Union[str, List[str]]
+ input_query: Optional[str] = None
+ generation_output: Optional[PostprocessedGeneration] = None
+
+
+@json_schema_type
+class DatasetType(Enum):
+ custom = "custom"
+ huggingface = "huggingface"
+
+
+@json_schema_type
+class HuggingfaceDatasetDef(BaseModel):
+ type: Literal[DatasetType.huggingface.value] = DatasetType.huggingface.value
+ identifier: str = Field(
+ description="A unique name for the dataset",
+ )
+ dataset_path: str = Field(
+ description="The name of the dataset into HF (e.g. meta-llama/Llama-3.1-8B-Instruct-evals)",
+ )
+ dataset_name: Optional[str] = Field(
+ description="The name of the dataset config within the HF dataset path (e.g. Llama-3.1-8B-Instruct-evals__ifeval__strict__details)",
+ default=None,
+ )
+ rename_columns_map: Optional[Dict[str, str]] = Field(
+ description="A map of column names to rename to fit the schema of eval dataset for scoring",
+ default=None,
+ )
+ kwargs: Dict[str, Any] = Field(
+ description="Any additional arguments to get Huggingface (e.g. split, trust_remote_code)",
+ default_factory=dict,
+ )
+
+
+@json_schema_type
+class CustomDatasetDef(BaseModel):
+ type: Literal[DatasetType.custom.value] = DatasetType.custom.value
+ identifier: str = Field(
+ description="A unique name for the dataset",
+ )
+ url: str = Field(
+ description="The URL to the dataset",
+ )
+ rename_columns_map: Optional[Dict[str, str]] = Field(
+ description="A map of column names to rename to fit the schema of eval dataset for scoring",
+ default=None,
+ )
+
+
+DatasetDef = Annotated[
+ Union[
+ HuggingfaceDatasetDef,
+ CustomDatasetDef,
+ ],
+ Field(discriminator="type"),
+]
+
+
+class DatasetsResponseStatus(Enum):
+ success = "success"
+ fail = "fail"
+
+
+@json_schema_type
+class CreateDatasetResponse(BaseModel):
+ status: DatasetsResponseStatus = Field(
+ description="Return status of the dataset creation",
+ )
+ msg: Optional[str] = None
+
+
+@json_schema_type
+class DeleteDatasetResponse(BaseModel):
+ status: DatasetsResponseStatus = Field(
+ description="Return status of the dataset creation",
+ )
+ msg: Optional[str] = None
+
+
+class BaseDataset(ABC, Generic[TDatasetSample]):
+ def __init__(self) -> None:
+ self.type: str = self.__class__.__name__
+
+ @property
+ @abstractmethod
+ def dataset_id(self) -> str:
+ raise NotImplementedError()
+
+ @abstractmethod
+ def __iter__(self) -> Iterator[TDatasetSample]:
+ raise NotImplementedError()
+
+ @abstractmethod
+ def __str__(self) -> str:
+ raise NotImplementedError()
+
+ @abstractmethod
+ def __len__(self) -> int:
+ raise NotImplementedError()
+
+ @abstractmethod
+ def load(self) -> None:
+ raise NotImplementedError()
+
+
+class Datasets(Protocol):
+ @webmethod(route="/datasets/create")
+ async def create_dataset(
+ self,
+ dataset_def: DatasetDef,
+ ) -> CreateDatasetResponse: ...
+
+ @webmethod(route="/datasets/get", method="GET")
+ async def get_dataset(
+ self,
+ dataset_identifier: str,
+ ) -> Optional[DatasetDef]: ...
+
+ @webmethod(route="/datasets/delete")
+ async def delete_dataset(
+ self,
+ dataset_identifier: str,
+ ) -> DeleteDatasetResponse: ...
+
+ @webmethod(route="/datasets/list", method="GET")
+ async def list_datasets(self) -> List[DatasetDef]: ...
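Note: BaseDataset above is abstract; concrete wrappers (CustomDataset, HuggingfaceDataset) are added later in this patch. As a reading aid, a minimal in-memory implementation of the same contract might look like the sketch below — InMemoryDataset and its constructor are illustrative only, not part of the patch.

from typing import Iterator, List

from llama_stack.apis.datasets.datasets import BaseDataset, DictSample


class InMemoryDataset(BaseDataset[DictSample]):
    """Illustrative only: wraps a list of dicts to satisfy the BaseDataset contract."""

    def __init__(self, identifier: str, rows: List[dict]) -> None:
        super().__init__()
        self._identifier = identifier
        self._rows = rows

    @property
    def dataset_id(self) -> str:
        return self._identifier

    def load(self) -> None:
        # nothing to fetch; rows are already in memory
        pass

    def __iter__(self) -> Iterator[DictSample]:
        return (DictSample(data=row) for row in self._rows)

    def __len__(self) -> int:
        return len(self._rows)

    def __str__(self) -> str:
        return f"InMemoryDataset({self._identifier}, {len(self._rows)} rows)"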
diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py
new file mode 100644
index 0000000000..fc4820232f
--- /dev/null
+++ b/llama_stack/apis/evals/client.py
@@ -0,0 +1,183 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import asyncio
+import json
+
+import fire
+import httpx
+from termcolor import cprint
+
+from .evals import * # noqa: F403
+import base64
+import mimetypes
+import os
+
+from ..datasets.client import DatasetsClient
+
+
+def data_url_from_file(file_path: str) -> str:
+ if not os.path.exists(file_path):
+ raise FileNotFoundError(f"File not found: {file_path}")
+
+ with open(file_path, "rb") as file:
+ file_content = file.read()
+
+ base64_content = base64.b64encode(file_content).decode("utf-8")
+ mime_type, _ = mimetypes.guess_type(file_path)
+
+ data_url = f"data:{mime_type};base64,{base64_content}"
+
+ return data_url
+
+
+class EvaluationClient(Evals):
+ def __init__(self, base_url: str):
+ self.base_url = base_url
+
+ async def initialize(self) -> None:
+ pass
+
+ async def shutdown(self) -> None:
+ pass
+
+ async def run_evals(
+ self,
+ eval_task_config: EvaluateTaskConfig,
+ ) -> EvaluateResponse:
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ f"{self.base_url}/evals/run_eval_task",
+ json={
+ "eval_task_config": json.loads(eval_task_config.json()),
+ },
+ headers={"Content-Type": "application/json"},
+ timeout=3600,
+ )
+ response.raise_for_status()
+ return EvaluateResponse(**response.json())
+
+ async def run_scorer(
+ self,
+ dataset_config: EvaluateDatasetConfig,
+ eval_scoring_config: EvaluateScoringConfig,
+ ) -> EvaluateResponse:
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ f"{self.base_url}/evals/run_scorer",
+ json={
+ "dataset_config": json.loads(dataset_config.json()),
+ "eval_scoring_config": json.loads(eval_scoring_config.json()),
+ },
+ headers={"Content-Type": "application/json"},
+ timeout=3600,
+ )
+ response.raise_for_status()
+ return EvaluateResponse(**response.json())
+
+
+async def run_main(host: str, port: int, eval_dataset_path: str = ""):
+ client = EvaluationClient(f"http://{host}:{port}")
+ dataset_client = DatasetsClient(f"http://{host}:{port}")
+
+ # Full Eval Task
+ # 1. register custom dataset
+ response = await dataset_client.create_dataset(
+ dataset_def=CustomDatasetDef(
+ identifier="mmlu-simple-eval-en",
+ url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
+ ),
+ )
+ cprint(f"datasets/create: {response}", "cyan")
+
+ # 2. run evals on the registered dataset
+ eval_task_config = EvaluateTaskConfig(
+ dataset_config=EvaluateDatasetConfig(
+ dataset_identifier="mmlu-simple-eval-en",
+ row_limit=3,
+ ),
+ processor_config=EvaluateProcessorConfig(
+ processor_identifier="mmlu",
+ ),
+ generation_config=EvaluateModelGenerationConfig(
+ model="Llama3.1-8B-Instruct",
+ ),
+ scoring_config=EvaluateScoringConfig(
+ scorer_config_list=[
+ EvaluateSingleScorerConfig(scorer_name="accuracy"),
+ EvaluateSingleScorerConfig(scorer_name="random"),
+ ]
+ ),
+ )
+ response = await client.run_evals(
+ eval_task_config=eval_task_config,
+ )
+ for k, v in response.eval_result.metrics.items():
+ cprint(f"{k}: {v}", "green")
+
+ # Scoring Task
+ # 1. register huggingface dataset
+ response = await dataset_client.create_dataset(
+ dataset_def=HuggingfaceDatasetDef(
+ identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+ dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals",
+ dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+ rename_columns_map={
+ "output_parsed_answer": "generated_answer",
+ "input_correct_responses": "expected_answer",
+ },
+ kwargs={"split": "latest"},
+ )
+ )
+ cprint(response, "cyan")
+
+ # register custom dataset from file path
+ response = await dataset_client.create_dataset(
+ dataset_def=CustomDatasetDef(
+ identifier="rag-evals",
+ url=data_url_from_file(eval_dataset_path),
+ )
+ )
+ cprint(response, "cyan")
+
+ # 2. run evals on the registered dataset
+ response = await client.run_scorer(
+ dataset_config=EvaluateDatasetConfig(
+ dataset_identifier="rag-evals",
+ row_limit=10,
+ ),
+ eval_scoring_config=EvaluateScoringConfig(
+ scorer_config_list=[
+ # EvaluateSingleScorerConfig(scorer_name="accuracy"),
+ # EvaluateSingleScorerConfig(
+ # scorer_name="braintrust::answer-correctness"
+ # ),
+ EvaluateSingleScorerConfig(
+ scorer_name="llamastack-llm-judge",
+ llm_judge_config=LLMJudgeConfig(
+ judge_processor_config=EvaluateProcessorConfig(
+ processor_identifier="judge",
+ ),
+ judge_model_generation_config=EvaluateModelGenerationConfig(
+ model="Llama3.1-8B-Instruct",
+ ),
+ judge_scoring_config=EvaluateJudgeScoringConfig(),
+ ),
+ ),
+ ]
+ ),
+ )
+
+ for k, v in response.eval_result.metrics.items():
+ cprint(f"{k}: {v}", "green")
+
+
+def main(host: str, port: int, eval_dataset_path: str = ""):
+ asyncio.run(run_main(host, port, eval_dataset_path))
+
+
+if __name__ == "__main__":
+ fire.Fire(main)
diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py
index 0be2243ab1..c484db734f 100644
--- a/llama_stack/apis/evals/evals.py
+++ b/llama_stack/apis/evals/evals.py
@@ -4,119 +4,256 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from enum import Enum
-from typing import List, Protocol
+from abc import ABC, abstractmethod
+from typing import Dict, Generic, List, Optional, Protocol
from llama_models.schema_utils import webmethod
-
from pydantic import BaseModel
from llama_models.llama3.api.datatypes import * # noqa: F403
-from llama_stack.apis.dataset import * # noqa: F403
-from llama_stack.apis.common.training_types import * # noqa: F403
+from llama_stack.apis.datasets import * # noqa: F403
-class TextGenerationMetric(Enum):
- perplexity = "perplexity"
- rouge = "rouge"
- bleu = "bleu"
+class EvaluationJob(BaseModel):
+ job_uuid: str
-class QuestionAnsweringMetric(Enum):
- em = "em"
- f1 = "f1"
+class EvaluationJobLogStream(BaseModel):
+ job_uuid: str
-class SummarizationMetric(Enum):
- rouge = "rouge"
- bleu = "bleu"
+@json_schema_type
+class EvalResult(BaseModel):
+ """Aggregated final evaluation result."""
+ metrics: Dict[str, float]
-class EvaluationJob(BaseModel):
- job_uuid: str
+@json_schema_type
+class SingleEvalResult(BaseModel):
+ """Single evaluation result. Contains a scorer name, and corresponding metrics from scorer."""
-class EvaluationJobLogStream(BaseModel):
- job_uuid: str
+ score_data: Dict[str, float]
+
+
+@json_schema_type
+class EvaluateResponse(BaseModel):
+ """Scores for evaluation."""
+ eval_result: EvalResult
+ formatted_report: Optional[str] = None
-class EvaluateTaskRequestCommon(BaseModel):
+
+@json_schema_type
+class EvaluationJobStatusResponse(BaseModel):
job_uuid: str
- dataset: TrainEvalDataset
- checkpoint: Checkpoint
- # generation params
- sampling_params: SamplingParams = SamplingParams()
+@json_schema_type
+class EvaluationJobCreateResponse(BaseModel):
+ """Response to create a evaluation job."""
+
+ job_uuid: str
@json_schema_type
-class EvaluateTextGenerationRequest(EvaluateTaskRequestCommon):
- """Request to evaluate text generation."""
+class EvaluateDatasetConfig(BaseModel):
+ # identifier to previously registered dataset via DatasetDef
+ dataset_identifier: str
+ # limit number of rows to evaluate
+ row_limit: Optional[int] = None
+ kwargs: Optional[Dict[str, Any]] = None
- metrics: List[TextGenerationMetric]
+
+@json_schema_type
+class EvaluatePreprocessConfig(BaseModel):
+ kwargs: Optional[Dict[str, Any]] = None
@json_schema_type
-class EvaluateQuestionAnsweringRequest(EvaluateTaskRequestCommon):
- """Request to evaluate question answering."""
+class EvaluateModelGenerationConfig(BaseModel):
+ model: str
+ sampling_params: SamplingParams = SamplingParams()
+ kwargs: Optional[Dict[str, Any]] = None
- metrics: List[QuestionAnsweringMetric]
+
+@json_schema_type
+class EvaluatePostprocessConfig(BaseModel):
+ kwargs: Optional[Dict[str, Any]] = None
@json_schema_type
-class EvaluateSummarizationRequest(EvaluateTaskRequestCommon):
- """Request to evaluate summarization."""
+class EvaluateProcessorConfig(BaseModel):
+ processor_identifier: str
+ preprocess_config: Optional[EvaluatePreprocessConfig] = None
+ postprocess_config: Optional[EvaluatePostprocessConfig] = None
- metrics: List[SummarizationMetric]
+@json_schema_type
+class EvaluateJudgeScoringConfig(BaseModel): ...
-class EvaluationJobStatusResponse(BaseModel):
- job_uuid: str
+
+@json_schema_type
+class LLMJudgeConfig(BaseModel):
+ judge_processor_config: EvaluateProcessorConfig
+ judge_model_generation_config: EvaluateModelGenerationConfig
+ judge_scoring_config: EvaluateJudgeScoringConfig
@json_schema_type
-class EvaluationJobArtifactsResponse(BaseModel):
- """Artifacts of a evaluation job."""
+class EvaluateSingleScorerConfig(BaseModel):
+ scorer_name: str
+ llm_judge_config: Optional[LLMJudgeConfig] = None
- job_uuid: str
+
+@json_schema_type
+class EvaluateScoringConfig(BaseModel):
+ # list of scorer (metrics) names to use
+ scorer_config_list: List[EvaluateSingleScorerConfig]
-class Evaluations(Protocol):
- @webmethod(route="/evaluate/text_generation/")
- def evaluate_text_generation(
+@json_schema_type
+class EvaluateTaskConfig(BaseModel):
+ dataset_config: EvaluateDatasetConfig
+ processor_config: EvaluateProcessorConfig
+ generation_config: EvaluateModelGenerationConfig
+ scoring_config: EvaluateScoringConfig
+
+
+class BaseGeneratorProcessor(
+ ABC,
+ Generic[
+ TDatasetSample,
+ TPreprocessedSample,
+ TGenerationResponseSample,
+ TScorerInputSample,
+ ],
+):
+ """
+ Base class for all generator processors. Each processor needs to implement the following methods:
+ - preprocess_sample(self, sample)
+ - postprocess_sample(self, generation_sample, dataset_sample)
+ """
+
+ def __init__(self, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
+
+ def __str__(self) -> str:
+ return self.__class__.__name__
+
+ def preprocess(
+ self, dataset: BaseDataset[TDatasetSample]
+ ) -> List[TPreprocessedSample]:
+ return [self.preprocess_sample(sample) for sample in dataset]
+
+ def postprocess(
+ self,
+ generation: List[TGenerationResponseSample],
+ dataset: BaseDataset[TDatasetSample],
+ ) -> List[TScorerInputSample]:
+ return [
+ self.postprocess_sample(generation_sample, dataset_sample)
+ for generation_sample, dataset_sample in zip(generation, dataset)
+ ]
+
+ @abstractmethod
+ def preprocess_sample(self, sample: TDatasetSample) -> TPreprocessedSample:
+ raise NotImplementedError()
+
+ @abstractmethod
+ def postprocess_sample(
self,
- metrics: List[TextGenerationMetric],
- ) -> EvaluationJob: ...
+ generation_sample: TGenerationResponseSample,
+ dataset_sample: TDatasetSample,
+ ) -> TScorerInputSample:
+ raise NotImplementedError()
+
+
+class BaseGenerator(ABC, Generic[TPreprocessedSample, TGenerationResponseSample]):
+ """
+ Base class for all generators. Each generator needs to implement the following methods:
+ - generate(self, preprocessed_dataset)
+ """
+
+ def __init__(self, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
+
+ def __str__(self) -> str:
+ return self.__class__.__name__
+
+ @abstractmethod
+ async def generate(
+ self, preprocessed_dataset: List[TPreprocessedSample]
+ ) -> List[TGenerationResponseSample]:
+ raise NotImplementedError()
+
+
+class BaseScorer(ABC, Generic[TScorerInputSample]):
+ """
+ Base class for all scorers. Each scorer needs to implement the following methods:
+ - score_sample(self, scorer_input_sample)
+ - aggregate_results(self, eval_results)
+ """
+
+ def __init__(self, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
+
+ def __str__(self) -> str:
+ return self.__class__.__name__
+
+ @abstractmethod
+ def score_sample(self, scorer_input_sample: TScorerInputSample) -> SingleEvalResult:
+ raise NotImplementedError()
+
+ @abstractmethod
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ raise NotImplementedError()
+
+ def score(
+ self, prepared_eval_dataset: List[TScorerInputSample]
+ ) -> List[SingleEvalResult]:
+ return [self.score_sample(sample) for sample in prepared_eval_dataset]
+
+
+class BaseTask(ABC):
+ def __init__(self, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
+
+ @abstractmethod
+ async def run(self, *args, **kwargs) -> EvalResult:
+ raise NotImplementedError()
+
+
+class Evals(Protocol):
- @webmethod(route="/evaluate/question_answering/")
- def evaluate_question_answering(
+ @webmethod(route="/evals/run_eval_task")
+ async def run_eval_task(
self,
- metrics: List[QuestionAnsweringMetric],
- ) -> EvaluationJob: ...
+ eval_task_config: EvaluateTaskConfig,
+ ) -> EvaluateResponse: ...
- @webmethod(route="/evaluate/summarization/")
- def evaluate_summarization(
+ @webmethod(route="/evals/run_scorer")
+ async def run_scorer(
self,
- metrics: List[SummarizationMetric],
- ) -> EvaluationJob: ...
+ dataset_config: EvaluateDatasetConfig,
+ eval_scoring_config: EvaluateScoringConfig,
+ ) -> EvaluateResponse: ...
- @webmethod(route="/evaluate/jobs")
- def get_evaluation_jobs(self) -> List[EvaluationJob]: ...
+ # @webmethod(route="/evals/jobs")
+ # def get_evaluation_jobs(self) -> List[EvaluationJob]: ...
- @webmethod(route="/evaluate/job/status")
- def get_evaluation_job_status(
- self, job_uuid: str
- ) -> EvaluationJobStatusResponse: ...
+ # @webmethod(route="/evals/job/create")
+ # async def create_evaluation_job(
+ # self, model: str, dataset: str, task: str
+ # ) -> EvaluationJob: ...
- # sends SSE stream of logs
- @webmethod(route="/evaluate/job/logs")
- def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ...
+ # @webmethod(route="/evals/job/status")
+ # def get_evaluation_job_status(
+ # self, job_uuid: str
+ # ) -> EvaluationJobStatusResponse: ...
- @webmethod(route="/evaluate/job/cancel")
- def cancel_evaluation_job(self, job_uuid: str) -> None: ...
+ # # sends SSE stream of logs
+ # @webmethod(route="/evals/job/logs")
+ # def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ...
- @webmethod(route="/evaluate/job/artifacts")
- def get_evaluation_job_artifacts(
- self, job_uuid: str
- ) -> EvaluationJobArtifactsResponse: ...
+ # @webmethod(route="/evals/job/cancel")
+ # def cancel_evaluation_job(self, job_uuid: str) -> None: ...
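Note: the abstract bases above (BaseGeneratorProcessor, BaseGenerator, BaseScorer, BaseTask) describe the stages of an eval run. One plausible way to chain them is sketched below for orientation; run_pipeline is a made-up helper and not the provider's RunEvalTask implementation.

from llama_stack.apis.datasets.datasets import BaseDataset, DictSample
from llama_stack.apis.evals.evals import (
    BaseGenerator,
    BaseGeneratorProcessor,
    BaseScorer,
    EvalResult,
)


async def run_pipeline(
    dataset: BaseDataset[DictSample],
    processor: BaseGeneratorProcessor,
    generator: BaseGenerator,
    scorer: BaseScorer,
) -> EvalResult:
    # rows -> prompts -> model generations -> scorer inputs -> aggregated metrics
    preprocessed = processor.preprocess(dataset)
    generations = await generator.generate(preprocessed)
    scorer_inputs = processor.postprocess(generations, dataset)
    single_results = scorer.score(scorer_inputs)
    return scorer.aggregate_results(single_results)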
diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py
index d943f48b20..cdfe5c4673 100644
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -14,7 +14,7 @@
from pydantic import BaseModel, Field
from llama_models.llama3.api.datatypes import * # noqa: F403
-from llama_stack.apis.dataset import * # noqa: F403
+from llama_stack.apis.datasets import * # noqa: F403
from llama_stack.apis.common.training_types import * # noqa: F403
diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py
index 0044de09ee..ce7f5a8e50 100644
--- a/llama_stack/distribution/datatypes.py
+++ b/llama_stack/distribution/datatypes.py
@@ -73,6 +73,16 @@ class RoutingTableProviderSpec(ProviderSpec):
pip_packages: List[str] = Field(default_factory=list)
+# Example: /datasets
+class RegistryProviderSpec(ProviderSpec):
+ provider_type: str = "registry"
+ config_class: str = ""
+ docker_image: Optional[str] = None
+
+ module: str
+ pip_packages: List[str] = Field(default_factory=list)
+
+
class DistributionSpec(BaseModel):
description: Optional[str] = Field(
default="",
diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py
index 999646cc06..d96db23b46 100644
--- a/llama_stack/distribution/distribution.py
+++ b/llama_stack/distribution/distribution.py
@@ -21,6 +21,19 @@ class AutoRoutedApiInfo(BaseModel):
router_api: Api
+class RegistryApiInfo(BaseModel):
+ registry_api: Api
+ # registry: Registry
+
+
+def builtin_registry_apis() -> List[RegistryApiInfo]:
+ return [
+ RegistryApiInfo(
+ registry_api=Api.datasets,
+ )
+ ]
+
+
def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
return [
AutoRoutedApiInfo(
@@ -42,7 +55,12 @@ def providable_apis() -> List[Api]:
routing_table_apis = set(
x.routing_table_api for x in builtin_automatically_routed_apis()
)
- return [api for api in Api if api not in routing_table_apis and api != Api.inspect]
+ registry_apis = set(
+ x.registry_api for x in builtin_registry_apis() if x.registry_api
+ )
+ non_providable_apis = routing_table_apis | registry_apis | {Api.inspect}
+
+ return [api for api in Api if api not in non_providable_apis]
def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]:
diff --git a/llama_stack/distribution/registry/__init__.py b/llama_stack/distribution/registry/__init__.py
new file mode 100644
index 0000000000..6e68333280
--- /dev/null
+++ b/llama_stack/distribution/registry/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any
+
+from llama_stack.providers.datatypes import Api
+from .datasets.dataset import DatasetRegistryImpl
+
+
+async def get_registry_impl(api: Api, _deps) -> Any:
+ api_to_registry = {
+ "datasets": DatasetRegistryImpl,
+ }
+
+ if api.value not in api_to_registry:
+ raise ValueError(f"API {api.value} not found in registry map")
+
+ impl = api_to_registry[api.value]()
+ await impl.initialize()
+ return impl
diff --git a/llama_stack/distribution/registry/datasets/__init__.py b/llama_stack/distribution/registry/datasets/__init__.py
new file mode 100644
index 0000000000..4474c8d7d8
--- /dev/null
+++ b/llama_stack/distribution/registry/datasets/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.apis.datasets import * # noqa: F403
+from ..registry import Registry
+
+
+DatasetRegistry = Registry[BaseDataset]()
diff --git a/llama_stack/distribution/registry/datasets/dataset.py b/llama_stack/distribution/registry/datasets/dataset.py
new file mode 100644
index 0000000000..838e8c65fa
--- /dev/null
+++ b/llama_stack/distribution/registry/datasets/dataset.py
@@ -0,0 +1,83 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.datasets import * # noqa: F403
+from llama_stack.distribution.registry.datasets import DatasetRegistry
+from llama_stack.distribution.registry.datasets.dataset_wrappers import (
+ CustomDataset,
+ HuggingfaceDataset,
+)
+
+
+class DatasetRegistryImpl(Datasets):
+ """API Impl to interact with underlying dataset registry"""
+
+ def __init__(
+ self,
+ ) -> None:
+ pass
+
+ async def initialize(self) -> None:
+ pass
+
+ async def shutdown(self) -> None:
+ pass
+
+ async def create_dataset(
+ self,
+ dataset_def: DatasetDef,
+ ) -> CreateDatasetResponse:
+ if dataset_def.type == DatasetType.huggingface.value:
+ dataset_cls = HuggingfaceDataset(dataset_def)
+ else:
+ dataset_cls = CustomDataset(dataset_def)
+
+ try:
+ DatasetRegistry.register(
+ dataset_def.identifier,
+ dataset_cls,
+ )
+ except ValueError as e:
+ return CreateDatasetResponse(
+ status=DatasetsResponseStatus.fail,
+ msg=str(e),
+ )
+
+ return CreateDatasetResponse(
+ status=DatasetsResponseStatus.success,
+ msg=f"Dataset '{dataset_def.identifier}' registered",
+ )
+
+ async def get_dataset(
+ self,
+ dataset_identifier: str,
+ ) -> Optional[DatasetDef]:
+ try:
+ dataset_ref = DatasetRegistry.get(dataset_identifier).config
+ except ValueError as e:
+ return None
+
+ return dataset_ref
+
+ async def delete_dataset(self, dataset_identifier: str) -> DeleteDatasetResponse:
+ try:
+ DatasetRegistry.delete(dataset_identifier)
+ except ValueError as e:
+ return DeleteDatasetResponse(
+ status=DatasetsResponseStatus.fail,
+ msg=str(e),
+ )
+
+ return DeleteDatasetResponse(
+ status=DatasetsResponseStatus.success,
+ msg=f"Dataset '{dataset_identifier}' deleted",
+ )
+
+ async def list_datasets(self) -> List[DatasetDef]:
+ return [
+ DatasetRegistry.get(dataset_identifier).config
+ for dataset_identifier in DatasetRegistry.names()
+ ]
diff --git a/llama_stack/distribution/registry/datasets/dataset_wrappers.py b/llama_stack/distribution/registry/datasets/dataset_wrappers.py
new file mode 100644
index 0000000000..6c9af5887c
--- /dev/null
+++ b/llama_stack/distribution/registry/datasets/dataset_wrappers.py
@@ -0,0 +1,115 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import base64
+import io
+from urllib.parse import unquote
+
+import pandas
+from datasets import Dataset, load_dataset
+
+from llama_stack.apis.datasets import * # noqa: F403
+from llama_stack.providers.utils.memory.vector_store import parse_data_url
+
+
+class CustomDataset(BaseDataset[DictSample]):
+ def __init__(self, config: CustomDatasetDef) -> None:
+ super().__init__()
+ self.config = config
+ self.dataset = None
+ self.index = 0
+
+ @property
+ def dataset_id(self) -> str:
+ return self.config.identifier
+
+ def __iter__(self) -> Iterator[DictSample]:
+ if not self.dataset:
+ self.load()
+ return (DictSample(data=x) for x in self.dataset)
+
+ def __str__(self) -> str:
+ return f"CustomDataset({self.config})"
+
+ def __len__(self) -> int:
+ if not self.dataset:
+ self.load()
+ return len(self.dataset)
+
+ def load(self, n_samples: Optional[int] = None) -> None:
+ if self.dataset:
+ return
+
+ # TODO: more robust support w/ data url
+ if self.config.url.endswith(".csv"):
+ df = pandas.read_csv(self.config.url)
+ elif self.config.url.endswith(".xlsx"):
+ df = pandas.read_excel(self.config.url)
+ elif self.config.url.startswith("data:"):
+ parts = parse_data_url(self.config.url)
+ data = parts["data"]
+ if parts["is_base64"]:
+ data = base64.b64decode(data)
+ else:
+ data = unquote(data)
+ encoding = parts["encoding"] or "utf-8"
+ data = data.encode(encoding)
+
+ mime_type = parts["mimetype"]
+ mime_category = mime_type.split("/")[0]
+ data_bytes = io.BytesIO(data)
+
+ if mime_category == "text":
+ df = pandas.read_csv(data_bytes)
+ else:
+ df = pandas.read_excel(data_bytes)
+ else:
+ raise ValueError(f"Unsupported file type: {self.config.url}")
+
+ if n_samples is not None:
+ df = df.sample(n=min(n_samples, len(df)))
+
+ self.dataset = Dataset.from_pandas(df)
+ if self.config.rename_columns_map:
+ for k, v in self.config.rename_columns_map.items():
+ self.dataset = self.dataset.rename_column(k, v)
+
+
+class HuggingfaceDataset(BaseDataset[DictSample]):
+ def __init__(self, config: HuggingfaceDatasetDef):
+ super().__init__()
+ self.config = config
+ self.dataset = None
+
+ @property
+ def dataset_id(self) -> str:
+ return self.config.identifier
+
+ def __iter__(self) -> Iterator[DictSample]:
+ if not self.dataset:
+ self.load()
+ return (DictSample(data=x) for x in self.dataset)
+
+ def __str__(self):
+ return f"HuggingfaceDataset({self.config})"
+
+ def __len__(self):
+ if not self.dataset:
+ self.load()
+ return len(self.dataset)
+
+ def load(self, n_samples: Optional[int] = None):
+ if self.dataset:
+ return
+
+ if self.config.dataset_name:
+ self.config.kwargs["name"] = self.config.dataset_name
+
+ self.dataset = load_dataset(self.config.dataset_path, **self.config.kwargs)
+
+ if n_samples:
+ self.dataset = self.dataset.select(range(n_samples))
+
+ if self.config.rename_columns_map:
+ for k, v in self.config.rename_columns_map.items():
+ self.dataset = self.dataset.rename_column(k, v)
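Note: a quick sketch of how one of these wrappers is consumed; load() is lazy and triggered on the first iteration or len() call. The CSV URL reuses the public MMLU file from the client example earlier in this patch.

from llama_stack.apis.datasets.datasets import CustomDatasetDef
from llama_stack.distribution.registry.datasets.dataset_wrappers import CustomDataset

dataset = CustomDataset(
    CustomDatasetDef(
        identifier="mmlu-simple-eval-en",
        url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
    )
)
print(len(dataset))        # triggers the CSV download and load
for sample in dataset:     # each row becomes a DictSample
    print(sample.data)
    break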
diff --git a/llama_stack/distribution/registry/generator_processors/__init__.py b/llama_stack/distribution/registry/generator_processors/__init__.py
new file mode 100644
index 0000000000..862984f548
--- /dev/null
+++ b/llama_stack/distribution/registry/generator_processors/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.apis.evals import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.processor import * # noqa: F403
+
+from ..registry import Registry
+
+# TODO: decide whether we should group dataset+processor together via Tasks
+GeneratorProcessorRegistry = Registry[BaseGeneratorProcessor]()
+
+PROCESSOR_REGISTRY = {
+ "mmlu": MMLUProcessor,
+ "judge": JudgeProcessor,
+}
+
+for k, v in PROCESSOR_REGISTRY.items():
+ GeneratorProcessorRegistry.register(k, v)
diff --git a/llama_stack/distribution/registry/registry.py b/llama_stack/distribution/registry/registry.py
new file mode 100644
index 0000000000..702ed7d869
--- /dev/null
+++ b/llama_stack/distribution/registry/registry.py
@@ -0,0 +1,36 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import AbstractSet, Generic, TypeVar
+
+TRegistry = TypeVar("TRegistry")
+
+
+class Registry(Generic[TRegistry]):
+
+ def __init__(self) -> None:
+ super().__init__()
+ self.registry = {}
+
+ def names(self) -> AbstractSet[str]:
+ return self.registry.keys()
+
+ def register(self, name: str, task: TRegistry) -> None:
+ if name in self.registry:
+ raise ValueError(f"Dataset {name} already exists.")
+ self.registry[name] = task
+
+ def get(self, name: str) -> TRegistry:
+ if name not in self.registry:
+ raise ValueError(f"Dataset {name} not found.")
+ return self.registry[name]
+
+ def delete(self, name: str) -> None:
+ if name not in self.registry:
+ raise ValueError(f"Dataset {name} not found.")
+ del self.registry[name]
+
+ def reset(self) -> None:
+ self.registry = {}
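Note: Registry is a plain name-to-object map shared by the dataset, processor, and scorer registries introduced below; duplicate registrations and missing lookups raise ValueError. A tiny illustrative usage (the "greeting" entry is made up):

from llama_stack.distribution.registry.registry import Registry

registry: Registry[str] = Registry()
registry.register("greeting", "hello")
assert registry.get("greeting") == "hello"
registry.delete("greeting")
# registry.get("greeting") would now raise ValueError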
diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py
new file mode 100644
index 0000000000..dda71d4e00
--- /dev/null
+++ b/llama_stack/distribution/registry/scorers/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+# TODO: make these import config based
+from llama_stack.apis.evals import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.braintrust_scorer import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.llm_judge_scorer import * # noqa: F403
+
+from ..registry import Registry
+
+# TODO: make these import config based
+ScorerRegistry = Registry[BaseScorer]()
+
+SCORER_REGISTRY = {
+ "accuracy": AccuracyScorer,
+ "random": RandomScorer,
+ "llamastack-llm-judge": LlamaStackLLMJudgeScorer,
+ "braintrust::factuality": BraintrustFactualityScorer,
+ "braintrust::answer-correctness": BraintrustAnswerCorrectnessScorer,
+}
+
+for k, v in SCORER_REGISTRY.items():
+ ScorerRegistry.register(k, v)
diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py
index a05e08cd7c..e71c3fd8ce 100644
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@@ -12,6 +12,8 @@
from llama_stack.distribution.datatypes import * # noqa: F403
from llama_stack.apis.agents import Agents
+from llama_stack.apis.datasets import Datasets
+from llama_stack.apis.evals import Evals
from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect
from llama_stack.apis.memory import Memory
@@ -22,6 +24,7 @@
from llama_stack.apis.telemetry import Telemetry
from llama_stack.distribution.distribution import (
builtin_automatically_routed_apis,
+ builtin_registry_apis,
get_provider_registry,
)
from llama_stack.distribution.utils.dynamic import instantiate_class_type
@@ -38,6 +41,8 @@ def api_protocol_map() -> Dict[Api, Any]:
Api.safety: Safety,
Api.shields: Shields,
Api.telemetry: Telemetry,
+ Api.evals: Evals,
+ Api.datasets: Datasets,
}
@@ -137,6 +142,20 @@ async def resolve_impls_with_routing(run_config: StackRunConfig) -> Dict[Api, An
)
}
+ for info in builtin_registry_apis():
+ providers_with_specs[info.registry_api.value] = {
+ "__builtin__": ProviderWithSpec(
+ provider_id="__registry__",
+ provider_type="__registry__",
+ config={},
+ spec=RegistryProviderSpec(
+ api=info.registry_api,
+ module="llama_stack.distribution.registry",
+ deps__=[],
+ ),
+ )
+ }
+
sorted_providers = topological_sort(
{k: v.values() for k, v in providers_with_specs.items()}
)
@@ -257,6 +276,12 @@ async def instantiate_provider(
config = None
args = [provider_spec.api, inner_impls, deps]
+ elif isinstance(provider_spec, RegistryProviderSpec):
+ print("ROUTER PROVIDER SPEC")
+ method = "get_registry_impl"
+
+ config = None
+ args = [provider_spec.api, deps]
else:
method = "get_provider_impl"
diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py
index 777cd855b7..1d397c9e73 100644
--- a/llama_stack/providers/datatypes.py
+++ b/llama_stack/providers/datatypes.py
@@ -32,6 +32,9 @@ class Api(Enum):
# built-in API
inspect = "inspect"
+ evals = "evals"
+ datasets = "datasets"
+
class ModelsProtocolPrivate(Protocol):
async def list_models(self) -> List[ModelDef]: ...
diff --git a/llama_stack/providers/impls/meta_reference/evals/__init__.py b/llama_stack/providers/impls/meta_reference/evals/__init__.py
new file mode 100644
index 0000000000..f4dd4b79d6
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import MetaReferenceEvalsImplConfig # noqa
+from llama_stack.apis.inference import * # noqa: F403
+from llama_stack.distribution.datatypes import Api, ProviderSpec
+
+
+async def get_provider_impl(
+ config: MetaReferenceEvalsImplConfig, deps: Dict[Api, ProviderSpec]
+):
+ from .evals import MetaReferenceEvalsImpl
+
+ impl = MetaReferenceEvalsImpl(config, deps[Api.inference])
+ await impl.initialize()
+ return impl
diff --git a/llama_stack/providers/impls/meta_reference/evals/config.py b/llama_stack/providers/impls/meta_reference/evals/config.py
new file mode 100644
index 0000000000..05dee366ed
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/config.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pydantic import BaseModel
+
+
+class MetaReferenceEvalsImplConfig(BaseModel): ...
diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py
new file mode 100644
index 0000000000..7d3eaa85d8
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/evals.py
@@ -0,0 +1,58 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import json
+
+from termcolor import cprint
+
+from llama_stack.apis.inference import * # noqa: F403
+from llama_stack.apis.evals import * # noqa: F403
+from llama_stack.apis.datasets import * # noqa: F403
+
+from .config import MetaReferenceEvalsImplConfig
+from .tasks.run_eval_task import RunEvalTask
+from .tasks.run_scoring_task import RunScoringTask
+
+
+class MetaReferenceEvalsImpl(Evals):
+ def __init__(self, config: MetaReferenceEvalsImplConfig, inference_api: Inference):
+ self.inference_api = inference_api
+
+ async def initialize(self) -> None:
+ pass
+
+ async def shutdown(self) -> None:
+ pass
+
+ async def run_eval_task(
+ self,
+ eval_task_config: EvaluateTaskConfig,
+ ) -> EvaluateResponse:
+ cprint(f"run_eval_task: on {eval_task_config}", "green")
+
+ run_task = RunEvalTask()
+ eval_result = await run_task.run(eval_task_config, self.inference_api)
+
+ return EvaluateResponse(
+ eval_result=eval_result,
+ formatted_report=json.dumps(eval_result.json(), indent=4),
+ )
+
+ async def run_scorer(
+ self,
+ dataset_config: EvaluateDatasetConfig,
+ eval_scoring_config: EvaluateScoringConfig,
+ ) -> EvaluateResponse:
+ cprint(f"run_scorer: on {dataset_config} with {eval_scoring_config}", "green")
+
+ run_task = RunScoringTask()
+ eval_result = await run_task.run(
+ dataset_config, eval_scoring_config, self.inference_api
+ )
+
+ return EvaluateResponse(
+ eval_result=eval_result,
+ formatted_report=json.dumps(eval_result.json(), indent=4),
+ )
diff --git a/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py b/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py
new file mode 100644
index 0000000000..756f351d88
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/generator/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py
new file mode 100644
index 0000000000..dafbb16f5b
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/generator/inference_generator.py
@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from termcolor import cprint
+
+from llama_stack.apis.evals import * # noqa: F403
+from llama_stack.apis.inference import * # noqa: F403
+
+
+class InferenceGenerator(BaseGenerator[PreprocessedSample, GenerationResponseSample]):
+ """
+ InferenceGenerator for LlamaStack
+ """
+
+ def __init__(
+ self,
+ model,
+ inference_api,
+ *args,
+ **kwargs,
+ ) -> None:
+ super().__init__(*args, **kwargs)
+ self.model = model
+ self.inference_api = inference_api
+
+ async def generate(
+ self, preprocessed_dataset: List[PreprocessedSample]
+ ) -> List[GenerationResponseSample]:
+ generation_outputs = []
+ for sample in preprocessed_dataset:
+ response = await self.inference_api.chat_completion(
+ model=self.model,
+ messages=sample.generation_input.messages,
+ stream=False,
+ )
+ cprint(f"response: {response}", "cyan")
+
+ generation_outputs.append(
+ GenerationResponseSample(
+ generation_output=GenerationOutput(
+ completion_message=response.completion_message.content
+ )
+ )
+ )
+ return generation_outputs
diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py
new file mode 100644
index 0000000000..5a7ca27958
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from .judge_processor import JudgeProcessor # noqa: F401
+from .mmlu_processor import MMLUProcessor # noqa: F401
diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py b/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py
new file mode 100644
index 0000000000..d7d6ae3eb2
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/processor/judge_processor.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import re
+
+from llama_stack.apis.evals import * # noqa: F403
+
+JUDGE_PROMPT = """
+You will be given a question, an expected_answer, and a system_answer.
+Your task is to provide a 'total rating' scoring how well the system_answer matches the ground truth in expected_answer in terms of factual correctness to the question.
+Give your answer as an integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question.
+
+Provide your feedback as follows:
+
+Feedback:::
+Total rating: (your rating, as an int between 0 and 5)
+
+Now here are the question, expected_answer, system_answer.
+
+Question: {question}
+Expected Answer: {expected_answer}
+System Answer: {answer}
+
+Feedback:::
+Total rating:
+"""
+
+
+class JudgeProcessor(
+ BaseGeneratorProcessor[
+ DictSample, PreprocessedSample, GenerationResponseSample, ScorerInputSample
+ ]
+):
+ """
+ Generator processor for LLM Judge
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def preprocess_sample(self, sample: DictSample) -> PreprocessedSample:
+ content = JUDGE_PROMPT.format(
+ question=sample.data["input_query"],
+ expected_answer=sample.data["expected_answer"],
+ answer=sample.data["generated_answer"],
+ )
+ preprocessed_msgs = [
+ {
+ "role": "user",
+ "content": content,
+ }
+ ]
+ processed_sample = PreprocessedSample(
+ generation_input=GenerationInput(
+ messages=preprocessed_msgs,
+ )
+ )
+ return processed_sample
+
+ def postprocess_sample(
+ self, generation_sample: GenerationResponseSample, dataset_sample: DictSample
+ ) -> ScorerInputSample:
+ response_text = generation_sample.generation_output.completion_message
+ match = re.search(r"Total rating: (\d+)", response_text)
+ # fall back to 0 when the judge response contains no parsable rating
+ judge_rating = int(match.group(1)) if match else 0
+
+ return ScorerInputSample(
+ generated_answer=str(judge_rating),
+ expected_answer=dataset_sample.data["expected_answer"],
+ generation_output=PostprocessedGeneration(
+ completion_message=response_text,
+ ),
+ )
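Note: postprocess_sample above assumes the judge echoes the "Total rating:" line requested by JUDGE_PROMPT; a tiny illustrative check of the extraction (the response text is made up):

import re

response_text = "Feedback:::\nTotal rating: 4"
match = re.search(r"Total rating: (\d+)", response_text)
judge_rating = int(match.group(1)) if match else 0  # -> 4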
diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py
new file mode 100644
index 0000000000..fc2d9eb642
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/processor/mmlu_processor.py
@@ -0,0 +1,161 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import re
+
+from llama_stack.apis.evals import * # noqa: F403
+
+QUERY_TEMPLATE_MULTICHOICE = """
+Answer the following multiple choice question and make the answer very simple. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD.
+
+{Question}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
+
+MULTILINGUAL_ANSWER_REGEXES = [
+ r"Answer\s*:",
+ r"Answer\s*:", # Korean invisible character
+ r"উত্তর\s*:",
+ r"उत्तर\s*:",
+ r"উত্তরঃ",
+ r"উত্তর\s*:",
+ r"Antwort\s*:",
+ r"답변\s*:",
+ r"정답\s*:",
+ r"답\s*:",
+ r"答案\s*:",
+ r"答案\s*:",
+ r"答\s*:",
+ r"答\s*:",
+ r"答复\s*:",
+ r"答曰\s*:",
+ r"الإجابة:",
+ r"الجواب:",
+ r"إجابة:",
+ r"الإجابة النهائية:",
+ r"الإجابة الصحيحة:",
+ r"الإجابة الصحيحة هي:",
+ r"الإجابة هي:",
+ r"Respuesta\s*:",
+ r"Risposta\s*:",
+ r"答え\s*:",
+ r"答え\s*:",
+ r"回答\s*:",
+ r"回答\s*:",
+ r"解答\s*:",
+ r"Jawaban\s*:",
+ r"Réponse\s*:",
+ r"Resposta\s*:",
+ r"Jibu\s*:",
+ r"Idahun\s*:",
+ r"Ìdáhùn\s*:",
+ r"Idáhùn\s*:",
+ r"Àmọ̀nà\s*:",
+ r"Àdáhùn\s*:",
+ r"Ànúgọ\s*:",
+ r"Àṣàyàn\s*:",
+]
+
+MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = (
+ r"(?i){}\s*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[A]|[B]|[C]|[D])"
+)
+
+
+def normalize_response(response: str) -> str:
+ """
+ Normalize the response by removing markdown and LaTeX formatting that may prevent a match.
+ """
+
+ return (
+ response.replace("**", "")
+ .replace("$\\boxed{", "")
+ .replace("}$", "")
+ .replace("\\$", "")
+ .replace("$\\text{", "")
+ .replace("$", "")
+ .replace("\\mathrm{", "")
+ .replace("\\{", "")
+ .replace("\\text", "")
+ .replace("\\(", "")
+ .replace("\\mathbf{", "")
+ .replace("{", "")
+ .replace("\\boxed", "")
+ )
+
+
+def normalize_extracted_answer(extracted_answer: str) -> str:
+ return (
+ # In arabic these are the letters used for A-D in multiple choice questions
+ extracted_answer.replace("أ", " A")
+ .replace("ب", " B")
+ .replace("ج", " C")
+ .replace("د", " D")
+ # In Bengali these are the letters used for A-D in multiple choice questions
+ .replace("অ", " A")
+ .replace("ব", " B")
+ .replace("ড", " C")
+ .replace("ঢ", " D")
+ # In Japanese these are the letters sometimes used for A-D in multiple choice questions
+ .replace("A", " A")
+ .replace("B", " B")
+ .replace("C", " C")
+ .replace("D", " D")
+ .strip()
+ )
+
+
+class MMLUProcessor(
+ BaseGeneratorProcessor[
+ DictSample, PreprocessedSample, GenerationResponseSample, ScorerInputSample
+ ]
+):
+ """
+ Generator processor for MMLU
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def preprocess_sample(self, sample: DictSample) -> PreprocessedSample:
+ content = QUERY_TEMPLATE_MULTICHOICE.format(**sample.data)
+ preprocessed_msgs = [
+ {
+ "role": "user",
+ "content": content,
+ }
+ ]
+ processed_sample = PreprocessedSample(
+ generation_input=GenerationInput(
+ messages=preprocessed_msgs,
+ )
+ )
+ return processed_sample
+
+ def postprocess_sample(
+ self, generation_sample: GenerationResponseSample, dataset_sample: DictSample
+ ) -> ScorerInputSample:
+ response_text = generation_sample.generation_output.completion_message
+ normalized_response = normalize_response(response_text)
+
+ # extract answer
+ extracted_answer = ""
+ for answer_regex in MULTILINGUAL_ANSWER_REGEXES:
+ regex = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex)
+ match = re.search(regex, normalized_response)
+ if match:
+ extracted_answer = normalize_extracted_answer(match.group(1))
+ break
+
+ return ScorerInputSample(
+ generated_answer=extracted_answer,
+ expected_answer=dataset_sample.data["Answer"],
+ generation_output=PostprocessedGeneration(
+ completion_message=response_text,
+ ),
+ )
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py
new file mode 100644
index 0000000000..6424963f87
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from .basic_scorers import * # noqa: F401 F403
+from .aggregate_scorer import * # noqa: F401 F403
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py
new file mode 100644
index 0000000000..1a0621960e
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult
+from llama_stack.apis.datasets.datasets import * # noqa: F401 F403
+
+
+class AggregateScorer(BaseScorer[ScorerInputSample]):
+ def __init__(self, scorers: List[BaseScorer[ScorerInputSample]]):
+ self.scorers = scorers
+
+ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+ all_score_data = {}
+ for scorer in self.scorers:
+ score_data = scorer.score_sample(scorer_input_sample).score_data
+ for k, v in score_data.items():
+ all_score_data[k] = v
+
+ return SingleEvalResult(
+ score_data=all_score_data,
+ )
+
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ all_metrics = {}
+
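+        # Prefix each metric with the scorer class name so metrics from different scorers do not collide.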
+ for scorer in self.scorers:
+ metrics = scorer.aggregate_results(eval_results).metrics
+ for k, v in metrics.items():
+ all_metrics[f"{scorer.__class__.__name__}:{k}"] = v
+
+ return EvalResult(
+ metrics=all_metrics,
+ )
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py
new file mode 100644
index 0000000000..748f9fc1f8
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import random
+
+from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult
+from llama_stack.apis.datasets.datasets import * # noqa: F401 F403
+
+
+class RandomScorer(BaseScorer[ScorerInputSample]):
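+    """Baseline scorer that assigns a uniform random score in [0, 1) to every sample."""
+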
+ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+ return SingleEvalResult(score_data={"random": random.random()})
+
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ avg_random = sum(
+ [result.score_data["random"] for result in eval_results]
+ ) / len(eval_results)
+ max_random = max([result.score_data["random"] for result in eval_results])
+ return EvalResult(
+ metrics={
+ "avg_random": avg_random,
+ "max_random": max_random,
+ }
+ )
+
+
+class AccuracyScorer(BaseScorer[ScorerInputSample]):
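+    """Exact-match accuracy; the expected answer may be a single value or a list of acceptable values."""
+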
+ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+ extracted_answer = scorer_input_sample.generated_answer
+ expected_answer = scorer_input_sample.expected_answer
+
+ if isinstance(expected_answer, list):
+ accuracy = (
+ 1.0 if extracted_answer and extracted_answer in expected_answer else 0.0
+ )
+ else:
+ accuracy = (
+ 1.0 if extracted_answer and extracted_answer == expected_answer else 0.0
+ )
+
+ return SingleEvalResult(score_data={"accuracy": accuracy})
+
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ num_correct = sum([result.score_data["accuracy"] for result in eval_results])
+ num_total = len(eval_results)
+
+ return EvalResult(
+ metrics={
+ "avg_accuracy": num_correct / num_total,
+ "num_correct": num_correct,
+ "num_total": num_total,
+ }
+ )
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py
new file mode 100644
index 0000000000..c124aaad6a
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/braintrust_scorer.py
@@ -0,0 +1,57 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import numpy as np
+
+from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult
+from llama_stack.apis.datasets.datasets import * # noqa: F401 F403
+from autoevals.llm import * # noqa: F403
+from autoevals.ragas import * # noqa: F403
+
+
+class BraintrustFactualityScorer(BaseScorer[ScorerInputSample]):
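+    """Scores factuality of the generated answer against the expected answer using the autoevals Factuality grader."""
+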
+ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+ input_query = scorer_input_sample.input_query
+ generated_answer = scorer_input_sample.generated_answer
+ expected_answer = scorer_input_sample.expected_answer
+
+ evaluator = Factuality()
+ result = evaluator(generated_answer, expected_answer, input=input_query)
+ factuality = result.score
+ return SingleEvalResult(score_data={"factuality": factuality})
+
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ avg_score = np.average(
+ [result.score_data["factuality"] for result in eval_results]
+ )
+
+ return EvalResult(
+ metrics={
+ "avg_factuality_score": avg_score,
+ }
+ )
+
+
+class BraintrustAnswerCorrectnessScorer(BaseScorer[ScorerInputSample]):
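+    """Scores the generated answer against the expected answer using the autoevals AnswerCorrectness grader."""
+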
+ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+ input_query = scorer_input_sample.input_query
+ generated_answer = scorer_input_sample.generated_answer
+ expected_answer = scorer_input_sample.expected_answer
+
+ evaluator = AnswerCorrectness()
+ result = evaluator(generated_answer, expected_answer, input=input_query)
+ correctness = result.score
+ return SingleEvalResult(score_data={"answer_correctness": correctness})
+
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ avg_score = np.average(
+ [result.score_data["answer_correctness"] for result in eval_results]
+ )
+
+ return EvalResult(
+ metrics={
+ "avg_correctness_score": avg_score,
+ }
+ )
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py
new file mode 100644
index 0000000000..f5f56b435f
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/llm_judge_scorer.py
@@ -0,0 +1,83 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import asyncio
+import threading
+
+import numpy as np
+
+from llama_stack.distribution.registry.generator_processors import (
+ GeneratorProcessorRegistry,
+)
+from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import (
+ InferenceGenerator,
+)
+
+from llama_stack.apis.evals.evals import * # noqa: F401 F403
+from llama_stack.apis.datasets.datasets import * # noqa: F401 F403
+from llama_stack.apis.inference import * # noqa: F403
+
+
+class LlamaStackLLMJudgeScorer(BaseScorer[ScorerInputSample]):
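+    """LLM-as-judge scorer: a generator processor builds the judge prompt, the judge model generates through the
+    inference API, and the judge's answer is parsed into a numeric score."""
+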
+ def __init__(self, llm_judge_config: LLMJudgeConfig, inference_api: Inference):
+ self.llm_judge_config = llm_judge_config
+ self.inference_api = inference_api
+ # https://stackoverflow.com/questions/74703727/how-to-call-async-function-from-sync-funcion-and-get-result-while-a-loop-is-alr
+        # We will use another thread with its own event loop to run the async api within a sync function
+ self._loop = asyncio.new_event_loop()
+ self._thr = threading.Thread(
+ target=self._loop.run_forever, name="Async Runner", daemon=True
+ )
+ if not self._thr.is_alive():
+ self._thr.start()
+
+ def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+ input_query = scorer_input_sample.input_query
+ generated_answer = scorer_input_sample.generated_answer
+ expected_answer = scorer_input_sample.expected_answer
+
+        # Judge F1: build the judge prompt from the sample via the configured generator processor
+ processor = GeneratorProcessorRegistry.get(
+ self.llm_judge_config.judge_processor_config.processor_identifier
+ )()
+ data_sample = DictSample(
+ data={
+ "input_query": input_query,
+ "generated_answer": generated_answer,
+ "expected_answer": expected_answer,
+ }
+ )
+ preprocessed_sample = processor.preprocess_sample(data_sample)
+
+ # Judge Generation
+ generator = InferenceGenerator(
+ model=self.llm_judge_config.judge_model_generation_config.model,
+ inference_api=self.inference_api,
+ )
+
+ future = asyncio.run_coroutine_threadsafe(
+ generator.generate([preprocessed_sample]), self._loop
+ )
+ generation_outputs = future.result()
+        # Judge F2: postprocess the judge generation into a scorer input
+ postprocessed_sample = processor.postprocess_sample(
+ generation_outputs[0], data_sample
+ )
+
+        # Judge F3: the judge is expected to output a numeric score
+ score = float(postprocessed_sample.generated_answer)
+
+ return SingleEvalResult(score_data={"judge_score": score})
+
+ def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+ avg_score = np.average(
+ [result.score_data["judge_score"] for result in eval_results]
+ )
+
+ return EvalResult(
+ metrics={
+ "avg_judge_score": avg_score,
+ }
+ )
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py b/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py
new file mode 100644
index 0000000000..756f351d88
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py
new file mode 100644
index 0000000000..fbd98128f1
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.distribution.registry.datasets import DatasetRegistry
+from llama_stack.distribution.registry.generator_processors import (
+ GeneratorProcessorRegistry,
+)
+from llama_stack.distribution.registry.scorers import ScorerRegistry
+
+from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import (
+ InferenceGenerator,
+)
+
+
+from llama_stack.apis.evals import * # noqa: F403
+from llama_stack.apis.inference import * # noqa: F403
+from termcolor import cprint
+
+
+class RunEvalTask(BaseTask):
+ """
+    RunEvalTask - run the full eval pipeline (preprocess, generate, postprocess, score) over a dataset
+ """
+
+ def __init__(
+ self,
+ *args,
+ **kwargs,
+ ) -> None:
+ super().__init__(*args, **kwargs)
+
+ async def run(
+ self,
+ eval_task_config: EvaluateTaskConfig,
+ inference_api: Inference,
+ *args,
+ **kwargs,
+ ) -> EvalResult:
+ print(f"Running eval task w/ {eval_task_config}")
+
+ print(DatasetRegistry.names())
+ dataset = DatasetRegistry.get(
+ eval_task_config.dataset_config.dataset_identifier
+ )
+ dataset.load(n_samples=eval_task_config.dataset_config.row_limit)
+ print(f"Running on {len(dataset)} samples")
+
+        # F1: preprocess dataset samples into generation inputs
+ print(GeneratorProcessorRegistry.names())
+ processor = GeneratorProcessorRegistry.get(
+ eval_task_config.processor_config.processor_identifier
+ )()
+ preprocessed = processor.preprocess(dataset)
+
+ # Generation
+ generator = InferenceGenerator(
+ model=eval_task_config.generation_config.model,
+ inference_api=inference_api,
+ )
+ generation_outputs = await generator.generate(preprocessed)
+
+        # F2: postprocess generation outputs into scorer inputs
+ postprocessed = processor.postprocess(generation_outputs, dataset)
+ cprint(postprocessed, "blue")
+
+        # F3: build scorers from the scoring config and score the postprocessed samples
+ scorer_config_list = eval_task_config.scoring_config.scorer_config_list
+ scorer_list = []
+ for s_conf in scorer_config_list:
+ scorer = ScorerRegistry.get(s_conf.scorer_name)
+ if s_conf.llm_judge_config:
+ scorer_list.append(
+ scorer(
+ llm_judge_config=s_conf.llm_judge_config,
+ inference_api=inference_api,
+ )
+ )
+ else:
+ scorer_list.append(scorer())
+
+ scorer = AggregateScorer(
+ scorers=scorer_list,
+ )
+
+ scorer_results = scorer.score(postprocessed)
+ cprint(scorer_results, "magenta")
+ eval_result = scorer.aggregate_results(scorer_results)
+
+ return eval_result
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py
new file mode 100644
index 0000000000..6b11191f1e
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_scoring_task.py
@@ -0,0 +1,90 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.distribution.registry.datasets import DatasetRegistry
+from llama_stack.distribution.registry.scorers import ScorerRegistry
+
+from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403
+
+from llama_stack.apis.evals import * # noqa: F403
+from llama_stack.apis.inference import * # noqa: F403
+
+
+class RunScoringTask(BaseTask):
+ """
+ RunScoringTask - only run scoring (F3) based on dataset and scoring config
+ """
+
+ def __init__(
+ self,
+ *args,
+ **kwargs,
+ ) -> None:
+ super().__init__(*args, **kwargs)
+
+ def transform_score_input_sample(
+ self, dataset: BaseDataset
+ ) -> List[ScorerInputSample]:
+ scorer_inputs = []
+ for x in dataset:
+ expected_answer = x.data["expected_answer"]
+ generated_answer = x.data["generated_answer"]
+ input_query = None
+ if "input_query" in x.data:
+ input_query = x.data["input_query"]
+
+ scorer_inputs.append(
+ ScorerInputSample(
+ expected_answer=expected_answer,
+ generated_answer=generated_answer,
+ input_query=input_query,
+ )
+ )
+
+ return scorer_inputs
+
+ async def run(
+ self,
+ dataset_config: EvaluateDatasetConfig,
+ eval_scoring_config: EvaluateScoringConfig,
+ inference_api: Inference,
+ *args,
+ **kwargs,
+ ) -> EvalResult:
+ print(
+ f"Running scoring task w/ dataset={dataset_config} scoring={eval_scoring_config}"
+ )
+
+ dataset = DatasetRegistry.get(dataset_config.dataset_identifier)
+ dataset.load(n_samples=dataset_config.row_limit)
+ print(f"Running on {len(dataset)} samples")
+
+ # transform dataset into List[ScorerInputSample]
+ postprocessed = self.transform_score_input_sample(dataset)
+
+        # F3: build scorers from the scoring config and score the transformed samples
+ scorer_config_list = eval_scoring_config.scorer_config_list
+ scorer_list = []
+ for s_conf in scorer_config_list:
+ scorer = ScorerRegistry.get(s_conf.scorer_name)
+ if s_conf.llm_judge_config:
+ scorer_list.append(
+ scorer(
+ llm_judge_config=s_conf.llm_judge_config,
+ inference_api=inference_api,
+ )
+ )
+ else:
+ scorer_list.append(scorer())
+
+ scorer = AggregateScorer(
+ scorers=scorer_list,
+ )
+
+ scorer_results = scorer.score(postprocessed)
+ eval_result = scorer.aggregate_results(scorer_results)
+
+ return eval_result
diff --git a/llama_stack/providers/impls/third_party/evals/__init__.py b/llama_stack/providers/impls/third_party/evals/__init__.py
new file mode 100644
index 0000000000..756f351d88
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py b/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py
new file mode 100644
index 0000000000..9886ed6d6c
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import EleutherEvalsImplConfig # noqa
+from llama_stack.apis.inference import * # noqa: F403
+from llama_stack.distribution.datatypes import Api, ProviderSpec
+
+
+async def get_provider_impl(
+ config: EleutherEvalsImplConfig, deps: Dict[Api, ProviderSpec]
+):
+ from .eleuther import EleutherEvalsAdapter
+
+ impl = EleutherEvalsAdapter(config, deps[Api.inference])
+ await impl.initialize()
+ return impl
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/config.py b/llama_stack/providers/impls/third_party/evals/eleuther/config.py
new file mode 100644
index 0000000000..a9ab297b42
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/config.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pydantic import BaseModel
+
+
+class EleutherEvalsImplConfig(BaseModel): ...
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py
new file mode 100644
index 0000000000..e4b32a45e0
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py
@@ -0,0 +1,170 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import asyncio
+from llama_stack.apis.inference import * # noqa: F403
+from llama_stack.apis.evals import * # noqa: F403
+import os
+import random
+import threading
+from pathlib import Path
+
+import lm_eval
+import tqdm
+from lm_eval.api.model import LM
+from lm_eval.evaluator import evaluate, get_task_list
+from lm_eval.tasks import get_task_dict, TaskManager
+from termcolor import cprint
+
+from .config import EleutherEvalsImplConfig
+
+
+# https://stackoverflow.com/questions/74703727/how-to-call-async-function-from-sync-funcion-and-get-result-while-a-loop-is-alr
+# We will use another thread with its own event loop to run the async api within a sync function
+_loop = asyncio.new_event_loop()
+_thr = threading.Thread(target=_loop.run_forever, name="Async Runner", daemon=True)
+
+
+class EleutherEvalsWrapper(LM):
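+    """Adapts the Llama Stack inference API to the lm-evaluation-harness LM interface; only generate_until is backed
+    by real inference, while loglikelihood is stubbed with random values."""
+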
+ def __init__(
+ self,
+ inference_api: Inference,
+ model: str,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.inference_api = inference_api
+ self.model = model
+ self.tokenizer = None
+ self.tokenized_requests = False
+ self.kwargs = kwargs
+
+ @property
+ def eot_token_id(self):
+ raise NotImplementedError("Not implemented")
+
+ @property
+ def max_length(self) -> int:
+        raise NotImplementedError("Not implemented")
+
+ @property
+ def max_gen_toks(self) -> int:
+        raise NotImplementedError("Not implemented")
+
+ @property
+ def batch_size(self):
+ # Isn't used because we override _loglikelihood_tokens
+ raise NotImplementedError("No support for logits.")
+
+ @property
+ def device(self):
+ # Isn't used because we override _loglikelihood_tokens
+ raise NotImplementedError("No support for logits.")
+
+ @property
+ def world_size(self):
+ return 1
+
+ def tok_encode(self, string: str) -> List[int]:
+        raise NotImplementedError("Not implemented")
+
+ def tok_decode(self, tokens: List[int]) -> str:
+        raise NotImplementedError("Not implemented")
+
+ def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
+ raise NotImplementedError("No support for logits.")
+
+ def _model_call(self, inps):
+ # Isn't used because we override _loglikelihood_tokens
+ raise NotImplementedError()
+
+ def _model_generate(self, context, max_length, eos_token_id):
+ # Isn't used because we override generate_until
+ raise NotImplementedError()
+
+ def loglikelihood(self, requests, disable_tqdm: bool = False):
+ # TODO: implement inference completion with loglikelihood
+ res = []
+ for req in requests:
+ res.append((-random.random(), False))
+
+ return res
+
+ def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
+ raise NotImplementedError("No support for logits.")
+
+ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
+ res = []
+ if not _thr.is_alive():
+ _thr.start()
+ for req in tqdm.tqdm(requests):
+ chat_completion_coro_fn = self.inference_api.chat_completion(
+ model=self.model,
+ messages=[
+ {
+ "role": "user",
+ "content": req.args[0],
+ }
+ ],
+ stream=False,
+ )
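+            # Run the async chat completion on the background event loop and block until the result is ready.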
+ future = asyncio.run_coroutine_threadsafe(chat_completion_coro_fn, _loop)
+ response = future.result()
+ res.append(response.completion_message.content)
+
+ return res
+
+
+class EleutherEvalsAdapter(Evals):
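+    """Evals provider that runs benchmark tasks through the EleutherAI lm-evaluation-harness."""
+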
+ def __init__(self, config: EleutherEvalsImplConfig, inference_api: Inference):
+ self.inference_api = inference_api
+
+ async def initialize(self) -> None:
+ pass
+
+ async def shutdown(self) -> None:
+ pass
+
+ async def run_evals(
+ self,
+ model: str,
+ task: str,
+ dataset: Optional[str] = None,
+ eval_task_config: Optional[EvaluateTaskConfig] = None,
+ ) -> EvaluateResponse:
+ cprint(f"Eleuther Evals: {model} {dataset} {task}", "red")
+
+        eleuther_wrapper = EleutherEvalsWrapper(self.inference_api, model)
+ current_dir = Path(os.path.dirname(os.path.abspath(__file__)))
+
+ # custom registry of harness tasks
+ task_manager = TaskManager(
+ include_path=str(current_dir / "tasks"),
+ )
+
+ task_dict = get_task_dict(task, task_manager)
+ cprint(task_dict, "blue")
+
+ task_types = set([t.task.OUTPUT_TYPE for t in get_task_list(task_dict)])
+ cprint(task_types, "cyan")
+
+ output = evaluate(
+            eleuther_wrapper,
+ task_dict,
+            limit=eval_task_config.n_samples if eval_task_config else None,
+ )
+
+ eval_result = EvalResult(
+ metrics={},
+ )
+ formatted_output = lm_eval.utils.make_table(output)
+
+ cprint(formatted_output, "green")
+
+ return EvaluateResponse(
+ eval_result=eval_result,
+ formatted_report=formatted_output,
+ )
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml
new file mode 100644
index 0000000000..e10277a314
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/ifeval.yaml
@@ -0,0 +1,32 @@
+task: meta_ifeval
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__ifeval__strict__details
+output_type: generate_until
+test_split: latest
+process_docs: !function utils.process_docs
+num_fewshot: 0
+doc_to_text: prompt
+doc_to_target: 0
+generation_kwargs:
+ until: []
+ do_sample: false
+ temperature: 0.0
+ max_gen_toks: 1280
+process_results: !function utils.process_results
+metric_list:
+ - metric: prompt_level_strict_acc
+ aggregation: mean
+ higher_is_better: true
+ - metric: inst_level_strict_acc
+ aggregation: !function utils.agg_inst_level_acc
+ higher_is_better: true
+ - metric: prompt_level_loose_acc
+ aggregation: mean
+ higher_is_better: true
+ - metric: inst_level_loose_acc
+ aggregation: !function utils.agg_inst_level_acc
+ higher_is_better: true
+metadata:
+ version: 2.0
+fewshot_config:
+ sampler: first_n
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py
new file mode 100644
index 0000000000..aa171343fd
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_ifeval/utils.py
@@ -0,0 +1,191 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import dataclasses
+from typing import Dict, Optional, Union
+
+import datasets
+
+from lm_eval.tasks.ifeval import instructions_registry
+
+
+@dataclasses.dataclass
+class InputExample:
+ key: int
+ instruction_id_list: list[str]
+ prompt: str
+ kwargs: list[Dict[str, Optional[Union[str, int]]]]
+
+
+@dataclasses.dataclass
+class OutputExample:
+ instruction_id_list: list[str]
+ prompt: str
+ response: str
+ follow_all_instructions: bool
+ follow_instruction_list: list[bool]
+
+
+def test_instruction_following_strict(
+ inp,
+ response,
+):
+ """Tests response to see if instructions are followed."""
+ instruction_list = inp.instruction_id_list
+ is_following_list = []
+
+ for index, instruction_id in enumerate(instruction_list):
+ instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+ instruction = instruction_cls(instruction_id)
+
+ # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+ kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+ instruction.build_description(**kwargs)
+ args = instruction.get_instruction_args()
+ if args and "prompt" in args:
+ instruction.build_description(prompt=inp.prompt)
+
+ if response.strip() and instruction.check_following(response):
+ is_following_list.append(True)
+ else:
+ is_following_list.append(False)
+
+ return OutputExample(
+ instruction_id_list=inp.instruction_id_list,
+ prompt=inp.prompt,
+ response=response,
+ follow_all_instructions=all(is_following_list),
+ follow_instruction_list=is_following_list,
+ )
+
+
+def test_instruction_following_loose(
+ inp,
+ response,
+):
+ """Tests response for an upper bound for following instructions."""
+ r = response.split("\n")
+ response_remove_first = "\n".join(r[1:]).strip()
+ response_remove_last = "\n".join(r[:-1]).strip()
+ response_remove_both = "\n".join(r[1:-1]).strip()
+ revised_response = response.replace("*", "")
+ revised_response_remove_first = response_remove_first.replace("*", "")
+ revised_response_remove_last = response_remove_last.replace("*", "")
+ revised_response_remove_both = response_remove_both.replace("*", "")
+ all_responses = [
+ response,
+ revised_response,
+ response_remove_first,
+ response_remove_last,
+ response_remove_both,
+ revised_response_remove_first,
+ revised_response_remove_last,
+ revised_response_remove_both,
+ ]
+ instruction_list = inp.instruction_id_list
+ is_following_list = []
+
+ for index, instruction_id in enumerate(instruction_list):
+ instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+ instruction = instruction_cls(instruction_id)
+
+ # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+ kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+ instruction.build_description(**kwargs)
+ args = instruction.get_instruction_args()
+ if args and "prompt" in args:
+ instruction.build_description(prompt=inp.prompt)
+
+ is_following = False
+ for r in all_responses:
+ if r.strip() and instruction.check_following(r):
+ is_following = True
+ break
+
+ is_following_list.append(is_following)
+
+ return OutputExample(
+ instruction_id_list=inp.instruction_id_list,
+ prompt=inp.prompt,
+ response=response,
+ follow_all_instructions=all(is_following_list),
+ follow_instruction_list=is_following_list,
+ )
+
+
+def process_results(doc, results):
+ new_kwargs = []
+ for item in doc["kwargs"]:
+ if item["nth_paragraph"]:
+ item["nth_paragraph"] = int(item["nth_paragraph"])
+ new_kwargs.append(item)
+ inp = InputExample(
+ key=doc["key"],
+ instruction_id_list=doc["instruction_id_list"],
+ prompt=doc["prompt"],
+ kwargs=new_kwargs,
+ )
+ response = results[0]
+
+ out_strict = test_instruction_following_strict(inp, response)
+ out_loose = test_instruction_following_loose(inp, response)
+
+ return {
+ "prompt_level_strict_acc": out_strict.follow_all_instructions,
+ "inst_level_strict_acc": out_strict.follow_instruction_list,
+ "prompt_level_loose_acc": out_loose.follow_all_instructions,
+ "inst_level_loose_acc": out_loose.follow_instruction_list,
+ }
+
+
+def agg_inst_level_acc(items):
+ flat_items = [item for sublist in items for item in sublist]
+ inst_level_acc = sum(flat_items) / len(flat_items)
+ return inst_level_acc
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+ def _get_question(example: dict) -> dict:
+ # get the question from the ifeval dataset
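+        # The raw value is a JSON-like string; map its null/true/false literals to Python and eval it to get the dialog body.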
+ example["input_question"] = (
+ eval(
+ example["input_question"]
+ .replace("null", "None")
+ .replace("true", "True")
+ .replace("false", "False")
+ )["dialog"][0]["body"]
+ .replace("Is it True that the first song", "Is it true that the first song")
+ .replace("Is the following True", "Is the following true")
+ )
+ example["input_final_prompts"] = example["input_final_prompts"][0]
+ return example
+
+ original_dataset_name = "wis-k/instruction-following-eval"
+ ifeval_data = datasets.load_dataset(original_dataset_name, split="train")
+ ifeval_df = ifeval_data.to_pandas()
+ ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"})
+
+ meta_dataset = dataset.map(_get_question)
+ meta_df = meta_dataset.to_pandas()
+
+ # join the two datasets on the input_question column
+ joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question")
+ joined = joined.rename(columns={"input_final_prompts": "prompt"})
+ joined = joined.rename(columns={"is_correct": "previous_is_correct"})
+ joined = datasets.Dataset.from_pandas(joined)
+ joined = joined.select_columns(
+ [
+ "input_question",
+ "prompt",
+ "previous_is_correct",
+ "instruction_id_list",
+ "kwargs",
+ "output_prediction_text",
+ "key",
+ ]
+ )
+    joined = joined.rename_column(
+        "output_prediction_text", "previous_output_prediction_text"
+    )
+ return joined
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml
new file mode 100644
index 0000000000..1ec3c107d8
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml
@@ -0,0 +1,29 @@
+task: meta_mmlu_pro_instruct
+dataset_path: meta-llama/Llama-3.1-8B-Instruct-evals
+dataset_name: Llama-3.1-8B-Instruct-evals__mmlu_pro__details
+test_split: latest
+output_type: generate_until
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+filter_list:
+ - name: "strict-match"
+ filter:
+ - function: "regex"
+ group_select: -1
+ regex_pattern: 'best answer is ([A-Z])'
+ - function: "take_first"
+generation_kwargs:
+ until: []
+ do_sample: false
+ temperature: 0
+ max_gen_toks: 1024
+num_fewshot: 0
+metric_list:
+ - metric: exact_match
+ aggregation: mean
+ higher_is_better: true
+ ignore_case: true
+ ignore_punctuation: true
+metadata:
+ version: 1.0
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py
new file mode 100644
index 0000000000..6b8bc3e5b2
--- /dev/null
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/tasks/meta_mmlu_pro/utils.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import datasets
+
+
+def doc_to_text(doc: dict) -> str:
+ return doc["input_final_prompts"][0]
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+ def _process_doc(doc: dict) -> dict:
+ out_doc = {
+ "problem": doc["input_question"],
+ "gold": doc["input_correct_responses"][0],
+ }
+ return out_doc
+
+ dataset = dataset.select_columns(
+ [
+ "input_question",
+ "input_correct_responses",
+ "input_final_prompts",
+ "is_correct",
+ "input_question_hash",
+ "input_choice_list",
+ "output_prediction_text",
+ ],
+ )
+ dataset = dataset.rename_column("is_correct", "previously_is_correct")
+ dataset = dataset.map(_process_doc)
+ return dataset
diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py
new file mode 100644
index 0000000000..a8a7e735ff
--- /dev/null
+++ b/llama_stack/providers/registry/evals.py
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import List
+
+from llama_stack.distribution.datatypes import * # noqa: F403
+
+
+def available_providers() -> List[ProviderSpec]:
+ return [
+ InlineProviderSpec(
+ api=Api.evals,
+ provider_type="meta-reference",
+ pip_packages=[
+ "matplotlib",
+ "pillow",
+ "pandas",
+ "scikit-learn",
+ "datasets",
+ "numpy",
+ "autoevals",
+ "openpyxl",
+ ],
+ module="llama_stack.providers.impls.meta_reference.evals",
+ config_class="llama_stack.providers.impls.meta_reference.evals.MetaReferenceEvalsImplConfig",
+ api_dependencies=[
+ Api.inference,
+ ],
+ ),
+ InlineProviderSpec(
+ api=Api.evals,
+ provider_type="eleuther",
+ pip_packages=[
+ "lm-eval",
+ ],
+ module="llama_stack.providers.impls.third_party.evals.eleuther",
+ config_class="llama_stack.providers.impls.third_party.evals.eleuther.EleutherEvalsImplConfig",
+ api_dependencies=[
+ Api.inference,
+ ],
+ ),
+ ]
diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py
index 9fffc0f99a..2070649043 100644
--- a/llama_stack/providers/utils/telemetry/tracing.py
+++ b/llama_stack/providers/utils/telemetry/tracing.py
@@ -152,7 +152,7 @@ def severity(levelname: str) -> LogSeverity:
elif levelname == "INFO":
return LogSeverity.INFO
elif levelname == "WARNING":
- return LogSeverity.WARNING
+ return LogSeverity.WARN
elif levelname == "ERROR":
return LogSeverity.ERROR
elif levelname == "CRITICAL":
diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml
index e12f6e8528..31fb726708 100644
--- a/tests/examples/local-run.yaml
+++ b/tests/examples/local-run.yaml
@@ -11,16 +11,26 @@ apis:
- memory_banks
- inference
- safety
+- evals
+- datasets
providers:
- inference:
+ evals:
- provider_id: meta-reference
provider_type: meta-reference
+ config: {}
+ inference:
+ - provider_id: remote::tgi
+ provider_type: remote::tgi
config:
- model: Llama3.1-8B-Instruct
- quantization: null
- torch_seed: null
- max_seq_len: 4096
- max_batch_size: 1
+ url: http://127.0.0.1:5009
+ # - provider_id: meta-reference
+ # provider_type: meta-reference
+ # config:
+ # model: Llama3.1-8B-Instruct
+ # quantization: null
+ # torch_seed: null
+ # max_seq_len: 4096
+ # max_batch_size: 1
safety:
- provider_id: meta-reference
provider_type: meta-reference