Feat/cog serv endpoint #241

Merged: 25 commits, May 29, 2024

Commits
e02dccf
Updated to work with managed identities.
dorbaker May 13, 2024
4bc205a
Updated to work with both api keys and managed identity.
dorbaker May 14, 2024
0373572
Updated to pass storage_account_name config parameter.
dorbaker May 14, 2024
4bad8ea
Updated to take optional cognitive services endpoint config parameter
dorbaker May 14, 2024
312718d
make api_key optional
jgbradley1 May 15, 2024
04e8526
fix formatting
jgbradley1 May 15, 2024
4556f6d
Bug fixes.
dorbaker May 15, 2024
ae09336
set storage_account_name to None by default
jgbradley1 May 15, 2024
de5aae1
Merge branch 'feat/aad' of https://github.com/microsoft/graphrag into…
jgbradley1 May 15, 2024
0886bd2
ruff formatting fixes
jgbradley1 May 15, 2024
98995ad
fix issue with using connection strings
jgbradley1 May 15, 2024
98169b0
Switched from storage account name to storage account blob url.
dorbaker May 16, 2024
d42eb57
Updates for configuration of cognitive services endpoint.
dorbaker May 21, 2024
97571df
Add missing env vars for smoke tests (#233)
AlonsoGuevara May 16, 2024
637e6ca
Bump JamesIves/github-pages-deploy-action from 4.6.0 to 4.6.1 (#234)
dependabot[bot] May 20, 2024
eec7fdb
feature/add-azure-managed-identity (#231)
dorbaker May 20, 2024
4b4ea89
Bump textual from 0.58.1 to 0.62.0 (#238)
dependabot[bot] May 20, 2024
f61474a
Bump requests from 2.31.0 to 2.32.0 (#240)
dependabot[bot] May 21, 2024
87a5157
Upgrade Pyright (#239)
darthtrevino May 21, 2024
d328e38
Updates for cog services endpoint.
dorbaker May 21, 2024
c055858
Updated documentation.
dorbaker May 21, 2024
f0b58b4
Merge branch 'feat/cog-serv-endpoint' of github.com:microsoft/graphra…
dorbaker May 21, 2024
e29a9c3
Update load_llm.py
dorbaker May 22, 2024
e8734d5
Merge branch 'main' into feat/cog-serv-endpoint
dorbaker May 29, 2024
24b13ee
Updated with release and formatting changes.
dorbaker May 29, 2024
.semversioner/next-release/minor-20240529191314161734.json (4 additions, 0 deletions)

```diff
@@ -0,0 +1,4 @@
+{
+    "type": "minor",
+    "description": "Add config parameter for cognitive services endpoint."
+}
```
docsite/posts/config/json_yaml.md (1 addition, 0 deletions)

```diff
@@ -57,6 +57,7 @@ This is the base LLM configuration section. Other steps may override this config
 * `api_version` **str** - The API version
 * `organization` **str** - The client organization.
 * `proxy` **str** - The proxy URL to use.
+* `cognitive_services_endpoint` **str** - The url endpoint for cognitive services.
 * `deployment_name` **str** - The deployment name to use (Azure).
 * `model_supports_json` **bool** - Whether the model supports JSON-mode output.
 * `tokens_per_minute` **int** - Set a leaky-bucket throttle on tokens-per-minute.
```
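For context, a minimal sketch of how the new setting might look in a YAML config file, following the `llm` block this page documents. The surrounding keys and every value shown are illustrative assumptions, not taken from this PR:

```yaml
llm:
  type: azure_openai_chat
  api_base: https://my-instance.openai.azure.com  # hypothetical Azure resource
  api_version: "2024-02-15-preview"               # hypothetical API version
  deployment_name: my-gpt4-deployment             # hypothetical deployment
  # New in this PR: tells key-less (managed identity) auth where to obtain
  # tokens; the conventional Azure AD scope is shown as an example value.
  cognitive_services_endpoint: https://cognitiveservices.azure.com/.default
```

Note that no `api_key` is set: per the commit history above, the point of the parameter is to support managed identities alongside API keys.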
graphrag/config/create_graphrag_config.py (15 additions, 0 deletions)

```diff
@@ -82,6 +82,10 @@ def hydrate_llm_params(
         llm_type = LLMType(llm_type) if llm_type else base.type
         api_key = reader.str(Fragment.api_key) or base.api_key
         api_base = reader.str(Fragment.api_base) or base.api_base
+        cognitive_services_endpoint = (
+            reader.str(Fragment.cognitive_services_endpoint)
+            or base.cognitive_services_endpoint
+        )
         deployment_name = (
             reader.str(Fragment.deployment_name) or base.deployment_name
         )
@@ -111,6 +115,7 @@ def hydrate_llm_params(
             or base.model_supports_json,
             request_timeout=reader.float(Fragment.request_timeout)
             or base.request_timeout,
+            cognitive_services_endpoint=cognitive_services_endpoint,
             deployment_name=deployment_name,
             tokens_per_minute=reader.int("tokens_per_minute", Fragment.tpm)
             or base.tokens_per_minute,
@@ -135,6 +140,10 @@ def hydrate_embeddings_params(
         api_proxy = reader.str("proxy") or base.proxy
         api_type = reader.str(Fragment.type) or defs.EMBEDDING_TYPE
         api_type = LLMType(api_type) if api_type else defs.LLM_TYPE
+        cognitive_services_endpoint = (
+            reader.str(Fragment.cognitive_services_endpoint)
+            or base.cognitive_services_endpoint
+        )
         deployment_name = reader.str(Fragment.deployment_name)

         if api_key is None and not _is_azure(api_type):
@@ -159,6 +168,7 @@ def hydrate_embeddings_params(
             model=reader.str(Fragment.model) or defs.EMBEDDING_MODEL,
             request_timeout=reader.float(Fragment.request_timeout)
             or defs.LLM_REQUEST_TIMEOUT,
+            cognitive_services_endpoint=cognitive_services_endpoint,
             deployment_name=deployment_name,
             tokens_per_minute=reader.int("tokens_per_minute", Fragment.tpm)
             or defs.LLM_TOKENS_PER_MINUTE,
@@ -209,6 +219,9 @@ def hydrate_parallelization_params(
         api_base = reader.str(Fragment.api_base) or fallback_oai_base
         api_version = reader.str(Fragment.api_version) or fallback_oai_version
         api_proxy = reader.str(Fragment.api_proxy) or fallback_oai_proxy
+        cognitive_services_endpoint = reader.str(
+            Fragment.cognitive_services_endpoint
+        )
         deployment_name = reader.str(Fragment.deployment_name)

         if api_key is None and not _is_azure(llm_type):
@@ -235,6 +248,7 @@ def hydrate_parallelization_params(
             model_supports_json=reader.bool(Fragment.model_supports_json),
             request_timeout=reader.float(Fragment.request_timeout)
             or defs.LLM_REQUEST_TIMEOUT,
+            cognitive_services_endpoint=cognitive_services_endpoint,
             deployment_name=deployment_name,
             tokens_per_minute=reader.int(Fragment.tpm)
             or defs.LLM_TOKENS_PER_MINUTE,
@@ -521,6 +535,7 @@ class Fragment(str, Enum):
     api_organization = "API_ORGANIZATION"
     api_proxy = "API_PROXY"
     async_mode = "ASYNC_MODE"
+    cognitive_services_endpoint = "COGNITIVE_SERVICES_ENDPOINT"
     concurrent_requests = "CONCURRENT_REQUESTS"
     conn_string = "CONNECTION_STRING"
     container_name = "CONTAINER_NAME"
```
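All three hydration paths resolve the new value with the same precedence: an inline config value first, then the matching environment variable via the reader, then the hydrated base or default. A self-contained sketch of that resolution order; the `GRAPHRAG_LLM_COGNITIVE_SERVICES_ENDPOINT` variable name is an assumption inferred from the Fragment value, not confirmed by this diff:

```python
import os


def resolve_endpoint(configured: str | None, base: str | None) -> str | None:
    """Sketch of `reader.str(Fragment.cognitive_services_endpoint) or base...`.

    Precedence: inline config, then environment variable, then base value.
    Empty strings are falsy, so they fall through like missing values.
    """
    env_value = os.environ.get("GRAPHRAG_LLM_COGNITIVE_SERVICES_ENDPOINT")
    return configured or env_value or base


# With nothing set inline, the environment variable wins over the base value.
os.environ["GRAPHRAG_LLM_COGNITIVE_SERVICES_ENDPOINT"] = (
    "https://cognitiveservices.azure.com/.default"
)
print(resolve_endpoint(None, base=None))  # -> the env var's value
```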
graphrag/config/input_models/llm_parameters_input.py (1 addition, 0 deletions)

```diff
@@ -20,6 +20,7 @@ class LLMParametersInput(TypedDict):
     api_version: NotRequired[str | None]
     organization: NotRequired[str | None]
     proxy: NotRequired[str | None]
+    cognitive_services_endpoint: NotRequired[str | None]
    deployment_name: NotRequired[str | None]
     model_supports_json: NotRequired[bool | str | None]
     tokens_per_minute: NotRequired[int | str | None]
```
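Because the field is `NotRequired`, existing input dicts keep type-checking unchanged; callers opt in by adding the key. A small illustration (all values hypothetical):

```python
from graphrag.config.input_models.llm_parameters_input import LLMParametersInput

# Existing API-key style input remains valid; the new key may simply be absent.
keyed: LLMParametersInput = {"api_key": "<your-api-key>"}

# Managed-identity style input: no API key, an endpoint/scope supplied instead.
identity: LLMParametersInput = {
    "api_key": None,
    "cognitive_services_endpoint": "https://cognitiveservices.azure.com/.default",
}
```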
graphrag/config/models/llm_parameters.py (3 additions, 0 deletions)

```diff
@@ -40,6 +40,9 @@ class LLMParameters(BaseModel):
     proxy: str | None = Field(
         description="The proxy to use for the LLM service.", default=None
     )
+    cognitive_services_endpoint: str | None = Field(
+        description="The endpoint to reach cognitive services.", default=None
+    )
     deployment_name: str | None = Field(
         description="The deployment name to use for the LLM service.", default=None
     )
```
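On the Pydantic side the field defaults to `None`, so configurations that never mention it are unaffected. A quick sketch of that behavior, assuming the model's other fields are likewise optional:

```python
from graphrag.config.models.llm_parameters import LLMParameters

params = LLMParameters()  # nothing configured
assert params.cognitive_services_endpoint is None  # default applies

params = LLMParameters(
    cognitive_services_endpoint="https://cognitiveservices.azure.com/.default"
)
assert params.cognitive_services_endpoint is not None
```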
graphrag/index/llm/load_llm.py (1 addition, 0 deletions)

```diff
@@ -198,6 +198,7 @@ def _get_base_config(config: dict[str, Any]) -> dict[str, Any]:
         "model_supports_json": config.get("model_supports_json"),
         "concurrent_requests": config.get("concurrent_requests", 4),
         "encoding_model": config.get("encoding_model", "cl100k_base"),
+        "cognitive_services_endpoint": config.get("cognitive_services_endpoint"),
     }
```
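Downstream, the value forwarded by `_get_base_config` is what enables key-less authentication. The client wiring itself is outside this diff, but the typical Azure pattern looks like the sketch below; the scope string, resource URL, and API version are assumptions for illustration, not values from this PR:

```python
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI

# When no api_key is configured, a token provider scoped to the cognitive
# services endpoint can authenticate the client via managed identity.
endpoint_scope = "https://cognitiveservices.azure.com/.default"  # assumed value

client = AzureOpenAI(
    azure_endpoint="https://my-resource.openai.azure.com",  # hypothetical
    api_version="2024-02-15-preview",  # hypothetical
    azure_ad_token_provider=get_bearer_token_provider(
        DefaultAzureCredential(), endpoint_scope
    ),
)
```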