[model server] add metrics smoke tests (#83)
* Create size-labeler.yml

* Delete .github/workflows/size-labeler.yml

* add smoke metrics

* add smoke metrics

* add smoke metrics

* add smoke metrics

* add smoke metrics

* add smoke metrics

* add smoke metrics

* fix metric fixture and rename var

* fix metric fixture and rename var

* fix model name

* fix model name

* fix model name

* fix model name

* add cpu

* add bug id

* add clean metrics

* use new fixtures
rnetser authored Jan 9, 2025
1 parent 966b50d commit 18849ec
Showing 7 changed files with 206 additions and 3 deletions.
Empty file.
31 changes: 31 additions & 0 deletions tests/model_serving/model_server/metrics/conftest.py
@@ -0,0 +1,31 @@
import pytest
import requests
from kubernetes.dynamic import DynamicClient
from ocp_utilities.monitoring import Prometheus
from simple_logger.logger import get_logger

from utilities.infra import get_openshift_token


LOGGER = get_logger(name=__name__)


@pytest.fixture(scope="session")
def prometheus(admin_client: DynamicClient) -> Prometheus:
    return Prometheus(
        client=admin_client,
        resource_name="thanos-querier",
        verify_ssl=False,
        bearer_token=get_openshift_token(),
    )


@pytest.fixture(scope="class")
def deleted_metrics(prometheus: Prometheus) -> None:
    for metric in ("tgi_request_success", "tgi_request_count"):
        LOGGER.info(f"deleting {metric} metric")
        requests.get(
            f"{prometheus.api_url}/api/v1/admin/tsdb/delete_series?match[]={metric}",
            headers=prometheus.headers,
            verify=prometheus.verify_ssl,
        )
91 changes: 91 additions & 0 deletions tests/model_serving/model_server/metrics/test_model_metrics.py
@@ -0,0 +1,91 @@
import pytest

from tests.model_serving.model_server.metrics.utils import run_inference_multiple_times
from tests.model_serving.model_server.utils import verify_inference_response
from utilities.constants import (
    KServeDeploymentType,
    ModelFormat,
    ModelInferenceRuntime,
    ModelStoragePath,
    Protocols,
    RuntimeTemplates,
)
from utilities.inference_utils import Inference
from utilities.monitoring import get_metrics_value, validate_metrics_value

pytestmark = pytest.mark.usefixtures("skip_if_no_deployed_openshift_serverless", "valid_aws_config", "deleted_metrics")


@pytest.mark.serverless
@pytest.mark.jira("RHOAIENG-3236", run=False)
@pytest.mark.parametrize(
"model_namespace, s3_models_storage_uri, serving_runtime_from_template, s3_models_inference_service",
[
pytest.param(
{"name": "kserve-tgis-metrics"},
{"model-dir": ModelStoragePath.FLAN_T5_SMALL},
{
"name": f"{Protocols.HTTP}-{ModelInferenceRuntime.CAIKIT_TGIS_RUNTIME}",
"template-name": RuntimeTemplates.CAIKIT_TGIS_SERVING,
"multi-model": False,
"enable-http": True,
},
{"name": f"{Protocols.HTTP}-{ModelFormat.CAIKIT}", "deployment-mode": KServeDeploymentType.SERVERLESS},
)
],
indirect=True,
)
class TestModelMetrics:
    @pytest.mark.smoke
    @pytest.mark.polarion("ODS-2555")
    @pytest.mark.dependency(name="test_model_metrics_num_success_requests")
    def test_model_metrics_num_success_requests(self, s3_models_inference_service, prometheus):
        """Verify the number of successful model requests in the OpenShift monitoring system (UserWorkloadMonitoring) metrics"""
        verify_inference_response(
            inference_service=s3_models_inference_service,
            runtime=ModelInferenceRuntime.CAIKIT_TGIS_RUNTIME,
            inference_type=Inference.ALL_TOKENS,
            protocol=Protocols.HTTPS,
            model_name=ModelFormat.CAIKIT,
            use_default_query=True,
        )
        validate_metrics_value(
            prometheus=prometheus,
            metrics_query="tgi_request_success",
            expected_value="1",
        )

    @pytest.mark.smoke
    @pytest.mark.polarion("ODS-2555")
    @pytest.mark.dependency(
        name="test_model_metrics_num_total_requests",
        depends=["test_model_metrics_num_success_requests"],
    )
    def test_model_metrics_num_total_requests(self, s3_models_inference_service, prometheus):
        """Verify the total number of model requests in the OpenShift monitoring system (UserWorkloadMonitoring) metrics"""
        total_runs = 5

        run_inference_multiple_times(
            isvc=s3_models_inference_service,
            runtime=ModelInferenceRuntime.CAIKIT_TGIS_RUNTIME,
            inference_type=Inference.ALL_TOKENS,
            protocol=Protocols.HTTPS,
            model_name=ModelFormat.CAIKIT,
            iterations=total_runs,
            run_in_parallel=True,
        )
        validate_metrics_value(
            prometheus=prometheus,
            metrics_query="tgi_request_count",
            # The preceding success-requests test already issued one request, hence the +1.
            expected_value=str(total_runs + 1),
        )

    @pytest.mark.smoke
    @pytest.mark.polarion("ODS-2555")
    @pytest.mark.dependency(depends=["test_model_metrics_num_total_requests"])
    def test_model_metrics_cpu_utilization(self, s3_models_inference_service, prometheus):
        """Verify CPU utilization data in the OpenShift monitoring system (UserWorkloadMonitoring) metrics"""
        assert get_metrics_value(
            prometheus=prometheus,
            metrics_query=f"pod:container_cpu_usage:sum{{namespace='{s3_models_inference_service.namespace}'}}",
        )
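
As a quick reference for the query string the CPU check builds, here is a minimal sketch of the rendered PromQL selector (not part of this commit; the namespace value is only an illustration borrowed from the model_namespace parametrization above):

# Illustrative only: the f-string above renders into a namespace-scoped PromQL selector.
namespace = "kserve-tgis-metrics"  # assumed example namespace
metrics_query = f"pod:container_cpu_usage:sum{{namespace='{namespace}'}}"
assert metrics_query == "pod:container_cpu_usage:sum{namespace='kserve-tgis-metrics'}"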
43 changes: 43 additions & 0 deletions tests/model_serving/model_server/metrics/utils.py
@@ -0,0 +1,43 @@
from concurrent.futures import ThreadPoolExecutor, as_completed

from ocp_resources.inference_service import InferenceService
from simple_logger.logger import get_logger

from tests.model_serving.model_server.utils import verify_inference_response


LOGGER = get_logger(name=__name__)


def run_inference_multiple_times(
    isvc: InferenceService,
    runtime: str,
    inference_type: str,
    protocol: str,
    model_name: str,
    iterations: int,
    run_in_parallel: bool = False,
) -> None:
    futures = []

    with ThreadPoolExecutor() as executor:
        for iteration in range(iterations):
            infer_kwargs = {
                "inference_service": isvc,
                "runtime": runtime,
                "inference_type": inference_type,
                "protocol": protocol,
                "model_name": model_name,
                "use_default_query": True,
            }

            if run_in_parallel:
                futures.append(executor.submit(verify_inference_response, **infer_kwargs))
            else:
                verify_inference_response(**infer_kwargs)

    if futures:
        for result in as_completed(futures):
            _exception = result.exception()
            if _exception:
                LOGGER.error(f"Failed to run inference. Error: {_exception}")
5 changes: 2 additions & 3 deletions tests/trustyai/conftest.py
@@ -1,5 +1,3 @@
-import subprocess
-
 import pytest
 import yaml
 from kubernetes.dynamic import DynamicClient
@@ -15,6 +13,7 @@
 from tests.trustyai.constants import TRUSTYAI_SERVICE
 from utilities.constants import MODELMESH_SERVING
 from tests.trustyai.utils import update_configmap_data
+from utilities.infra import get_openshift_token

 MINIO: str = "minio"
 OPENDATAHUB_IO: str = "opendatahub.io"
@@ -45,7 +44,7 @@ def trustyai_service_with_pvc_storage(

 @pytest.fixture(scope="class")
 def openshift_token(ns_with_modelmesh_enabled):
-    return subprocess.check_output(["oc", "whoami", "-t", ns_with_modelmesh_enabled.name]).decode().strip()
+    return get_openshift_token()


 @pytest.fixture(scope="class")
4 changes: 4 additions & 0 deletions utilities/infra.py
@@ -293,3 +293,7 @@ def get_pods_by_isvc_label(client: DynamicClient, isvc: InferenceService) -> Lis
        return pods

    raise ResourceNotFoundError(f"{isvc.name} has no pods")


def get_openshift_token() -> str:
    return run_command(command=shlex.split("oc whoami -t"))[1].strip()
35 changes: 35 additions & 0 deletions utilities/monitoring.py
@@ -0,0 +1,35 @@
from typing import Any

from ocp_utilities.monitoring import Prometheus
from simple_logger.logger import get_logger
from timeout_sampler import TimeoutExpiredError, TimeoutSampler

LOGGER = get_logger(name=__name__)


def validate_metrics_value(
    prometheus: Prometheus, metrics_query: str, expected_value: Any, timeout: int = 60 * 4
) -> None:
    sample = None
    try:
        for sample in TimeoutSampler(
            wait_timeout=timeout,
            sleep=15,
            func=get_metrics_value,
            prometheus=prometheus,
            metrics_query=metrics_query,
        ):
            if sample:
                LOGGER.info(f"metric: {metrics_query} value is: {sample}, the expected value is {expected_value}")
                if sample == expected_value:
                    LOGGER.info("Metrics value matches the expected value!")
                    return
    except TimeoutExpiredError:
        LOGGER.info(f"Metrics value: {sample}, expected: {expected_value}")
        raise


def get_metrics_value(prometheus: Prometheus, metrics_query: str) -> Any:
    metric_results = prometheus.query_sampler(query=metrics_query)
    if metric_values_list := [value for metric_val in metric_results for value in metric_val.get("value")]:
        # A Prometheus instant-query result is a [timestamp, value] pair; index 1 is the value.
        return metric_values_list[1]
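
For reference, a minimal sketch of how the two helpers above combine in a test (not part of this commit; it assumes the session-scoped prometheus fixture from tests/model_serving/model_server/metrics/conftest.py, and the metric name and expected value are illustrative):

# Illustrative only: wait for a metric to reach an expected value, then read it back directly.
from ocp_utilities.monitoring import Prometheus

from utilities.monitoring import get_metrics_value, validate_metrics_value


def test_request_count_metric(prometheus: Prometheus) -> None:
    # Polls every 15s for up to 4 minutes; raises TimeoutExpiredError if the value never matches.
    validate_metrics_value(
        prometheus=prometheus,
        metrics_query="tgi_request_count",
        expected_value="6",
    )

    # Returns the metric's current value (index 1 of the [timestamp, value] pair),
    # or None when the query has no results.
    assert get_metrics_value(prometheus=prometheus, metrics_query="tgi_request_count")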
