diff --git a/README.md b/README.md
index a3352a5a..093c58b9 100644
--- a/README.md
+++ b/README.md
@@ -212,6 +212,13 @@ A variety of metrics are available for the EC2 instances that are launched during
 In addition to the standard metrics, we also provide a custom metric for `GPUUtilization`. This can be found in the `CloudWatch` section under `All metrics` -> `Custom namespaces` -> `EC2`. Please note that the `GPUUtilization` metric is also updated every five minutes.
 
+We provide an option to save aggregated (average) custom hardware metrics (`GPUUtilization` and `CPUUtilization`, logged at 5-second intervals) to the benchmark directory under the provided S3 bucket. To enable this, simply pass the option when running the benchmark:
+
+```
+agbench run --save-hardware-metrics
+```
+
+Note that this command currently waits for all jobs to complete successfully before pulling the hardware metrics.
 
 ## Evaluating benchmark runs
diff --git a/src/autogluon/bench/eval/hardware_metrics/hardware_metrics.py b/src/autogluon/bench/eval/hardware_metrics/hardware_metrics.py
new file mode 100644
index 00000000..6b0756eb
--- /dev/null
+++ b/src/autogluon/bench/eval/hardware_metrics/hardware_metrics.py
@@ -0,0 +1,301 @@
+import csv
+import logging
+import os
+import tempfile
+from datetime import datetime, timedelta
+from typing import List, Optional
+
+import boto3
+import pandas as pd
+import typer
+import yaml
+
+from autogluon.bench.utils.general_utils import upload_to_s3
+
+aws_account_id = None
+aws_account_region = None
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def find_s3_file(s3_bucket: str, prefix: str, file: str):
+    s3 = boto3.client("s3")
+    paginator = s3.get_paginator("list_objects_v2")
+    page_iterator = paginator.paginate(Bucket=s3_bucket, Prefix=prefix)
+
+    for page in page_iterator:
+        if "Contents" in page:
+            for obj in page["Contents"]:
+                if obj["Key"].endswith(file):
+                    return f"s3://{s3_bucket}/{obj['Key']}"
+    return None
+
+
+def get_job_ids(config_file: dict):
+    """
+    This function returns a list of the job IDs of all jobs run for a benchmark run.
+    Parameters
+    ----------
+    config_file: dict,
+        Loaded config dict containing job IDs under `job_configs`
+    """
+    job_ids = list(config_file.get("job_configs", {}).keys())
+    return job_ids
+
+
+def get_instance_id(job_id):
+    """
+    This function returns the instance ID of the EC2 instance that was used to run the job with the given job ID.
+    Parameters
+    ----------
+    job_id: str
+    """
+    batch_client = boto3.client("batch", region_name=aws_account_region)
+    ecs_client = boto3.client("ecs", region_name=aws_account_region)
+
+    response = batch_client.describe_jobs(jobs=[f"{job_id}"])
+    if response:
+        container_arn = response["jobs"][0]["container"]["containerInstanceArn"]
+        cluster_arn = response["jobs"][0]["container"]["taskArn"].split("/")
+        cluster = f"arn:aws:ecs:{aws_account_region}:{aws_account_id}:cluster/" + cluster_arn[1]
+
+        response = ecs_client.describe_container_instances(cluster=cluster, containerInstances=[container_arn])
+        instance_id = response["containerInstances"][0]["ec2InstanceId"]
+        return instance_id
+
+
+def get_instance_util(
+    namespace: str,
+    instance_id: str,
+    metric: str,
+    start_time: datetime,
+    end_time: datetime,
+    cloudwatch_client: boto3.client,
+    period: int = 360,
+    statistics: Optional[List[str]] = ["Average"],
+) -> dict:
+    """
+    This function returns CloudWatch utilization statistics for the given metric on the given EC2 instance over the specified time window.
+    Refer to https://docs.aws.amazon.com/cli/latest/reference/cloudwatch/get-metric-statistics.html for docs on how to interact with the CloudWatch API
+    Also refer to https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/cloudwatch/client/get_metric_statistics.html for documentation on how to interact with the API through Python
+    Parameters
+    ----------
+    instance_id: str,
+        EC2 instance ID
+    metric: str,
+        Name of metric to pass into the CloudWatch API. Example: CPUUtilization
+        Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/viewing_metrics_with_cloudwatch.html#ec2-cloudwatch-metrics
+    start_time: datetime,
+    end_time: datetime,
+    statistics: Optional[List[str]] = ["Average"],
+        The metric statistics, other than percentile. For percentile statistics, use `ExtendedStatistics`. When calling `get_metric_statistics`, you must specify either `Statistics` or `ExtendedStatistics`, but not both.
+        Examples: Average, Maximum, Minimum
+    """
+    return cloudwatch_client.get_metric_statistics(
+        Namespace=namespace,
+        MetricName=metric,
+        Dimensions=[
+            {"Name": "InstanceId", "Value": instance_id},
+        ],
+        Statistics=statistics,
+        StartTime=start_time,
+        EndTime=end_time,
+        Period=period,
+    )
+
+
+def format_metrics(
+    instance_metrics: dict,
+    framework: str,
+    dataset: str,
+    fold: int,
+    mode: str,
+    statistics: Optional[List[str]] = ["Average"],
+):
+    """
+    This function returns a formatted version of the dictionary of metrics provided by the CloudWatch API so it can be easily added to a CSV file and passed into `autogluon-dashboard`.
+    Parameters
+    ----------
+    instance_metrics: dict,
+        Dictionary of instance metrics for a given EC2 instance provided by CloudWatch
+    framework: str,
+        Name of the framework
+    dataset: str,
+        Name of the dataset
+    fold: int,
+        Fold #
+    mode: str,
+        Mode -> Training or Prediction
+    statistics: Optional[List[str]] = ["Average"],
+        The metric statistics, other than percentile. For percentile statistics, use `ExtendedStatistics`. When calling `get_metric_statistics`, you must specify either `Statistics` or `ExtendedStatistics`, but not both.
+        Examples: Average, Maximum, Minimum
+    """
+    output_dict = {}
+    output_dict["framework"] = framework
+    output_dict["dataset"] = dataset
+    output_dict["mode"] = mode
+    output_dict["fold"] = fold
+    output_dict["metric"] = instance_metrics["Label"]
+    for i in range(len(instance_metrics["Datapoints"])):
+        for stat in statistics:
+            output_dict["framework"] = framework
+            output_dict["dataset"] = dataset
+            output_dict["mode"] = mode
+            output_dict["fold"] = fold
+            output_dict["metric"] = instance_metrics["Label"]
+            output_dict["statistic_type"] = stat
+            output_dict["statistic_value"] = instance_metrics["Datapoints"][i][f"{stat}"]
+            output_dict["unit"] = instance_metrics["Datapoints"][i]["Unit"]
+    return output_dict
+
+
+def get_metrics(
+    job_id: str,
+    s3_bucket: str,
+    module: str,
+    benchmark_name: str,
+    sub_folder: str,
+    cloudwatch_client: boto3.client,
+    namespace: str = "EC2",  # CloudWatch "Custom" namespace, i.e. Custom/EC2
+):
+    """
+    Parameters
+    ----------
+    job_id: str,
+    cloudwatch_client: boto3.client,
+        Client used to query all CloudWatch metrics reported for the job's EC2 instance, e.g. CPUUtilization.
+        Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/viewing_metrics_with_cloudwatch.html#ec2-cloudwatch-metrics
+    s3_bucket: str,
+    module: str,
+    benchmark_name: str,
+    sub_folder: str,
+        Sub folder for results.csv file.
+        Passed in from the `get_hardware_metrics` function
+    namespace: str,
+        CloudWatch Metrics Namespace, default: EC2 (the custom namespace, i.e. Custom/EC2)
+    """
+    path_prefix = f"{module}/{benchmark_name}/{sub_folder}/"
+    s3_path_to_csv = find_s3_file(s3_bucket=s3_bucket, prefix=path_prefix, file="results.csv")
+    results = pd.read_csv(s3_path_to_csv)
+    metrics_list = []
+    instance_id = get_instance_id(job_id)
+    metrics_data = cloudwatch_client.list_metrics(
+        Dimensions=[
+            {"Name": "InstanceId", "Value": instance_id},
+        ],
+        Namespace=namespace,
+    )["Metrics"]
+    metrics_pool = [item["MetricName"] for item in metrics_data]
+
+    for metric in metrics_pool:
+        for i in results.index:
+            framework, dataset, utc, train_time, predict_time, fold = (
+                results["framework"][i],
+                results["task"][i],
+                results["utc"][i],
+                results["training_duration"][i],
+                results["predict_duration"][i],
+                results["fold"][i],
+            )
+            utc_dt = datetime.strptime(utc, "%Y-%m-%dT%H:%M:%S")
+            period = int((timedelta(seconds=train_time) + timedelta(seconds=predict_time)).total_seconds())
+            if period < 60:
+                period = 60
+            elif period % 60 != 0:
+                period = (period // 60) * 60  # Round down to the nearest multiple of 60
+
+            training_util = get_instance_util(
+                namespace=namespace,
+                instance_id=instance_id,
+                metric=metric,
+                start_time=utc_dt,
+                end_time=utc_dt + timedelta(seconds=train_time) + timedelta(seconds=predict_time),
+                period=period,
+                cloudwatch_client=cloudwatch_client,
+            )
+            predict_util = get_instance_util(
+                namespace=namespace,
+                instance_id=instance_id,
+                metric=metric,
+                start_time=utc_dt - timedelta(seconds=predict_time),
+                end_time=utc_dt,
+                period=period,
+                cloudwatch_client=cloudwatch_client,
+            )
+            if training_util["Datapoints"]:
+                metrics_list.append(format_metrics(training_util, framework, dataset, fold, "Training"))
+            if predict_util["Datapoints"]:
+                metrics_list.append(format_metrics(predict_util, framework, dataset, fold, "Prediction"))
+    return metrics_list
+
+
+def save_results(metrics_list: list, path: str):
+    """
+    Writes the formatted metrics to a CSV file to pass into `autogluon-dashboard`.
+    Parameters
+    ----------
+    metrics_list: list,
+        List of hardware metrics to write to CSV
+    path: str,
+        Path to save file
+    """
+    csv_headers = ["framework", "dataset", "mode", "fold", "metric", "statistic_type", "statistic_value", "unit"]
+    csv_location = os.path.join(path, "hardware_metrics.csv")
+    with open(csv_location, "w", newline="") as csvFile:
+        writer = csv.DictWriter(csvFile, fieldnames=csv_headers)
+        writer.writeheader()
+        writer.writerows(metrics_list)
+    return csv_location
+
+
+def get_hardware_metrics(
+    config_file: str = typer.Argument(help="Path to YAML config file containing job ids."),
+    s3_bucket: str = typer.Argument(help="Name of the S3 bucket to which the benchmark results were outputted."),
+    module: str = typer.Argument(help="Can be one of ['tabular', 'timeseries', 'multimodal']."),
+    benchmark_name: str = typer.Argument(
+        help="Folder name of benchmark run in which all objects with path 'scores/results.csv' get aggregated."
+    ),
+):
+    """
+    External API function (also exposed as an `agbench` CLI command) that queries CloudWatch hardware metrics for all jobs of a benchmark run and uploads them to S3.
+    Parameters
+    ----------
+    config_file: str,
+        Path to config file containing job IDs
+    s3_bucket: str,
+        Name of the S3 bucket to which the benchmark results were outputted.
+ module: str, + Benchmark module: tabular or multimodal + benchmark_name: str, + Name of the benchmark + Example: ag_bench_20230817T123456 + """ + if not config_file: + raise ValueError("Invalid Config File") + logger.info(f"Getting hardware metrics for jobs under config file: {config_file}") + with open(config_file, "r") as f: + config = yaml.safe_load(f) + job_ids = get_job_ids(config) + + global aws_account_id, aws_account_region + aws_account_id = config.get("CDK_DEPLOY_ACCOUNT") + aws_account_region = config.get("CDK_DEPLOY_REGION") + + cloudwatch_client = boto3.client("cloudwatch", region_name=aws_account_region) + + metrics_list = [] + for job_id in job_ids: + sub_folder = config["job_configs"][f"{job_id}"].split("/")[-1].split(".")[0].replace("_split", "") + metrics_list += get_metrics( + job_id=job_id, + s3_bucket=s3_bucket, + module=module, + benchmark_name=benchmark_name, + sub_folder=sub_folder, + cloudwatch_client=cloudwatch_client, + ) + if metrics_list: + with tempfile.TemporaryDirectory() as temp_dir: + local_path = save_results(metrics_list, temp_dir) + upload_to_s3(s3_bucket=s3_bucket, s3_dir=f"{module}/{benchmark_name}", local_path=local_path) diff --git a/src/autogluon/bench/frameworks/multimodal/exec.py b/src/autogluon/bench/frameworks/multimodal/exec.py index 125b0889..c378e3f7 100644 --- a/src/autogluon/bench/frameworks/multimodal/exec.py +++ b/src/autogluon/bench/frameworks/multimodal/exec.py @@ -212,7 +212,7 @@ def run( "type": predictor.problem_type, "result": scores[test_data.metric], "metric": test_data.metric, - "utc_time": utc_time, + "utc": utc_time, "training_duration": training_duration, "predict_duration": predict_duration, "scores": scores, diff --git a/src/autogluon/bench/main.py b/src/autogluon/bench/main.py index 8c5c6006..fbf678b4 100644 --- a/src/autogluon/bench/main.py +++ b/src/autogluon/bench/main.py @@ -1,6 +1,7 @@ import typer from autogluon.bench.cloud.aws.stack_handler import destroy_stack +from autogluon.bench.eval.hardware_metrics.hardware_metrics import get_hardware_metrics from autogluon.bench.eval.scripts.aggregate_amlb_results import aggregate_amlb_results from autogluon.bench.eval.scripts.run_evaluation_openml import evaluate_amlb_results from autogluon.bench.eval.scripts.run_generate_clean_openml import clean_amlb_results @@ -16,6 +17,7 @@ app.command()(aggregate_amlb_results) app.command()(clean_amlb_results) app.command()(evaluate_amlb_results) +app.command()(get_hardware_metrics) if __name__ == "__main__": app() diff --git a/src/autogluon/bench/runbenchmark.py b/src/autogluon/bench/runbenchmark.py index e495ec5b..9c52671b 100644 --- a/src/autogluon/bench/runbenchmark.py +++ b/src/autogluon/bench/runbenchmark.py @@ -14,6 +14,7 @@ from autogluon.bench import __version__ as agbench_version from autogluon.bench.cloud.aws.stack_handler import deploy_stack, destroy_stack +from autogluon.bench.eval.hardware_metrics.hardware_metrics import get_hardware_metrics from autogluon.bench.frameworks.multimodal.multimodal_benchmark import MultiModalBenchmark from autogluon.bench.frameworks.tabular.tabular_benchmark import TabularBenchmark from autogluon.bench.frameworks.timeseries.timeseries_benchmark import TimeSeriesBenchmark @@ -344,6 +345,7 @@ def run( skip_setup: bool = typer.Option( False, help="Whether to skip setting up framework in local mode, default to False." 
), + save_hardware_metrics: bool = typer.Option(False, help="Whether to query and save the hardware metrics."), ): """Main function that runs the benchmark based on the provided configuration options.""" configs = {} @@ -443,7 +445,7 @@ def run( if remove_resources: wait = True - if wait: + if wait or save_hardware_metrics: logger.info( "Waiting for jobs to complete. You can quit at anytime and the benchmark will continue to run on the cloud" ) @@ -464,6 +466,13 @@ def run( logger.error("Resources are not being removed due to errors.") else: logger.info("All job succeeded.") + if save_hardware_metrics: + get_hardware_metrics( + config_file=aws_config_path, + s3_bucket=infra_configs["METRICS_BUCKET"], + module=module, + benchmark_name=benchmark_name, + ) if remove_resources: logger.info("Removing resoureces...") destroy_stack( diff --git a/tests/unittests/benchmark/test_runbenchmarks.py b/tests/unittests/benchmark/test_runbenchmarks.py index 01f678cc..c8a71dae 100644 --- a/tests/unittests/benchmark/test_runbenchmarks.py +++ b/tests/unittests/benchmark/test_runbenchmarks.py @@ -67,6 +67,7 @@ def mock_yaml_side_effect(file_obj): mocker.patch("autogluon.bench.runbenchmark._get_benchmark_name", return_value="test_benchmark") mocker.patch("autogluon.bench.runbenchmark.formatted_time", return_value="test_time") mocker.patch("autogluon.bench.runbenchmark._dump_configs", return_value="test_dump") + mocker.patch("autogluon.bench.runbenchmark._dump_configs", return_value="test_dump") mocker.patch("os.environ.__setitem__") mock_deploy_stack = mocker.patch("autogluon.bench.runbenchmark.deploy_stack", return_value=infra_configs) @@ -74,6 +75,7 @@ def mock_yaml_side_effect(file_obj): mock_invoke_lambda = mocker.patch("autogluon.bench.runbenchmark.invoke_lambda", return_value={}) mock_wait_for_jobs = mocker.patch("autogluon.bench.runbenchmark.wait_for_jobs", return_value=[]) + mock_get_hardware_metrics = mocker.patch("autogluon.bench.runbenchmark.get_hardware_metrics") mock_destroy_stack = mocker.patch("autogluon.bench.runbenchmark.destroy_stack") mock_mount = mocker.patch("autogluon.bench.runbenchmark._mount_dir") mock_umount = mocker.patch("autogluon.bench.runbenchmark._umount_if_needed") @@ -86,6 +88,7 @@ def mock_yaml_side_effect(file_obj): "mock_invoke_lambda": mock_invoke_lambda, "mock_wait_for_jobs": mock_wait_for_jobs, "mock_destroy_stack": mock_destroy_stack, + "mock_get_hardware_metrics": mock_get_hardware_metrics, "mock_mount": mock_mount, "mock_umount": mock_umount, "cdk_context": cdk_context, @@ -177,7 +180,14 @@ def test_invoke_lambda(mocker): def test_run_aws_mode(mocker, tmp_path): setup = setup_mock(mocker, tmp_path) - run(setup["config_file"], remove_resources=False, wait=False, dev_branch=None, skip_setup=True) + run( + setup["config_file"], + remove_resources=False, + wait=False, + dev_branch=None, + skip_setup=True, + save_hardware_metrics=False, + ) setup["mock_deploy_stack"].assert_called_once_with(custom_configs=setup["custom_configs"]) setup["mock_upload_to_s3"].assert_called_once_with( @@ -185,13 +195,21 @@ def test_run_aws_mode(mocker, tmp_path): ) setup["mock_invoke_lambda"].assert_called_once_with(configs=setup["infra_configs"], config_file="test_s3_path") setup["mock_wait_for_jobs"].assert_not_called() + setup["mock_get_hardware_metrics"].assert_not_called(), setup["mock_destroy_stack"].assert_not_called() def test_run_aws_mode_remove_resources(mocker, tmp_path): setup = setup_mock(mocker, tmp_path) - run(setup["config_file"], remove_resources=True, wait=False, 
dev_branch=None, skip_setup=True) + run( + setup["config_file"], + remove_resources=True, + wait=False, + dev_branch=None, + skip_setup=True, + save_hardware_metrics=False, + ) setup["mock_deploy_stack"].assert_called_once_with(custom_configs=setup["custom_configs"]) setup["mock_upload_to_s3"].assert_called_once_with( @@ -212,7 +230,14 @@ def test_run_aws_mode_remove_resources(mocker, tmp_path): def test_run_aws_mode_wait(mocker, tmp_path): setup = setup_mock(mocker, tmp_path) - run(setup["config_file"], remove_resources=False, wait=True, dev_branch=None, skip_setup=True) + run( + setup["config_file"], + remove_resources=False, + wait=True, + dev_branch=None, + skip_setup=True, + save_hardware_metrics=False, + ) setup["mock_deploy_stack"].assert_called_once_with(custom_configs=setup["custom_configs"]) setup["mock_upload_to_s3"].assert_called_once_with( @@ -221,13 +246,21 @@ def test_run_aws_mode_wait(mocker, tmp_path): setup["mock_invoke_lambda"].assert_called_once_with(configs=setup["infra_configs"], config_file="test_s3_path") setup["mock_wait_for_jobs"].assert_called_once_with(config_file="test_dump") + setup["mock_get_hardware_metrics"].assert_not_called() def test_run_aws_mode_dev_branch(mocker, tmp_path): setup = setup_mock(mocker, tmp_path) dev_branch = "dev_branch_url" - run(setup["config_file"], remove_resources=False, wait=False, dev_branch=dev_branch, skip_setup=True) + run( + setup["config_file"], + remove_resources=False, + wait=False, + dev_branch=dev_branch, + skip_setup=True, + save_hardware_metrics=False, + ) assert os.environ["AG_BENCH_DEV_URL"] == dev_branch setup["mock_deploy_stack"].assert_called_once_with(custom_configs=setup["custom_configs"]) @@ -248,6 +281,7 @@ def test_run_aws_tabular_user_dir(mocker, tmp_path): wait=False, dev_branch="https://git_url#git_branch", skip_setup=True, + save_hardware_metrics=False, ) assert os.environ["AG_BENCH_DEV_URL"] == "https://git_url#git_branch" assert os.environ["FRAMEWORK_PATH"] == "frameworks/tabular" @@ -273,6 +307,7 @@ def test_run_aws_multimodal_custom_dataloader(mocker, tmp_path): wait=False, dev_branch="https://git_url#git_branch", skip_setup=True, + save_hardware_metrics=False, ) assert setup["custom_configs"]["custom_dataloader"]["dataloader_file"] == "dataloaders/dataset.py" assert setup["custom_configs"]["custom_dataloader"]["dataset_config_file"] == "dataloaders/datasets.yaml" @@ -296,7 +331,14 @@ def test_run_local_mode(mocker, tmp_path): mocker.patch("autogluon.bench.runbenchmark.formatted_time", return_value="test_time") mock_run_benchmark = mocker.patch("autogluon.bench.runbenchmark.run_benchmark") - run(str(config_file), remove_resources=False, wait=False, dev_branch=None, skip_setup=False) + run( + str(config_file), + remove_resources=False, + wait=False, + dev_branch=None, + skip_setup=False, + save_hardware_metrics=False, + ) mock_open.assert_called_with(str(config_file), "r") mock_run_benchmark.assert_called_with( diff --git a/tests/unittests/evaluation/hardware_metrics/resources/expected_metrics.py b/tests/unittests/evaluation/hardware_metrics/resources/expected_metrics.py new file mode 100644 index 00000000..790d63ba --- /dev/null +++ b/tests/unittests/evaluation/hardware_metrics/resources/expected_metrics.py @@ -0,0 +1,82 @@ +metrics = [ + { + "framework": "AutoGluon", + "dataset": "credit-g", + "mode": "Training", + "fold": 0, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": 
"credit-g", + "mode": "Prediction", + "fold": 0, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": "credit-g", + "mode": "Training", + "fold": 1, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": "credit-g", + "mode": "Prediction", + "fold": 1, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": "vehicle", + "mode": "Training", + "fold": 0, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": "vehicle", + "mode": "Prediction", + "fold": 0, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": "vehicle", + "mode": "Training", + "fold": 1, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": "vehicle", + "mode": "Prediction", + "fold": 1, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, +] diff --git a/tests/unittests/evaluation/hardware_metrics/resources/results.csv b/tests/unittests/evaluation/hardware_metrics/resources/results.csv new file mode 100644 index 00000000..17ca9b21 --- /dev/null +++ b/tests/unittests/evaluation/hardware_metrics/resources/results.csv @@ -0,0 +1,5 @@ +id,task,framework,constraint,fold,type,result,metric,mode,version,params,app_version,utc,duration,training_duration,predict_duration,models_count,seed,info,acc,auc,balacc,logloss,models_ensemble_count +openml.org/t/31,credit-g,AutoGluon,test,0,binary,0.80381,auc,local,0.8.2,,2.1.6,2023-07-13T22:19:45,23.4,20.7,0.06,14,937966977,,0.72,0.80381,0.666667,0.503194,8 +openml.org/t/31,credit-g,AutoGluon,test,1,binary,0.760952,auc,local,0.8.2,,2.1.6,2023-07-13T22:20:06,20.9,17.7,0.3,14,937966978,,0.74,0.760952,0.604762,0.507339,9 +openml.org/t/53,vehicle,AutoGluon,test,0,multiclass,-0.385097,neg_logloss,local,0.8.2,,2.1.6,2023-07-13T22:20:33,24.4,22.4,0.02,14,937966977,,0.8,,0.800812,0.385097,4 +openml.org/t/53,vehicle,AutoGluon,test,1,multiclass,-0.390919,neg_logloss,local,0.8.2,,2.1.6,2023-07-13T22:20:56,23.1,21.1,0.02,14,937966978,,0.8,,0.80303,0.390919,3 diff --git a/tests/unittests/evaluation/hardware_metrics/resources/test_config.yaml b/tests/unittests/evaluation/hardware_metrics/resources/test_config.yaml new file mode 100644 index 00000000..7dc3bba2 --- /dev/null +++ b/tests/unittests/evaluation/hardware_metrics/resources/test_config.yaml @@ -0,0 +1,20 @@ +BATCH_STACK_NAME: ag-bench-batch-stack +BLOCK_DEVICE_VOLUME: 100 +CDK_DEPLOY_ACCOUNT: 123456789 +CDK_DEPLOY_REGION: us-east-2 +COMPUTE_ENV_MAXV_CPUS: 10 +CONTAINER_GPU: 1 +CONTAINER_MEMORY: 100 +CONTAINER_VCPU: 8 +DATA_BUCKET: null +INSTANCE_TYPES: +- g4dn.2xlarge +LAMBDA_FUNCTION_NAME: ag-bench-batch-job-function +METRICS_BUCKET: benchmark-test +STACK_NAME_PREFIX: ag-bench +STACK_NAME_TAG: benchmark +STATIC_RESOURCE_STACK_NAME: ag-bench-static-resource-stack +VPC_NAME: null +job_configs: + 123456-abc-efg: 
s3://benchmark-test/configs/ag_bench_20230720T102030/ag_bench_20230720T102030_split_2d42d496266911ee8df28ee9311e6528.yaml + 010101-xxx-zzz: s3://benchmark-test/configs/ag_bench_20230720T102030/ag_bench_20230720T102030_split_2d794800266911ee8df28ee9311e6528.yaml diff --git a/tests/unittests/evaluation/hardware_metrics/test_hardware_metrics.py b/tests/unittests/evaluation/hardware_metrics/test_hardware_metrics.py new file mode 100644 index 00000000..b03e7329 --- /dev/null +++ b/tests/unittests/evaluation/hardware_metrics/test_hardware_metrics.py @@ -0,0 +1,163 @@ +import datetime +import os +import unittest +from unittest.mock import ANY, MagicMock, call, patch + +import pandas as pd +import yaml + +from autogluon.bench.eval.hardware_metrics import hardware_metrics +from autogluon.bench.eval.hardware_metrics.hardware_metrics import ( + get_hardware_metrics, + get_instance_id, + get_instance_util, + get_job_ids, + get_metrics, +) + +test_dir = os.path.dirname(__file__) +config_file = os.path.join(test_dir, "resources/test_config.yaml") +if not config_file: + raise ValueError("Invalid Config File") +with open(config_file, "r") as f: + config = yaml.safe_load(f) + +hardware_metrics.aws_account_id = config["CDK_DEPLOY_ACCOUNT"] +hardware_metrics.aws_account_region = config["CDK_DEPLOY_REGION"] + +from resources.expected_metrics import metrics + +mock_cloudwatch_response = { + "Label": "CPUUtilization", + "Datapoints": [ + {"Timestamp": datetime.datetime(2023, 7, 12, 17, 39), "Average": 11.472356376239336, "Unit": "Percent"} + ], + "ResponseMetadata": { + "RequestId": "93ed0de6-7f3c-4af8-8650-2310042c97f8", + "HTTPStatusCode": 200, + "HTTPHeaders": { + "x-amzn-requestid": "93ed0de6-7f3c-4af8-8650-2310042c97f8", + "content-type": "text/xml", + "content-length": "512", + "date": "Wed, 12 Jul 2023 18:19:56 GMT", + }, + "RetryAttempts": 0, + }, +} + +mock_results_df = pd.read_csv(os.path.join(test_dir, "resources/results.csv")) + + +class TestHardwareMetrics(unittest.TestCase): + def test_get_job_ids(self): + job_ids = get_job_ids(config) + self.assertEqual(job_ids, ["123456-abc-efg", "010101-xxx-zzz"]) + + @patch("boto3.client") + def test_get_instance_id(self, mock_client): + job_ids = get_job_ids(config) + batch_client, ecs_client = MagicMock(), MagicMock() + mock_client.side_effect = [batch_client, ecs_client] + mock_batch_response = { + "jobs": [ + { + "container": { + "containerInstanceArn": "abc", + "taskArn": "arn:aws:ecs:us-east-2:123456789:task/agbenchcomputeenvironmen-DhbZ6yaLr_Batch/b3bb44aa78f", + } + } + ] + } + batch_client.describe_jobs.return_value = mock_batch_response + + mock_ecs_response = {"containerInstances": [{"ec2InstanceId": 12345}]} + ecs_client.describe_container_instances.return_value = mock_ecs_response + instance_id = get_instance_id(job_ids[0]) + self.assertEqual(instance_id, 12345) + cluster = f"arn:aws:ecs:{hardware_metrics.aws_account_region}:{hardware_metrics.aws_account_id}:cluster/agbenchcomputeenvironmen-DhbZ6yaLr_Batch" + ecs_client.describe_container_instances.assert_called_once_with(cluster=cluster, containerInstances=["abc"]) + + @patch("boto3.client") + def test_get_instance_util(self, mock_client): + cloudwatch_client = MagicMock() + mock_client.side_effect = [cloudwatch_client] + cloudwatch_client.get_metric_statistics.return_value = mock_cloudwatch_response + self.assertEqual( + get_instance_util( + namespace="namespace", + instance_id="1234", + metric="CPUUtilization", + start_time=datetime.datetime(2023, 7, 12, 17, 39), + 
end_time=datetime.datetime(2023, 7, 12, 16, 39), + cloudwatch_client=cloudwatch_client, + ), + cloudwatch_client.get_metric_statistics.return_value, + ) + + @patch("pandas.read_csv") + @patch("autogluon.bench.eval.hardware_metrics.hardware_metrics.get_instance_id") + @patch("autogluon.bench.eval.hardware_metrics.hardware_metrics.get_instance_util") + @patch("autogluon.bench.eval.hardware_metrics.hardware_metrics.find_s3_file") + def test_get_metrics(self, mock_s3_file, mock_instance_util, mock_instance_id, mock_csv): + mock_csv.return_value = mock_results_df + mock_instance_id.return_value = "12345" + job_id = list(config.get("job_configs", {}).keys())[0] + mock_instance_util.return_value = mock_cloudwatch_response + mock_list_metrics_return = { + "Metrics": [ + {"MetricName": "CPUUtilization"}, + ] + } + mock_cloudwatch_client = MagicMock() + mock_cloudwatch_client.list_metrics.return_value = mock_list_metrics_return + + metrics_list = get_metrics( + job_id=job_id, + s3_bucket="some bucket", + module="tabular", + benchmark_name="some_benchmark", + sub_folder="test_folder", + cloudwatch_client=mock_cloudwatch_client, + ) + self.assertEqual(metrics_list, metrics) + + @patch("tempfile.TemporaryDirectory") + @patch("autogluon.bench.eval.hardware_metrics.hardware_metrics.save_results") + @patch("autogluon.bench.eval.hardware_metrics.hardware_metrics.upload_to_s3") + @patch("autogluon.bench.eval.hardware_metrics.hardware_metrics.get_metrics") + @patch("boto3.client") + def test_get_hardware_metrics(self, mock_boto3, mock_metrics, mock_upload_to_s3, mock_save_results, mock_temp_dir): + mock_metrics.return_value = ["metrics"] + get_hardware_metrics(config_file, "some bucket", "tabular", "some_benchmark") + job_ids = get_job_ids(config) + calls = [ + call( + job_id=job_ids[0], + s3_bucket="some bucket", + module="tabular", + benchmark_name="some_benchmark", + sub_folder="ag_bench_20230720T102030_2d42d496266911ee8df28ee9311e6528", + cloudwatch_client=ANY, + ), + call( + job_id=job_ids[1], + s3_bucket="some bucket", + module="tabular", + benchmark_name="some_benchmark", + sub_folder="ag_bench_20230720T102030_2d794800266911ee8df28ee9311e6528", + cloudwatch_client=ANY, + ), + ] + mock_metrics.assert_has_calls(calls, any_order=False) + mock_save_results.assert_called_once() + mock_upload_to_s3.assert_called_once() + + def test_invalid_config_file(self): + with self.assertRaises(ValueError): + get_hardware_metrics(None, "some bucket", "tabular", "some_benchmark") + with self.assertRaises(FileNotFoundError): + get_hardware_metrics("incorrect config path", "some bucket", "tabular", "some_benchmark") + + +if __name__ == "__main__": + unittest.main()
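
For reference, below is a minimal usage sketch of the entry point introduced by this patch. The config path, bucket, and benchmark name are hypothetical placeholders; the same function is what `agbench run --save-hardware-metrics` invokes after all jobs succeed, and `main.py` registers it as a Typer command as well (with Typer's default naming, `agbench get-hardware-metrics`).

```python
# Minimal sketch of calling the new API directly. All values below are
# hypothetical placeholders; config_file must be the AWS config YAML produced
# by `agbench run` (it contains the `job_configs` mapping plus the CDK
# account/region used to reach AWS Batch, ECS, and CloudWatch).
from autogluon.bench.eval.hardware_metrics.hardware_metrics import get_hardware_metrics

get_hardware_metrics(
    config_file="ag_bench_20230817T123456_aws_configs.yaml",  # placeholder path
    s3_bucket="my-benchmark-bucket",                          # METRICS_BUCKET of the run
    module="tabular",                                         # tabular, timeseries, or multimodal
    benchmark_name="ag_bench_20230817T123456",
)
# Expected outcome: a hardware_metrics.csv aggregated from all jobs is uploaded
# under s3://my-benchmark-bucket/tabular/ag_bench_20230817T123456/
```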