Commit

Code for getting hardware metrics from EC2 instances used for benchmark jobs (#52)

* hardware metrics code

* formatting

* formatting

* Add config file

* add results.csv

* add docstrings

* address PR comments

* ran black

* fix unittest

* fix unittest

* fix

* update test

* update README

---------

Co-authored-by: Su Zhou <[email protected]>
shreyash2106 and suzhoum authored Sep 20, 2023
1 parent 92ef181 commit 0492b88
Showing 10 changed files with 638 additions and 7 deletions.
7 changes: 7 additions & 0 deletions README.md
@@ -212,6 +212,13 @@ A variety of metrics are available for the EC2 instances that are launched during

In addition to the standard metrics, we also provide a custom metric for `GPUUtilization`. This can be found in the `CloudWatch` section under `All metrics` -> `Custom namespaces` -> `EC2`. Please note that the `GPUUtilization` metric is also updated every five minutes.
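
If you prefer to pull these custom metrics programmatically rather than through the console, a minimal boto3 sketch is shown below (the region, instance ID, and time window are placeholders to adapt to your setup):

```
import boto3
from datetime import datetime, timedelta

cloudwatch = boto3.client("cloudwatch", region_name="us-east-1")  # placeholder region
response = cloudwatch.get_metric_statistics(
    Namespace="EC2",  # custom namespace where GPUUtilization is published
    MetricName="GPUUtilization",
    Dimensions=[{"Name": "InstanceId", "Value": "i-0123456789abcdef0"}],  # placeholder instance ID
    StartTime=datetime.utcnow() - timedelta(hours=1),
    EndTime=datetime.utcnow(),
    Period=300,  # the metric is updated roughly every five minutes
    Statistics=["Average"],
)
print(response["Datapoints"])
```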

We provide an option to save aggregated (average) custom hardware metrics (`GPUUtilization` and `CPUUtilization`, logged at 5-second intervals) to the benchmark directory under the provided S3 bucket. To enable it, simply pass the following option when running a benchmark:

```
agbench run --save-hardware-metrics
```

Note that this command currently waits for all jobs to succeed before pulling the hardware metrics.
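
The saved `hardware_metrics.csv` contains one row per metric and statistic, with the columns written by `save_results`:

```
framework,dataset,mode,fold,metric,statistic_type,statistic_value,unit
```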

## Evaluating benchmark runs

301 changes: 301 additions & 0 deletions src/autogluon/bench/eval/hardware_metrics/hardware_metrics.py
@@ -0,0 +1,301 @@
import csv
import logging
import os
import tempfile
from datetime import datetime, timedelta
from typing import List, Optional

import boto3
import pandas as pd
import typer
import yaml

from autogluon.bench.utils.general_utils import upload_to_s3

aws_account_id = None
aws_account_region = None

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def find_s3_file(s3_bucket: str, prefix: str, file: str):
"""Returns the S3 URI of the first object under `prefix` whose key ends with `file`, or None if no match is found."""
s3 = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(Bucket=s3_bucket, Prefix=prefix)

for page in page_iterator:
if "Contents" in page:
for obj in page["Contents"]:
if obj["Key"].endswith("results.csv"):
return f"s3://{s3_bucket}/{obj['Key']}"
return None


def get_job_ids(config: dict):
"""
This function returns a list of job IDs of all jobs that ran for a benchmark run.
Parameters
----------
config: dict,
Parsed YAML config containing job IDs under the `job_configs` key
"""
job_ids = list(config.get("job_configs", {}).keys())
return job_ids


def get_instance_id(job_id):
"""
This function returns the EC2 instance ID of the instance that was used to run the job with the given job ID.
Parameters
----------
job_id: str
"""
batch_client = boto3.client("batch", region_name=aws_account_region)
ecs_client = boto3.client("ecs", region_name=aws_account_region)

response = batch_client.describe_jobs(jobs=[f"{job_id}"])
if response:
container_arn = response["jobs"][0]["container"]["containerInstanceArn"]
cluster_arn = response["jobs"][0]["container"]["taskArn"].split("/")
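# The ECS cluster name is the second component of the task ARN (arn:...:task/<cluster-name>/<task-id>); rebuild the full cluster ARN from it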
cluster = f"arn:aws:ecs:{aws_account_region}:{aws_account_id}:cluster/" + cluster_arn[1]

response = ecs_client.describe_container_instances(cluster=cluster, containerInstances=[container_arn])
instance_id = response["containerInstances"][0]["ec2InstanceId"]
return instance_id


def get_instance_util(
namespace: str,
instance_id: str,
metric: str,
start_time: datetime,
end_time: datetime,
cloudwatch_client: boto3.client,
period: int = 360,
statistics: Optional[List[str]] = ["Average"],
) -> dict:
"""
This function queries the CloudWatch `get_metric_statistics` API for the given metric on the given EC2 instance over the specified time window.
Refer to https://docs.aws.amazon.com/cli/latest/reference/cloudwatch/get-metric-statistics.html for docs on how to interact with the CloudWatch API
Also refer to https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/cloudwatch/client/get_metric_statistics.html for documentation on how to interact with the API through Python
Parameters
----------
instance_id: str,
EC2 instance ARN
metric: str,
Name of metric to pass into the CloudWatch API. Example: CPUUtilization
Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/viewing_metrics_with_cloudwatch.html#ec2-cloudwatch-metrics
start_time: datetime,
end_time: datetime,
statistics: Optional[List[str]] = ["Average"],
The metric statistics, other than percentile. For percentile statistics, use `ExtendedStatistics`. When calling `get_metric_statistics`, you must specify either `Statistics` or `ExtendedStatistics`, but not both.
Examples: Average, Maximum, Minimum
"""
return cloudwatch_client.get_metric_statistics(
Namespace=namespace,
MetricName=metric,
Dimensions=[
{"Name": "InstanceId", "Value": instance_id},
],
Statistics=statistics,
StartTime=start_time,
EndTime=end_time,
Period=period,
)


def format_metrics(
instance_metrics: dict,
framework: str,
dataset: str,
fold: int,
mode: str,
statistics: Optional[List[str]] = ["Average"],
):
"""
This function returns a formatted version of the dictionary of metrics provided by the CloudWatch API so it can be easily added to a CSV file and passed into `autogluon-dashboard`.
Parameters
----------
instance_metrics: dict,
Dictionary of instance metrics for a given EC2 instance provided by CloudWatch
framework: str,
Name of the framework
dataset: str,
Name of the dataset
fold: int,
Fold #
mode: str,
Mode -> Training or Prediction
statistics: Optional[List[str]] = ["Average"],
The metric statistics, other than percentile. For percentile statistics, use `ExtendedStatistics`. When calling `get_metric_statistics`, you must specify either `Statistics` or `ExtendedStatistics`, but not both.
Examples: Average, Maximum, Minimum
"""
output_dict = {}
output_dict["framework"] = framework
output_dict["dataset"] = dataset
output_dict["mode"] = mode
output_dict["fold"] = fold
output_dict["metric"] = instance_metrics["Label"]
# Note: if CloudWatch returns multiple datapoints or statistics, later values overwrite earlier ones
for datapoint in instance_metrics["Datapoints"]:
for stat in statistics:
output_dict["statistic_type"] = stat
output_dict["statistic_value"] = datapoint[stat]
output_dict["unit"] = datapoint["Unit"]
return output_dict


def get_metrics(
job_id: str,
s3_bucket: str,
module: str,
benchmark_name: str,
sub_folder: str,
cloudwatch_client: boto3.client,
namespace: str = "EC2", # CloudWatch "Custom" namespace, i.e. Custom/EC2
):
"""
Parameters
----------
job_id: str,
metrics: list,
List of metrics to pass into the CloudWatch API. Example: CPUUtilization
Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/viewing_metrics_with_cloudwatch.html#ec2-cloudwatch-metrics
s3_bucket: str,
module: str,
benchmark_name: str,
sub_folder: str,
Sub folder for results.csv file.
Passed in from `get_hardware_metrics` function
namespace: str,
CloudWatch Metrics Namespace, default: AWS/EC2
"""
path_prefix = f"{module}/{benchmark_name}/{sub_folder}/"
s3_path_to_csv = find_s3_file(s3_bucket=s3_bucket, prefix=path_prefix, file="results.csv")
results = pd.read_csv(s3_path_to_csv)
metrics_list = []
instance_id = get_instance_id(job_id)
metrics_data = cloudwatch_client.list_metrics(
Dimensions=[
{"Name": "InstanceId", "Value": instance_id},
],
Namespace=namespace,
)["Metrics"]
metrics_pool = [item["MetricName"] for item in metrics_data]

for metric in metrics_pool:
for i in results.index:
framework, dataset, utc, train_time, predict_time, fold = (
results["framework"][i],
results["task"][i],
results["utc"][i],
results["training_duration"][i],
results["predict_duration"][i],
results["fold"][i],
)
utc_dt = datetime.strptime(utc, "%Y-%m-%dT%H:%M:%S")
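# CloudWatch requires the period to be at least 60 seconds and a multiple of 60 for standard-resolution metrics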
period = int((timedelta(seconds=train_time) + timedelta(seconds=predict_time)).total_seconds())
if period < 60:
period = 60
elif period % 60 != 0:
period = (period // 60) * 60 # Round down to the nearest multiple of 60

training_util = get_instance_util(
namespace=namespace,
instance_id=instance_id,
metric=metric,
start_time=utc_dt,
end_time=utc_dt + timedelta(seconds=train_time) + timedelta(seconds=predict_time),
period=period,
cloudwatch_client=cloudwatch_client,
)
predict_util = get_instance_util(
namespace=namespace,
instance_id=instance_id,
metric=metric,
start_time=utc_dt - timedelta(minutes=predict_time),
end_time=utc_dt,
period=period,
cloudwatch_client=cloudwatch_client,
)
if training_util["Datapoints"]:
metrics_list.append(format_metrics(training_util, framework, dataset, fold, "Training"))
if predict_util["Datapoints"]:
metrics_list.append(format_metrics(predict_util, framework, dataset, fold, "Prediction"))
return metrics_list


def save_results(metrics_list: list, path: str):
"""
Writes the formatted dictionary of metrics to a csv to pass into `autogluon-dashboard`.
Parameters
----------
metrics_list: list,
List of hardware metrics to write to CSV
path: str:
Path to save file
"""
csv_headers = ["framework", "dataset", "mode", "fold", "metric", "statistic_type", "statistic_value", "unit"]
csv_location = os.path.join(path, "hardware_metrics.csv")
with open(csv_location, "w", newline="") as csvFile:
writer = csv.DictWriter(csvFile, fieldnames=csv_headers)
writer.writeheader()
writer.writerows(metrics_list)
return csv_location


def get_hardware_metrics(
config_file: str = typer.Argument(help="Path to YAML config file containing job ids."),
s3_bucket: str = typer.Argument(help="Name of the S3 bucket to which the benchmark results were outputted."),
module: str = typer.Argument(help="Can be one of ['tabular', 'timeseries', 'multimodal']."),
benchmark_name: str = typer.Argument(
help="Folder name of benchmark run in which all objects with path 'scores/results.csv' get aggregated."
),
):
"""
External API function to interact with the script.
Parameters
----------
config_file: str,
Path to config file containing job IDs
s3_bucket: str,
Name of the S3 bucket to which the benchmark results were outputted.
module: str,
Benchmark module: one of tabular, timeseries or multimodal
benchmark_name: str,
Name of the benchmark
Example: ag_bench_20230817T123456
"""
if not config_file:
raise ValueError("Invalid Config File")
logger.info(f"Getting hardware metrics for jobs under config file: {config_file}")
with open(config_file, "r") as f:
config = yaml.safe_load(f)
job_ids = get_job_ids(config)

global aws_account_id, aws_account_region
aws_account_id = config.get("CDK_DEPLOY_ACCOUNT")
aws_account_region = config.get("CDK_DEPLOY_REGION")

cloudwatch_client = boto3.client("cloudwatch", region_name=aws_account_region)

metrics_list = []
for job_id in job_ids:
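# Derive the results sub folder from the job config file name (file name without extension, with the "_split" suffix removed)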
sub_folder = config["job_configs"][f"{job_id}"].split("/")[-1].split(".")[0].replace("_split", "")
metrics_list += get_metrics(
job_id=job_id,
s3_bucket=s3_bucket,
module=module,
benchmark_name=benchmark_name,
sub_folder=sub_folder,
cloudwatch_client=cloudwatch_client,
)
if metrics_list:
with tempfile.TemporaryDirectory() as temp_dir:
local_path = save_results(metrics_list, temp_dir)
upload_to_s3(s3_bucket=s3_bucket, s3_dir=f"{module}/{benchmark_name}", local_path=local_path)
2 changes: 1 addition & 1 deletion src/autogluon/bench/frameworks/multimodal/exec.py
Original file line number Diff line number Diff line change
@@ -212,7 +212,7 @@ def run(
"type": predictor.problem_type,
"result": scores[test_data.metric],
"metric": test_data.metric,
"utc_time": utc_time,
"utc": utc_time,
"training_duration": training_duration,
"predict_duration": predict_duration,
"scores": scores,
2 changes: 2 additions & 0 deletions src/autogluon/bench/main.py
@@ -1,6 +1,7 @@
import typer

from autogluon.bench.cloud.aws.stack_handler import destroy_stack
from autogluon.bench.eval.hardware_metrics.hardware_metrics import get_hardware_metrics
from autogluon.bench.eval.scripts.aggregate_amlb_results import aggregate_amlb_results
from autogluon.bench.eval.scripts.run_evaluation_openml import evaluate_amlb_results
from autogluon.bench.eval.scripts.run_generate_clean_openml import clean_amlb_results
@@ -16,6 +17,7 @@
app.command()(aggregate_amlb_results)
app.command()(clean_amlb_results)
app.command()(evaluate_amlb_results)
app.command()(get_hardware_metrics)

if __name__ == "__main__":
app()
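
Since `get_hardware_metrics` is registered as a Typer command, it should also be invocable standalone; assuming Typer's default dash-separated command naming, a call might look like the following (the config path and bucket name are placeholders):

```
agbench get-hardware-metrics /path/to/aws_configs.yaml my-benchmark-bucket tabular ag_bench_20230817T123456
```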
11 changes: 10 additions & 1 deletion src/autogluon/bench/runbenchmark.py
@@ -14,6 +14,7 @@

from autogluon.bench import __version__ as agbench_version
from autogluon.bench.cloud.aws.stack_handler import deploy_stack, destroy_stack
from autogluon.bench.eval.hardware_metrics.hardware_metrics import get_hardware_metrics
from autogluon.bench.frameworks.multimodal.multimodal_benchmark import MultiModalBenchmark
from autogluon.bench.frameworks.tabular.tabular_benchmark import TabularBenchmark
from autogluon.bench.frameworks.timeseries.timeseries_benchmark import TimeSeriesBenchmark
@@ -344,6 +345,7 @@ def run(
skip_setup: bool = typer.Option(
False, help="Whether to skip setting up framework in local mode, default to False."
),
save_hardware_metrics: bool = typer.Option(False, help="Whether to query and save the hardware metrics."),
):
"""Main function that runs the benchmark based on the provided configuration options."""
configs = {}
@@ -443,7 +445,7 @@ def run(

if remove_resources:
wait = True
if wait:
if wait or save_hardware_metrics:
logger.info(
"Waiting for jobs to complete. You can quit at anytime and the benchmark will continue to run on the cloud"
)
@@ -464,6 +466,13 @@
logger.error("Resources are not being removed due to errors.")
else:
logger.info("All job succeeded.")
if save_hardware_metrics:
get_hardware_metrics(
config_file=aws_config_path,
s3_bucket=infra_configs["METRICS_BUCKET"],
module=module,
benchmark_name=benchmark_name,
)
if remove_resources:
logger.info("Removing resoureces...")
destroy_stack(