diff --git a/README.md b/README.md
index a3352a5a..093c58b9 100644
--- a/README.md
+++ b/README.md
@@ -212,6 +212,13 @@ A variety of metrics are available for the EC2 instances that are launched during
 In addition to the standard metrics, we also provide a custom metric for `GPUUtilization`. This can be found in the `CloudWatch` section under `All metrics` -> `Custom namespaces` -> `EC2`. Please note that the `GPUUtilization` metric is also updated every five minutes.
 
+We provide an option to save aggregated (average) custom hardware metrics (`GPUUtilization` and `CPUUtilization`, logged at 5-second intervals) to the benchmark directory under the provided S3 bucket. To enable this, simply pass the option when running the benchmark:
+
+```
+agbench run --save-hardware-metrics
+```
+
+Note that this command currently waits for all jobs to complete successfully before pulling the hardware metrics.
 
 ## Evaluating benchmark runs
diff --git a/src/autogluon/bench/eval/hardware_metrics/hardware_metrics.py b/src/autogluon/bench/eval/hardware_metrics/hardware_metrics.py
new file mode 100644
index 00000000..6b0756eb
--- /dev/null
+++ b/src/autogluon/bench/eval/hardware_metrics/hardware_metrics.py
@@ -0,0 +1,301 @@
+import csv
+import logging
+import os
+import tempfile
+from datetime import datetime, timedelta
+from typing import List, Optional
+
+import boto3
+import pandas as pd
+import typer
+import yaml
+
+from autogluon.bench.utils.general_utils import upload_to_s3
+
+aws_account_id = None
+aws_account_region = None
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def find_s3_file(s3_bucket: str, prefix: str, file: str):
+    s3 = boto3.client("s3")
+    paginator = s3.get_paginator("list_objects_v2")
+    page_iterator = paginator.paginate(Bucket=s3_bucket, Prefix=prefix)
+
+    for page in page_iterator:
+        if "Contents" in page:
+            for obj in page["Contents"]:
+                if obj["Key"].endswith(file):
+                    return f"s3://{s3_bucket}/{obj['Key']}"
+    return None
+
+
+def get_job_ids(config_file: dict):
+    """
+    This function returns a list of the job IDs of all jobs run for a benchmark run.
+    Parameters
+    ----------
+    config_file: dict,
+        Loaded config dict containing job IDs under `job_configs`
+    """
+    job_ids = list(config_file.get("job_configs", {}).keys())
+    return job_ids
+
+
+def get_instance_id(job_id):
+    """
+    This function returns the instance ID of the EC2 instance that was used to run the job with the given job ID.
+    Parameters
+    ----------
+    job_id: str
+    """
+    batch_client = boto3.client("batch", region_name=aws_account_region)
+    ecs_client = boto3.client("ecs", region_name=aws_account_region)
+
+    response = batch_client.describe_jobs(jobs=[f"{job_id}"])
+    if response:
+        container_arn = response["jobs"][0]["container"]["containerInstanceArn"]
+        cluster_arn = response["jobs"][0]["container"]["taskArn"].split("/")
+        cluster = f"arn:aws:ecs:{aws_account_region}:{aws_account_id}:cluster/" + cluster_arn[1]
+
+        response = ecs_client.describe_container_instances(cluster=cluster, containerInstances=[container_arn])
+        instance_id = response["containerInstances"][0]["ec2InstanceId"]
+        return instance_id
+
+
+def get_instance_util(
+    namespace: str,
+    instance_id: str,
+    metric: str,
+    start_time: datetime,
+    end_time: datetime,
+    cloudwatch_client: boto3.client,
+    period: int = 360,
+    statistics: Optional[List[str]] = ["Average"],
+) -> dict:
+    """
+    This function returns CloudWatch utilization statistics for the given metric on the given EC2 instance over the specified time window.
+    Refer to https://docs.aws.amazon.com/cli/latest/reference/cloudwatch/get-metric-statistics.html for docs on how to interact with the CloudWatch API
+    Also refer to https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/cloudwatch/client/get_metric_statistics.html for documentation on how to interact with the API through Python
+    Parameters
+    ----------
+    instance_id: str,
+        EC2 instance ID
+    metric: str,
+        Name of metric to pass into the CloudWatch API. Example: CPUUtilization
+        Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/viewing_metrics_with_cloudwatch.html#ec2-cloudwatch-metrics
+    start_time: datetime,
+    end_time: datetime,
+    statistics: Optional[List[str]] = ["Average"],
+        The metric statistics, other than percentile. For percentile statistics, use `ExtendedStatistics`. When calling `get_metric_statistics`, you must specify either `Statistics` or `ExtendedStatistics`, but not both.
+        Examples: Average, Maximum, Minimum
+    """
+    return cloudwatch_client.get_metric_statistics(
+        Namespace=namespace,
+        MetricName=metric,
+        Dimensions=[
+            {"Name": "InstanceId", "Value": instance_id},
+        ],
+        Statistics=statistics,
+        StartTime=start_time,
+        EndTime=end_time,
+        Period=period,
+    )
+
+
+def format_metrics(
+    instance_metrics: dict,
+    framework: str,
+    dataset: str,
+    fold: int,
+    mode: str,
+    statistics: Optional[List[str]] = ["Average"],
+):
+    """
+    This function returns a formatted version of the dictionary of metrics provided by the CloudWatch API so it can be easily added to a CSV file and passed into `autogluon-dashboard`.
+    Parameters
+    ----------
+    instance_metrics: dict,
+        Dictionary of instance metrics for a given EC2 instance provided by CloudWatch
+    framework: str,
+        Name of the framework
+    dataset: str,
+        Name of the dataset
+    fold: int,
+        Fold #
+    mode: str,
+        Mode -> Training or Prediction
+    statistics: Optional[List[str]] = ["Average"],
+        The metric statistics, other than percentile. For percentile statistics, use `ExtendedStatistics`. When calling `get_metric_statistics`, you must specify either `Statistics` or `ExtendedStatistics`, but not both.
+        Examples: Average, Maximum, Minimum
+    """
+    output_dict = {}
+    output_dict["framework"] = framework
+    output_dict["dataset"] = dataset
+    output_dict["mode"] = mode
+    output_dict["fold"] = fold
+    output_dict["metric"] = instance_metrics["Label"]
+    for i in range(len(instance_metrics["Datapoints"])):
+        for stat in statistics:
+            output_dict["framework"] = framework
+            output_dict["dataset"] = dataset
+            output_dict["mode"] = mode
+            output_dict["fold"] = fold
+            output_dict["metric"] = instance_metrics["Label"]
+            output_dict["statistic_type"] = stat
+            output_dict["statistic_value"] = instance_metrics["Datapoints"][i][f"{stat}"]
+            output_dict["unit"] = instance_metrics["Datapoints"][i]["Unit"]
+    return output_dict
+
+
+def get_metrics(
+    job_id: str,
+    s3_bucket: str,
+    module: str,
+    benchmark_name: str,
+    sub_folder: str,
+    cloudwatch_client: boto3.client,
+    namespace: str = "EC2",  # CloudWatch "Custom" namespace, i.e. Custom/EC2
+):
+    """
+    Parameters
+    ----------
+    job_id: str,
+    cloudwatch_client: boto3.client,
+        Client used to query all CloudWatch metrics reported for the job's EC2 instance, e.g. CPUUtilization.
+        Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/viewing_metrics_with_cloudwatch.html#ec2-cloudwatch-metrics
+    s3_bucket: str,
+    module: str,
+    benchmark_name: str,
+    sub_folder: str,
+        Sub folder for results.csv file.
+        Passed in from the `get_hardware_metrics` function
+    namespace: str,
+        CloudWatch Metrics Namespace, default: EC2 (the custom namespace, i.e. Custom/EC2)
+    """
+    path_prefix = f"{module}/{benchmark_name}/{sub_folder}/"
+    s3_path_to_csv = find_s3_file(s3_bucket=s3_bucket, prefix=path_prefix, file="results.csv")
+    results = pd.read_csv(s3_path_to_csv)
+    metrics_list = []
+    instance_id = get_instance_id(job_id)
+    metrics_data = cloudwatch_client.list_metrics(
+        Dimensions=[
+            {"Name": "InstanceId", "Value": instance_id},
+        ],
+        Namespace=namespace,
+    )["Metrics"]
+    metrics_pool = [item["MetricName"] for item in metrics_data]
+
+    for metric in metrics_pool:
+        for i in results.index:
+            framework, dataset, utc, train_time, predict_time, fold = (
+                results["framework"][i],
+                results["task"][i],
+                results["utc"][i],
+                results["training_duration"][i],
+                results["predict_duration"][i],
+                results["fold"][i],
+            )
+            utc_dt = datetime.strptime(utc, "%Y-%m-%dT%H:%M:%S")
+            period = int((timedelta(seconds=train_time) + timedelta(seconds=predict_time)).total_seconds())
+            if period < 60:
+                period = 60
+            elif period % 60 != 0:
+                period = (period // 60) * 60  # Round down to the nearest multiple of 60
+
+            training_util = get_instance_util(
+                namespace=namespace,
+                instance_id=instance_id,
+                metric=metric,
+                start_time=utc_dt,
+                end_time=utc_dt + timedelta(seconds=train_time) + timedelta(seconds=predict_time),
+                period=period,
+                cloudwatch_client=cloudwatch_client,
+            )
+            predict_util = get_instance_util(
+                namespace=namespace,
+                instance_id=instance_id,
+                metric=metric,
+                start_time=utc_dt - timedelta(seconds=predict_time),
+                end_time=utc_dt,
+                period=period,
+                cloudwatch_client=cloudwatch_client,
+            )
+            if training_util["Datapoints"]:
+                metrics_list.append(format_metrics(training_util, framework, dataset, fold, "Training"))
+            if predict_util["Datapoints"]:
+                metrics_list.append(format_metrics(predict_util, framework, dataset, fold, "Prediction"))
+    return metrics_list
+
+
+def save_results(metrics_list: list, path: str):
+    """
+    Writes the formatted metrics to a CSV file to pass into `autogluon-dashboard`.
+    Parameters
+    ----------
+    metrics_list: list,
+        List of hardware metrics to write to CSV
+    path: str,
+        Path to save file
+    """
+    csv_headers = ["framework", "dataset", "mode", "fold", "metric", "statistic_type", "statistic_value", "unit"]
+    csv_location = os.path.join(path, "hardware_metrics.csv")
+    with open(csv_location, "w", newline="") as csvFile:
+        writer = csv.DictWriter(csvFile, fieldnames=csv_headers)
+        writer.writeheader()
+        writer.writerows(metrics_list)
+    return csv_location
+
+
+def get_hardware_metrics(
+    config_file: str = typer.Argument(help="Path to YAML config file containing job ids."),
+    s3_bucket: str = typer.Argument(help="Name of the S3 bucket to which the benchmark results were outputted."),
+    module: str = typer.Argument(help="Can be one of ['tabular', 'timeseries', 'multimodal']."),
+    benchmark_name: str = typer.Argument(
+        help="Folder name of benchmark run in which all objects with path 'scores/results.csv' get aggregated."
+    ),
+):
+    """
+    External API function (also exposed as an `agbench` CLI command) that queries CloudWatch hardware metrics for all jobs of a benchmark run and uploads them to S3.
+    Parameters
+    ----------
+    config_file: str,
+        Path to config file containing job IDs
+    s3_bucket: str,
+        Name of the S3 bucket to which the benchmark results were outputted.
+ module: str, + Benchmark module: tabular or multimodal + benchmark_name: str, + Name of the benchmark + Example: ag_bench_20230817T123456 + """ + if not config_file: + raise ValueError("Invalid Config File") + logger.info(f"Getting hardware metrics for jobs under config file: {config_file}") + with open(config_file, "r") as f: + config = yaml.safe_load(f) + job_ids = get_job_ids(config) + + global aws_account_id, aws_account_region + aws_account_id = config.get("CDK_DEPLOY_ACCOUNT") + aws_account_region = config.get("CDK_DEPLOY_REGION") + + cloudwatch_client = boto3.client("cloudwatch", region_name=aws_account_region) + + metrics_list = [] + for job_id in job_ids: + sub_folder = config["job_configs"][f"{job_id}"].split("/")[-1].split(".")[0].replace("_split", "") + metrics_list += get_metrics( + job_id=job_id, + s3_bucket=s3_bucket, + module=module, + benchmark_name=benchmark_name, + sub_folder=sub_folder, + cloudwatch_client=cloudwatch_client, + ) + if metrics_list: + with tempfile.TemporaryDirectory() as temp_dir: + local_path = save_results(metrics_list, temp_dir) + upload_to_s3(s3_bucket=s3_bucket, s3_dir=f"{module}/{benchmark_name}", local_path=local_path) diff --git a/src/autogluon/bench/frameworks/multimodal/exec.py b/src/autogluon/bench/frameworks/multimodal/exec.py index 125b0889..c378e3f7 100644 --- a/src/autogluon/bench/frameworks/multimodal/exec.py +++ b/src/autogluon/bench/frameworks/multimodal/exec.py @@ -212,7 +212,7 @@ def run( "type": predictor.problem_type, "result": scores[test_data.metric], "metric": test_data.metric, - "utc_time": utc_time, + "utc": utc_time, "training_duration": training_duration, "predict_duration": predict_duration, "scores": scores, diff --git a/src/autogluon/bench/main.py b/src/autogluon/bench/main.py index 8c5c6006..fbf678b4 100644 --- a/src/autogluon/bench/main.py +++ b/src/autogluon/bench/main.py @@ -1,6 +1,7 @@ import typer from autogluon.bench.cloud.aws.stack_handler import destroy_stack +from autogluon.bench.eval.hardware_metrics.hardware_metrics import get_hardware_metrics from autogluon.bench.eval.scripts.aggregate_amlb_results import aggregate_amlb_results from autogluon.bench.eval.scripts.run_evaluation_openml import evaluate_amlb_results from autogluon.bench.eval.scripts.run_generate_clean_openml import clean_amlb_results @@ -16,6 +17,7 @@ app.command()(aggregate_amlb_results) app.command()(clean_amlb_results) app.command()(evaluate_amlb_results) +app.command()(get_hardware_metrics) if __name__ == "__main__": app() diff --git a/src/autogluon/bench/runbenchmark.py b/src/autogluon/bench/runbenchmark.py index e495ec5b..9c52671b 100644 --- a/src/autogluon/bench/runbenchmark.py +++ b/src/autogluon/bench/runbenchmark.py @@ -14,6 +14,7 @@ from autogluon.bench import __version__ as agbench_version from autogluon.bench.cloud.aws.stack_handler import deploy_stack, destroy_stack +from autogluon.bench.eval.hardware_metrics.hardware_metrics import get_hardware_metrics from autogluon.bench.frameworks.multimodal.multimodal_benchmark import MultiModalBenchmark from autogluon.bench.frameworks.tabular.tabular_benchmark import TabularBenchmark from autogluon.bench.frameworks.timeseries.timeseries_benchmark import TimeSeriesBenchmark @@ -344,6 +345,7 @@ def run( skip_setup: bool = typer.Option( False, help="Whether to skip setting up framework in local mode, default to False." 
), + save_hardware_metrics: bool = typer.Option(False, help="Whether to query and save the hardware metrics."), ): """Main function that runs the benchmark based on the provided configuration options.""" configs = {} @@ -443,7 +445,7 @@ def run( if remove_resources: wait = True - if wait: + if wait or save_hardware_metrics: logger.info( "Waiting for jobs to complete. You can quit at anytime and the benchmark will continue to run on the cloud" ) @@ -464,6 +466,13 @@ def run( logger.error("Resources are not being removed due to errors.") else: logger.info("All job succeeded.") + if save_hardware_metrics: + get_hardware_metrics( + config_file=aws_config_path, + s3_bucket=infra_configs["METRICS_BUCKET"], + module=module, + benchmark_name=benchmark_name, + ) if remove_resources: logger.info("Removing resoureces...") destroy_stack( diff --git a/tests/unittests/benchmark/test_runbenchmarks.py b/tests/unittests/benchmark/test_runbenchmarks.py index 01f678cc..c8a71dae 100644 --- a/tests/unittests/benchmark/test_runbenchmarks.py +++ b/tests/unittests/benchmark/test_runbenchmarks.py @@ -67,6 +67,7 @@ def mock_yaml_side_effect(file_obj): mocker.patch("autogluon.bench.runbenchmark._get_benchmark_name", return_value="test_benchmark") mocker.patch("autogluon.bench.runbenchmark.formatted_time", return_value="test_time") mocker.patch("autogluon.bench.runbenchmark._dump_configs", return_value="test_dump") + mocker.patch("autogluon.bench.runbenchmark._dump_configs", return_value="test_dump") mocker.patch("os.environ.__setitem__") mock_deploy_stack = mocker.patch("autogluon.bench.runbenchmark.deploy_stack", return_value=infra_configs) @@ -74,6 +75,7 @@ def mock_yaml_side_effect(file_obj): mock_invoke_lambda = mocker.patch("autogluon.bench.runbenchmark.invoke_lambda", return_value={}) mock_wait_for_jobs = mocker.patch("autogluon.bench.runbenchmark.wait_for_jobs", return_value=[]) + mock_get_hardware_metrics = mocker.patch("autogluon.bench.runbenchmark.get_hardware_metrics") mock_destroy_stack = mocker.patch("autogluon.bench.runbenchmark.destroy_stack") mock_mount = mocker.patch("autogluon.bench.runbenchmark._mount_dir") mock_umount = mocker.patch("autogluon.bench.runbenchmark._umount_if_needed") @@ -86,6 +88,7 @@ def mock_yaml_side_effect(file_obj): "mock_invoke_lambda": mock_invoke_lambda, "mock_wait_for_jobs": mock_wait_for_jobs, "mock_destroy_stack": mock_destroy_stack, + "mock_get_hardware_metrics": mock_get_hardware_metrics, "mock_mount": mock_mount, "mock_umount": mock_umount, "cdk_context": cdk_context, @@ -177,7 +180,14 @@ def test_invoke_lambda(mocker): def test_run_aws_mode(mocker, tmp_path): setup = setup_mock(mocker, tmp_path) - run(setup["config_file"], remove_resources=False, wait=False, dev_branch=None, skip_setup=True) + run( + setup["config_file"], + remove_resources=False, + wait=False, + dev_branch=None, + skip_setup=True, + save_hardware_metrics=False, + ) setup["mock_deploy_stack"].assert_called_once_with(custom_configs=setup["custom_configs"]) setup["mock_upload_to_s3"].assert_called_once_with( @@ -185,13 +195,21 @@ def test_run_aws_mode(mocker, tmp_path): ) setup["mock_invoke_lambda"].assert_called_once_with(configs=setup["infra_configs"], config_file="test_s3_path") setup["mock_wait_for_jobs"].assert_not_called() + setup["mock_get_hardware_metrics"].assert_not_called(), setup["mock_destroy_stack"].assert_not_called() def test_run_aws_mode_remove_resources(mocker, tmp_path): setup = setup_mock(mocker, tmp_path) - run(setup["config_file"], remove_resources=True, wait=False, 
dev_branch=None, skip_setup=True) + run( + setup["config_file"], + remove_resources=True, + wait=False, + dev_branch=None, + skip_setup=True, + save_hardware_metrics=False, + ) setup["mock_deploy_stack"].assert_called_once_with(custom_configs=setup["custom_configs"]) setup["mock_upload_to_s3"].assert_called_once_with( @@ -212,7 +230,14 @@ def test_run_aws_mode_remove_resources(mocker, tmp_path): def test_run_aws_mode_wait(mocker, tmp_path): setup = setup_mock(mocker, tmp_path) - run(setup["config_file"], remove_resources=False, wait=True, dev_branch=None, skip_setup=True) + run( + setup["config_file"], + remove_resources=False, + wait=True, + dev_branch=None, + skip_setup=True, + save_hardware_metrics=False, + ) setup["mock_deploy_stack"].assert_called_once_with(custom_configs=setup["custom_configs"]) setup["mock_upload_to_s3"].assert_called_once_with( @@ -221,13 +246,21 @@ def test_run_aws_mode_wait(mocker, tmp_path): setup["mock_invoke_lambda"].assert_called_once_with(configs=setup["infra_configs"], config_file="test_s3_path") setup["mock_wait_for_jobs"].assert_called_once_with(config_file="test_dump") + setup["mock_get_hardware_metrics"].assert_not_called() def test_run_aws_mode_dev_branch(mocker, tmp_path): setup = setup_mock(mocker, tmp_path) dev_branch = "dev_branch_url" - run(setup["config_file"], remove_resources=False, wait=False, dev_branch=dev_branch, skip_setup=True) + run( + setup["config_file"], + remove_resources=False, + wait=False, + dev_branch=dev_branch, + skip_setup=True, + save_hardware_metrics=False, + ) assert os.environ["AG_BENCH_DEV_URL"] == dev_branch setup["mock_deploy_stack"].assert_called_once_with(custom_configs=setup["custom_configs"]) @@ -248,6 +281,7 @@ def test_run_aws_tabular_user_dir(mocker, tmp_path): wait=False, dev_branch="https://git_url#git_branch", skip_setup=True, + save_hardware_metrics=False, ) assert os.environ["AG_BENCH_DEV_URL"] == "https://git_url#git_branch" assert os.environ["FRAMEWORK_PATH"] == "frameworks/tabular" @@ -273,6 +307,7 @@ def test_run_aws_multimodal_custom_dataloader(mocker, tmp_path): wait=False, dev_branch="https://git_url#git_branch", skip_setup=True, + save_hardware_metrics=False, ) assert setup["custom_configs"]["custom_dataloader"]["dataloader_file"] == "dataloaders/dataset.py" assert setup["custom_configs"]["custom_dataloader"]["dataset_config_file"] == "dataloaders/datasets.yaml" @@ -296,7 +331,14 @@ def test_run_local_mode(mocker, tmp_path): mocker.patch("autogluon.bench.runbenchmark.formatted_time", return_value="test_time") mock_run_benchmark = mocker.patch("autogluon.bench.runbenchmark.run_benchmark") - run(str(config_file), remove_resources=False, wait=False, dev_branch=None, skip_setup=False) + run( + str(config_file), + remove_resources=False, + wait=False, + dev_branch=None, + skip_setup=False, + save_hardware_metrics=False, + ) mock_open.assert_called_with(str(config_file), "r") mock_run_benchmark.assert_called_with( diff --git a/tests/unittests/evaluation/hardware_metrics/resources/expected_metrics.py b/tests/unittests/evaluation/hardware_metrics/resources/expected_metrics.py new file mode 100644 index 00000000..790d63ba --- /dev/null +++ b/tests/unittests/evaluation/hardware_metrics/resources/expected_metrics.py @@ -0,0 +1,82 @@ +metrics = [ + { + "framework": "AutoGluon", + "dataset": "credit-g", + "mode": "Training", + "fold": 0, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": 
"credit-g", + "mode": "Prediction", + "fold": 0, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": "credit-g", + "mode": "Training", + "fold": 1, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": "credit-g", + "mode": "Prediction", + "fold": 1, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": "vehicle", + "mode": "Training", + "fold": 0, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": "vehicle", + "mode": "Prediction", + "fold": 0, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": "vehicle", + "mode": "Training", + "fold": 1, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, + { + "framework": "AutoGluon", + "dataset": "vehicle", + "mode": "Prediction", + "fold": 1, + "metric": "CPUUtilization", + "statistic_type": "Average", + "statistic_value": 11.472356376239336, + "unit": "Percent", + }, +] diff --git a/tests/unittests/evaluation/hardware_metrics/resources/results.csv b/tests/unittests/evaluation/hardware_metrics/resources/results.csv new file mode 100644 index 00000000..17ca9b21 --- /dev/null +++ b/tests/unittests/evaluation/hardware_metrics/resources/results.csv @@ -0,0 +1,5 @@ +id,task,framework,constraint,fold,type,result,metric,mode,version,params,app_version,utc,duration,training_duration,predict_duration,models_count,seed,info,acc,auc,balacc,logloss,models_ensemble_count +openml.org/t/31,credit-g,AutoGluon,test,0,binary,0.80381,auc,local,0.8.2,,2.1.6,2023-07-13T22:19:45,23.4,20.7,0.06,14,937966977,,0.72,0.80381,0.666667,0.503194,8 +openml.org/t/31,credit-g,AutoGluon,test,1,binary,0.760952,auc,local,0.8.2,,2.1.6,2023-07-13T22:20:06,20.9,17.7,0.3,14,937966978,,0.74,0.760952,0.604762,0.507339,9 +openml.org/t/53,vehicle,AutoGluon,test,0,multiclass,-0.385097,neg_logloss,local,0.8.2,,2.1.6,2023-07-13T22:20:33,24.4,22.4,0.02,14,937966977,,0.8,,0.800812,0.385097,4 +openml.org/t/53,vehicle,AutoGluon,test,1,multiclass,-0.390919,neg_logloss,local,0.8.2,,2.1.6,2023-07-13T22:20:56,23.1,21.1,0.02,14,937966978,,0.8,,0.80303,0.390919,3 diff --git a/tests/unittests/evaluation/hardware_metrics/resources/test_config.yaml b/tests/unittests/evaluation/hardware_metrics/resources/test_config.yaml new file mode 100644 index 00000000..7dc3bba2 --- /dev/null +++ b/tests/unittests/evaluation/hardware_metrics/resources/test_config.yaml @@ -0,0 +1,20 @@ +BATCH_STACK_NAME: ag-bench-batch-stack +BLOCK_DEVICE_VOLUME: 100 +CDK_DEPLOY_ACCOUNT: 123456789 +CDK_DEPLOY_REGION: us-east-2 +COMPUTE_ENV_MAXV_CPUS: 10 +CONTAINER_GPU: 1 +CONTAINER_MEMORY: 100 +CONTAINER_VCPU: 8 +DATA_BUCKET: null +INSTANCE_TYPES: +- g4dn.2xlarge +LAMBDA_FUNCTION_NAME: ag-bench-batch-job-function +METRICS_BUCKET: benchmark-test +STACK_NAME_PREFIX: ag-bench +STACK_NAME_TAG: benchmark +STATIC_RESOURCE_STACK_NAME: ag-bench-static-resource-stack +VPC_NAME: null +job_configs: + 123456-abc-efg: 
s3://benchmark-test/configs/ag_bench_20230720T102030/ag_bench_20230720T102030_split_2d42d496266911ee8df28ee9311e6528.yaml + 010101-xxx-zzz: s3://benchmark-test/configs/ag_bench_20230720T102030/ag_bench_20230720T102030_split_2d794800266911ee8df28ee9311e6528.yaml diff --git a/tests/unittests/evaluation/hardware_metrics/test_hardware_metrics.py b/tests/unittests/evaluation/hardware_metrics/test_hardware_metrics.py new file mode 100644 index 00000000..b03e7329 --- /dev/null +++ b/tests/unittests/evaluation/hardware_metrics/test_hardware_metrics.py @@ -0,0 +1,163 @@ +import datetime +import os +import unittest +from unittest.mock import ANY, MagicMock, call, patch + +import pandas as pd +import yaml + +from autogluon.bench.eval.hardware_metrics import hardware_metrics +from autogluon.bench.eval.hardware_metrics.hardware_metrics import ( + get_hardware_metrics, + get_instance_id, + get_instance_util, + get_job_ids, + get_metrics, +) + +test_dir = os.path.dirname(__file__) +config_file = os.path.join(test_dir, "resources/test_config.yaml") +if not config_file: + raise ValueError("Invalid Config File") +with open(config_file, "r") as f: + config = yaml.safe_load(f) + +hardware_metrics.aws_account_id = config["CDK_DEPLOY_ACCOUNT"] +hardware_metrics.aws_account_region = config["CDK_DEPLOY_REGION"] + +from resources.expected_metrics import metrics + +mock_cloudwatch_response = { + "Label": "CPUUtilization", + "Datapoints": [ + {"Timestamp": datetime.datetime(2023, 7, 12, 17, 39), "Average": 11.472356376239336, "Unit": "Percent"} + ], + "ResponseMetadata": { + "RequestId": "93ed0de6-7f3c-4af8-8650-2310042c97f8", + "HTTPStatusCode": 200, + "HTTPHeaders": { + "x-amzn-requestid": "93ed0de6-7f3c-4af8-8650-2310042c97f8", + "content-type": "text/xml", + "content-length": "512", + "date": "Wed, 12 Jul 2023 18:19:56 GMT", + }, + "RetryAttempts": 0, + }, +} + +mock_results_df = pd.read_csv(os.path.join(test_dir, "resources/results.csv")) + + +class TestHardwareMetrics(unittest.TestCase): + def test_get_job_ids(self): + job_ids = get_job_ids(config) + self.assertEqual(job_ids, ["123456-abc-efg", "010101-xxx-zzz"]) + + @patch("boto3.client") + def test_get_instance_id(self, mock_client): + job_ids = get_job_ids(config) + batch_client, ecs_client = MagicMock(), MagicMock() + mock_client.side_effect = [batch_client, ecs_client] + mock_batch_response = { + "jobs": [ + { + "container": { + "containerInstanceArn": "abc", + "taskArn": "arn:aws:ecs:us-east-2:123456789:task/agbenchcomputeenvironmen-DhbZ6yaLr_Batch/b3bb44aa78f", + } + } + ] + } + batch_client.describe_jobs.return_value = mock_batch_response + + mock_ecs_response = {"containerInstances": [{"ec2InstanceId": 12345}]} + ecs_client.describe_container_instances.return_value = mock_ecs_response + instance_id = get_instance_id(job_ids[0]) + self.assertEqual(instance_id, 12345) + cluster = f"arn:aws:ecs:{hardware_metrics.aws_account_region}:{hardware_metrics.aws_account_id}:cluster/agbenchcomputeenvironmen-DhbZ6yaLr_Batch" + ecs_client.describe_container_instances.assert_called_once_with(cluster=cluster, containerInstances=["abc"]) + + @patch("boto3.client") + def test_get_instance_util(self, mock_client): + cloudwatch_client = MagicMock() + mock_client.side_effect = [cloudwatch_client] + cloudwatch_client.get_metric_statistics.return_value = mock_cloudwatch_response + self.assertEqual( + get_instance_util( + namespace="namespace", + instance_id="1234", + metric="CPUUtilization", + start_time=datetime.datetime(2023, 7, 12, 17, 39), + 
end_time=datetime.datetime(2023, 7, 12, 16, 39), + cloudwatch_client=cloudwatch_client, + ), + cloudwatch_client.get_metric_statistics.return_value, + ) + + @patch("pandas.read_csv") + @patch("autogluon.bench.eval.hardware_metrics.hardware_metrics.get_instance_id") + @patch("autogluon.bench.eval.hardware_metrics.hardware_metrics.get_instance_util") + @patch("autogluon.bench.eval.hardware_metrics.hardware_metrics.find_s3_file") + def test_get_metrics(self, mock_s3_file, mock_instance_util, mock_instance_id, mock_csv): + mock_csv.return_value = mock_results_df + mock_instance_id.return_value = "12345" + job_id = list(config.get("job_configs", {}).keys())[0] + mock_instance_util.return_value = mock_cloudwatch_response + mock_list_metrics_return = { + "Metrics": [ + {"MetricName": "CPUUtilization"}, + ] + } + mock_cloudwatch_client = MagicMock() + mock_cloudwatch_client.list_metrics.return_value = mock_list_metrics_return + + metrics_list = get_metrics( + job_id=job_id, + s3_bucket="some bucket", + module="tabular", + benchmark_name="some_benchmark", + sub_folder="test_folder", + cloudwatch_client=mock_cloudwatch_client, + ) + self.assertEqual(metrics_list, metrics) + + @patch("tempfile.TemporaryDirectory") + @patch("autogluon.bench.eval.hardware_metrics.hardware_metrics.save_results") + @patch("autogluon.bench.eval.hardware_metrics.hardware_metrics.upload_to_s3") + @patch("autogluon.bench.eval.hardware_metrics.hardware_metrics.get_metrics") + @patch("boto3.client") + def test_get_hardware_metrics(self, mock_boto3, mock_metrics, mock_upload_to_s3, mock_save_results, mock_temp_dir): + mock_metrics.return_value = ["metrics"] + get_hardware_metrics(config_file, "some bucket", "tabular", "some_benchmark") + job_ids = get_job_ids(config) + calls = [ + call( + job_id=job_ids[0], + s3_bucket="some bucket", + module="tabular", + benchmark_name="some_benchmark", + sub_folder="ag_bench_20230720T102030_2d42d496266911ee8df28ee9311e6528", + cloudwatch_client=ANY, + ), + call( + job_id=job_ids[1], + s3_bucket="some bucket", + module="tabular", + benchmark_name="some_benchmark", + sub_folder="ag_bench_20230720T102030_2d794800266911ee8df28ee9311e6528", + cloudwatch_client=ANY, + ), + ] + mock_metrics.assert_has_calls(calls, any_order=False) + mock_save_results.assert_called_once() + mock_upload_to_s3.assert_called_once() + + def test_invalid_config_file(self): + with self.assertRaises(ValueError): + get_hardware_metrics(None, "some bucket", "tabular", "some_benchmark") + with self.assertRaises(FileNotFoundError): + get_hardware_metrics("incorrect config path", "some bucket", "tabular", "some_benchmark") + + +if __name__ == "__main__": + unittest.main()
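
For reference, below is a minimal usage sketch of the entry point introduced by this patch. The config path, bucket, and benchmark name are hypothetical placeholders; the same function is what `agbench run --save-hardware-metrics` invokes after all jobs succeed, and `main.py` registers it as a Typer command as well (with Typer's default naming, `agbench get-hardware-metrics`).

```python
# Minimal sketch of calling the new API directly. All values below are
# hypothetical placeholders; config_file must be the AWS config YAML produced
# by `agbench run` (it contains the `job_configs` mapping plus the CDK
# account/region used to reach AWS Batch, ECS, and CloudWatch).
from autogluon.bench.eval.hardware_metrics.hardware_metrics import get_hardware_metrics

get_hardware_metrics(
    config_file="ag_bench_20230817T123456_aws_configs.yaml",  # placeholder path
    s3_bucket="my-benchmark-bucket",                          # METRICS_BUCKET of the run
    module="tabular",                                         # tabular, timeseries, or multimodal
    benchmark_name="ag_bench_20230817T123456",
)
# Expected outcome: a hardware_metrics.csv aggregated from all jobs is uploaded
# under s3://my-benchmark-bucket/tabular/ag_bench_20230817T123456/
```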