From 58de12a24ebff2c3399ebdba57a466e8c92646f4 Mon Sep 17 00:00:00 2001 From: Matthieu Dufour Date: Fri, 10 Jan 2025 16:13:46 +0000 Subject: [PATCH] chore(benchmarks): tidy up benchmark (#292) Delete leftover script files. Add minor change to Jupyter Notebook (`"ec2_metadata"` key in results table). Simplify pyproject.toml dependencies list. Change some example parameters in the Hydra config files, add clarifying comments. Rework the READMEs. Tune the utils/prepare_nvme.sh to work for both Amazon Linux and Ubuntu EC2 instances. Update global .gitignore. Delete utils/prepare_ec2_instance.sh, and add its content to the README. For dataset scenario, add training time measurement around epochs. Minor Python code improvements. --- .gitignore | 6 +- s3torchbenchmarking/README.md | 255 +++++++++--------- .../benchmark_results_aggregator.ipynb | 25 +- s3torchbenchmarking/conf/aws/s3.yaml | 5 - s3torchbenchmarking/conf/dataset.yaml | 34 ++- s3torchbenchmarking/conf/dcp.yaml | 25 +- .../conf/lightning_checkpointing.yaml | 18 +- .../conf/pytorch_checkpointing.yaml | 16 +- s3torchbenchmarking/pyproject.toml | 25 +- .../s3torchbenchmarking/benchmark_utils.py | 31 +-- .../src/s3torchbenchmarking/datagen.py | 44 +-- .../s3torchbenchmarking/dataset/benchmark.py | 19 +- .../src/s3torchbenchmarking/dcp/README.md | 165 +----------- .../src/s3torchbenchmarking/hydra_callback.py | 2 +- .../src/s3torchbenchmarking/models.py | 37 +-- .../utils/prepare_and_run_benchmark.sh | 15 -- .../utils/prepare_ec2_instance.sh | 23 -- s3torchbenchmarking/utils/prepare_nvme.sh | 16 +- s3torchbenchmarking/utils/run_benchmarks.sh | 19 +- .../utils/run_checkpoint_benchmarks.sh | 2 +- .../utils/run_dataloading_benchmarks.sh | 32 --- .../utils/run_dataset_benchmarks.sh | 3 +- .../utils/run_dcp_benchmarks.sh | 2 +- .../utils/run_lightning_benchmarks.sh | 2 +- .../utils/upload_colated_results_to_s3.py | 50 ---- 25 files changed, 315 insertions(+), 556 deletions(-) delete mode 100644 s3torchbenchmarking/conf/aws/s3.yaml delete mode 100755 s3torchbenchmarking/utils/prepare_and_run_benchmark.sh delete mode 100755 s3torchbenchmarking/utils/prepare_ec2_instance.sh delete mode 100755 s3torchbenchmarking/utils/run_dataloading_benchmarks.sh delete mode 100644 s3torchbenchmarking/utils/upload_colated_results_to_s3.py diff --git a/.gitignore b/.gitignore index 6b9126ed..e6b9f81d 100644 --- a/.gitignore +++ b/.gitignore @@ -60,8 +60,10 @@ venv.bak/ .dmypy.json dmypy.json -# Hydra (https://hydra.cc/) -multirun/ +# PyTorch benchmarks: Hydra, NVMe directory, and CSV results +s3torchbenchmarking/**/multirun/ +s3torchbenchmarking/**/nvme/ +s3torchbenchmarking/**/*.csv # Rust .gitignore (https://github.com/github/gitignore/blob/main/Rust.gitignore) -- cherry-picked ###################### diff --git a/s3torchbenchmarking/README.md b/s3torchbenchmarking/README.md index ddae9a4c..5a07164d 100644 --- a/s3torchbenchmarking/README.md +++ b/s3torchbenchmarking/README.md @@ -1,179 +1,180 @@ -# Benchmarking the S3 Connector for PyTorch +# s3torchbenchmarking -This directory contains a modular component for the experimental evaluation of the performance of the Amazon S3 Connector for -PyTorch. -The goal of this component is to be able to run performance benchmarks for PyTorch connectors in an easy-to-reproduce and -extensible fashion. This way, users can experiment with different settings and arrive at the optimal configuration for their workloads, -before committing to a setup. 
+This Python package houses a set of benchmarks for experimentally evaluating the performance of +the **Amazon S3 Connector for PyTorch** library. -By managing complex configuration space with [Hydra](https://hydra.cc/) we are able to define modular configuration pieces mapped to various -stages of the training pipeline. This approach allows one to mix and match configurations and measure the performance -impact to the end-to-end training process. +With the use of the [Hydra](https://hydra.cc/) framework, we are able to define modular configuration pieces mapped to +various stages of the training pipeline. This approach allows one to mix and match configurations and measure the +performance impact to the end-to-end training process. -There are **three scenarios** available: +**Four scenarios** are available: -- **Data loading benchmarks**: measure our connector against other Dataset classes (i.e., classes used to fetch and - index actual datasets); all save to S3. -- **PyTorch Lightning Checkpointing benchmarks**: measure our connector, using the PyTorch Lightning framework, against - the latter default implementation of checkpointing. -- **PyTorch’s Distributed Checkpointing (DCP) benchmarks**: measure our connector against PyTorch default distributed - checkpointing mechanism — learn more in [this dedicated README](src/s3torchbenchmarking/dcp/README.md). +1. **Dataset benchmarks** + - Compare our connector against other Dataset classes + - All scenarios save data to S3 + - Measure performance in data fetching and indexing +2. **PyTorch's Distributed Checkpointing (DCP) benchmarks** + - Assess our connector's performance versus PyTorch's default distributed checkpointing mechanism + - For detailed information, refer to the [dedicated DCP `README`](src/s3torchbenchmarking/dcp/README.md) +3. **PyTorch Lightning Checkpointing benchmarks** + - Evaluate our connector within the PyTorch Lightning framework + - Compare against PyTorch Lightning's default checkpointing implementation +4. **PyTorch Checkpointing benchmarks** + - TODO! -## Getting Started +## Getting started -The benchmarking code is available within the `src/s3torchbenchmarking` module. +The benchmarking code is located in the `src/s3torchbenchmarking` module. The scenarios are designed to be run on an EC2 +instance with one (or many) GPU(s). -The tests can be run locally, or you can launch an EC2 instance with a GPU (we used a [g5.2xlarge][g5.2xlarge]), -choosing the [AWS Deep Learning AMI GPU PyTorch 2.5 (Ubuntu 22.04)][dl-ami] as your AMI. +### EC2 instance setup (recommended) -First, activate the Conda env within this machine by running: +From your EC2 AWS Console, launch an instance with one (or many) GPU(s) (e.g., G5 instance type); we recommend using +an [AWS Deep Learning AMI (DLAMI)][dlami], such +as [AWS Deep Learning AMI GPU PyTorch 2.5 (Amazon Linux 2023)][dlami-pytorch]. + +> [!NOTE] +> Some benchmarks can be long-running. To avoid the shortcomings around expired AWS tokens, we recommend attaching a +> role to your EC2 instance with: +> +> - Full access to S3 +> - (Optional) Full access to DynamoDB — for writing run results +> +> See the [Running the benchmarks](#running-the-benchmarks) section for more details. + +For optimal results, it is recommended to run the benchmarks on a dedicated EC2 instance _without_ other +resource-intensive processes. 
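Before kicking off long-running scenarios, it is worth confirming that the instance's GPU(s) are actually visible to PyTorch (once the environment described in the next section is set up). The snippet below is a minimal, illustrative sanity check; it only assumes that `torch` is importable:

```python
# Minimal GPU visibility check (illustrative; assumes `torch` is importable).
import torch

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No CUDA device visible: NCCL-based scenarios will not run.")
```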
+ +### Creating a new Conda environment (env) + +> [!WARNING] +> While some DLAMIs provide a pre-configured Conda env (`source activate pytorch`), we have observed compatibility +> issues with the latest PyTorch versions (2.5.X) at the time of writing. We recommend creating a new one from scratch +> as detailed below. + +Once your instance is running, `ssh` into it, and create a new Conda env: ```shell -source activate pytorch +conda create -n pytorch-benchmarks python=3.12 +conda init ``` -If running locally you can optionally configure a Python venv: +Then, activate it (_you will need to log out and in again in the meantime, as signaled by `conda init`_): ```shell -python -m venv -source /bin/activate +source activate pytorch-benchmarks ``` -Then, `cd` to the `s3torchbenchmarking` directory, and run the `utils/prepare_ec2_instance.sh` script: the latter will -take care of updating the instance's packages (through either `yum` or `apt`), install Mountpoint for Amazon S3, and -install the required Python packages. +Finally, from within this directory, install the `s3torchbenchmarking` module: + +```shell +# `-e` so local modifications get picked up, if any +pip install -e . +``` > [!NOTE] -> Some errors may arise while trying to run the benchmarks; below are some workarounds to execute in such cases. - -- Error `RuntimeError: operator torchvision::nms does not exist` while trying the run the benchmarks: - ```shell - conda install -y pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia - ``` -- Error `TypeError: canonicalize_version() got an unexpected keyword argument 'strip_trailing_zero'` while trying to - install `s3torchbenchmarking` package: - ```shell - pip install "setuptools<71" - ``` +> For some scenarios, you may be required to install the [Mountpoint for Amazon S3][mountpoint-s3] file client: please +> refer to their README for instructions. ### (Pre-requisite) Configure AWS Credentials -The commands provided below (`datagen.py`, `benchmark.py`) rely on the -standard [AWS credential discovery mechanism][credentials]. Supplement the command as necessary to ensure the AWS -credentials are made available to the process, e.g., by setting the `AWS_PROFILE` environment variable. +The benchmarks and other commands provided below rely on the standard [AWS credential discovery mechanism][credentials]. +Supplement the command as necessary to ensure the AWS credentials are made available to the process, e.g., by setting +the `AWS_PROFILE` environment variable. -### Configuring the dataset +### Creating a dataset (optional; for "dataset" benchmarks only) -_Note: This is a one-time setup for each dataset configuration. The dataset configuration files, once created locally -and can be used in subsequent benchmarks, as long as the dataset on the S3 bucket is intact._ +You can use your own dataset for the benchmarks, or you can generate one on-the-fly using the `s3torch-datagen` command. -If you already have a dataset, you only need upload it to an S3 bucket and set up a YAML file under -`./conf/dataset/` in the following format: - -```yaml -# custom_dataset.yaml +Here are some sample dataset configurations that we ran our benchmarks against: -prefix_uri: s3://// -region: -sharding: TAR|null # if the samples have been packed into TAR archives. 
+```shell
+s3torch-datagen -n 100k --shard-size 128MiB --s3-bucket my-bucket --region us-east-1
 ```
 
-This dataset can then be referenced in an experiment with an entry like `dataset: custom_dataset` (note that we're
-omitting the *.yaml extension). This will result in running the benchmarks against this dataset. Some experiments have
-already been defined for reference - see `./conf/dataloading.yaml` or `./conf/sharding.yaml`.
+## Running the benchmarks
 
-_Note: Ensure the bucket is in the same region as the EC2 instance to eliminate network latency effects in your
-measurements._
+You can run the different benchmarks by editing their corresponding config files, then running one of these shell
+scripts (specifically, you must provide a value for all keys marked with `???`):
 
-Alternatively, you can use the `s3torch-datagen` command to procedurally generate an image dataset and upload it to
-Amazon S3. The script also creates a Hydra configuration file at the appropriate path.
+```shell
+# Dataset benchmarks
+vim ./conf/dataset.yaml            # 1. edit config
+./utils/run_dataset_benchmarks.sh  # 2. run scenario
 
-```
-$ s3torch-datagen --help
-Usage: s3torch-datagen [OPTIONS]
-
-  Synthesizes a dataset that will be used for benchmarking and uploads it to
-  an S3 bucket.
-
-Options:
-  -n, --num-samples FLOAT  Number of samples to generate. Can be supplied as
-                           an IEC or SI prefix. Eg: 1k, 2M. Note: these are
-                           case-sensitive notations.  [default: 1k]
-  --resolution TEXT        Resolution written in 'widthxheight' format
-                           [default: 496x387]
-  --shard-size TEXT        If supplied, the images are grouped into tar files
-                           of the given size. Size can be supplied as an IEC
-                           or SI prefix. Eg: 16Mib, 4Kb, 1Gib. Note: these are
-                           case-sensitive notations.
-  --s3-bucket TEXT         S3 Bucket name. Note: Ensure the credentials are
-                           made available either through environment variables
-                           or a shared credentials file.  [required]
-  --s3-prefix TEXT         Optional S3 Key prefix where the dataset will be
-                           uploaded. Note: a prefix will be autogenerated. eg:
-                           s3:///1k_256x256_16Mib_sharded/
-  --region TEXT            Region where the S3 bucket is hosted.  [default:
-                           us-east-1]
-  --help                   Show this message and exit.
+# PyTorch Checkpointing benchmarks
+vim ./conf/pytorch_checkpointing.yaml  # 1. edit config
+./utils/run_checkpoint_benchmarks.sh   # 2. run scenario
+
+# PyTorch Lightning Checkpointing benchmarks
+vim ./conf/lightning_checkpointing.yaml  # 1. edit config
+./utils/run_lightning_benchmarks.sh      # 2. run scenario
+
+# PyTorch’s Distributed Checkpointing (DCP) benchmarks
+vim ./conf/dcp.yaml            # 1. edit config
+./utils/run_dcp_benchmarks.sh  # 2. run scenario
 ```
 
-Here are some sample dataset configurations that we ran our benchmarks against:
+> [!NOTE]
+> Ensure the bucket is in the same region as the EC2 instance to eliminate network latency effects in your
+> measurements.
 
-- `-n 20k --resolution 496x387`
-- `-n 20k --resolution 496x387 --shard-size {4, 8, 16, 32, 64}MiB`
+Each of these scripts relies on Hydra config files, located under the [`conf`](conf) directory. You may edit them as
+you see fit to configure the runs; in particular, the parameters under `hydra.sweeper.params` will create as many jobs
+as the Cartesian product of their values (illustrated below).
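As a concrete illustration of the job expansion (the parameter names and values below simply mirror the examples from `conf/dcp.yaml`), this is how the basic Hydra sweeper turns comma-separated `hydra.sweeper.params` entries into individual jobs:

```python
# Illustration of how comma-separated sweeper params expand into jobs:
# one job per element of the Cartesian product of the value lists.
# The parameter names/values below are examples only (modelled on conf/dcp.yaml).
from itertools import product

params = {
    "+model": ["vit-base", "T0_3B"],
    "+backend": ["nccl"],
    "+world_size": ["4"],
    "+thread_count": ["4"],
    "+checkpoint.storage": ["disk", "s3"],
}

combos = list(product(*params.values()))
print(f"{len(combos)} jobs")  # 2 * 1 * 1 * 1 * 2 = 4 jobs
for i, combo in enumerate(combos):
    print(i, dict(zip(params, combo)))
```

Keeping the product small is the easiest way to keep the total benchmark time under control.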
-Example: +Also, as the scripts pass the inline parameters you give them to Hydra, you may override their behaviors this way: -``` -$ s3torch-datagen -n 20k \ - --resolution 496x387 \ - --shard-size 4MB \ - --s3-bucket swift-benchmark-dataset \ - --region eu-west-2 - -Generating data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1243.50it/s] -Uploading to S3: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 3378.87it/s] -Dataset uploaded to: s3://swift-benchmark-dataset/20k_496x387_images_4MB_shards/ -Dataset Configuration created at: ./conf/dataset/20k_496x387_images_4MB_shards.yaml -Configure your experiment by setting the entry: - dataset: 20k_496x387_images_4MB_shards -Alternatively, you can run specify it on the cmd-line when running the benchmark like so: - s3torch-benchmark -cd conf -m -cn 'dataset=20k_496x387_images_4MB_shards' +```shell +./utils/run_dataset_benchmarks.sh +disambiguator=some_key ``` ---- +## Getting the results -Finally, once the dataset and other configuration modules have been defined, you can kick off the benchmark by running: +### Scenario organization -```shell -# For data loading benchmarks: -$ . utils/run_dataset_benchmarks.sh +Benchmark results are organized as follows, inside a default `./multirun` directory (e.g.): + +``` +./multirun +└── dataset + └── 2024-12-20_13-42-27 + ├── 0 + │ ├── benchmark.log + │ └── job_results.json + ├── 1 + │ ├── benchmark.log + │ └── job_resutls.json + ├── multirun.yaml + └── run_results.json +``` -# For PyTorch Checkpointing benchmarks: -$ . utils/run_checkpoints_benchmarks.sh +Scenarios are organized at the top level, each in its own directory named after the scenario (e.g., `dataset`). Within +each scenario directory, you'll find individual run directories, automatically named by Hydra using the creation +timestamp (e.g., `2024-12-20_13-42-27`). -# For PyTorch Lightning Checkpointing benchmarks: -$ . utils/run_lighning_benchmarks.sh +Each run directory contains job subdirectories (e.g., `0`, `1`, etc.), corresponding to a specific subset of parameters. -# For PyTorch’s Distributed Checkpointing (DCP) benchmarks: -$ . utils/run_dcp_benchmarks.sh -``` +### Experiment reporting -_Note: For overriding any other benchmark parameters, see [Hydra Overrides][hydra-overrides]. You can also run -`s3torch-benchmark --hydra-help` to learn more._ +Experiments will report various metrics, such as throughput and processed time — the exact types vary per scenarios. +Results are stored in two locations: -Experiments will report various metrics, like throughput, processed time, etc. The results for individual jobs and runs -(one run will contain 1 to N jobs) will be written out to dedicated files, respectively `job_results.json` and -`run_results.json`, within their corresponding output directory (see the YAML config files). +1. In the job subdirectories: + - `benchmark.log`: Individual job logs (collected by Hydra) + - `job_results.json`: Individual job results +2. In the run directory: + - `multirun.yaml`: Global Hydra configuration for the run + - `run_results.json`: Comprehensive run results, including additional metadata -## Next Steps +If a DynamoDB table is defined in the [`conf/aws/dynamodb.yaml`](conf/aws/dynamodb.yaml) configuration file, results +will also be written to the specified table. 
-- Add more models (LLMs?) to monitor training performance. -- Support plugging in user-defined models and automatic discovery of the same. +[dlami]: https://docs.aws.amazon.com/dlami/ -[g5.2xlarge]: https://aws.amazon.com/ec2/instance-types/g5/ +[dlami-pytorch]: https://aws.amazon.com/releasenotes/aws-deep-learning-ami-gpu-pytorch-2-5-amazon-linux-2023/ -[dl-ami]: https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html +[mountpoint-s3]: https://github.com/awslabs/mountpoint-s3/tree/main [credentials]: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html diff --git a/s3torchbenchmarking/benchmark_results_aggregator.ipynb b/s3torchbenchmarking/benchmark_results_aggregator.ipynb index c06fb9c1..315be844 100644 --- a/s3torchbenchmarking/benchmark_results_aggregator.ipynb +++ b/s3torchbenchmarking/benchmark_results_aggregator.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "6522fc8a931ffbc3", "metadata": { "ExecuteTime": { @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "a371fc9062af6126", "metadata": { "ExecuteTime": { @@ -76,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "id": "e14b9efad6ae3ad6", "metadata": { "ExecuteTime": { @@ -127,6 +127,7 @@ " ),\n", " **metrics_averaged,\n", " \"config\": job_result[\"config\"],\n", + " \"ec2_metadata\": run_result[\"ec2_metadata\"],\n", " }\n", " rows.append(row)\n", "\n", @@ -143,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "id": "be008fb6acf09055", "metadata": { "ExecuteTime": { @@ -170,14 +171,18 @@ "source": [ "import pandas as pd\n", "\n", - "_data = transform(_run_results)\n", - "_table = pd.json_normalize(_data).set_index(\"version\")\n", + "_table = pd.DataFrame()\n", + "\n", + "if _run_results:\n", + " _data = transform(_run_results)\n", + " _table = pd.json_normalize(_data).set_index(\"version\")\n", + "\n", "_table" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "b4eed2752e6add17", "metadata": { "ExecuteTime": { @@ -191,7 +196,11 @@ "import random\n", "\n", "_suffix = \"\".join(random.choices(string.ascii_letters, k=5))\n", - "_table.to_csv(f\"benchmark_results_{_suffix}.csv\")" + "_filename = f\"benchmark_results_{_suffix}.csv\"\n", + "\n", + "if not _table.empty:\n", + " _table.to_csv(_filename)\n", + " print(f\"CSV written to {_filename}\")" ] } ], diff --git a/s3torchbenchmarking/conf/aws/s3.yaml b/s3torchbenchmarking/conf/aws/s3.yaml deleted file mode 100644 index 4340abf2..00000000 --- a/s3torchbenchmarking/conf/aws/s3.yaml +++ /dev/null @@ -1,5 +0,0 @@ -# @package _global_ -# S3 config; used for checkpoint storage -s3: - region: ??? - uri: ??? diff --git a/s3torchbenchmarking/conf/dataset.yaml b/s3torchbenchmarking/conf/dataset.yaml index 83ed5e07..69f65b43 100644 --- a/s3torchbenchmarking/conf/dataset.yaml +++ b/s3torchbenchmarking/conf/dataset.yaml @@ -1,19 +1,25 @@ defaults: - - hydra/callbacks: - - collate_results - - aws: - - s3 - - dynamodb # save run results to DynamoDB (see also conf/aws/dynamodb.yaml) -- comment me if not required + - hydra/callbacks/collate_results + - aws/dynamodb # save run results to DynamoDB -- comment me if not required - _self_ -prefix_uri: ??? # where the dataset are stored in S3 -region: ??? -sharding: False -epochs: 1 +# S3 bucket where the dataset is stored. +# NOTE: a non-existing bucket will fail the benchmarks. 
+s3: + region: ??? # e.g., eu-west-1 + bucket: ??? # e.g., my-bucket (*not* an S3 URI) +# Boolean flag to tell whether the dataset is sharded or not. +sharding: True +# Number of iterations for training a model. +epochs: 10 checkpoint: - save_one_in: 25 + # Number of training steps between checkpoints. + save_one_in: 0 + # Checkpoint storage location. destination: disk + # Path for checkpoint saving (local disk or S3 URI). uri: ./nvme/checkpoints/ + # S3 region. region: eu-west-2 hydra: @@ -22,5 +28,9 @@ hydra: dir: multirun/${hydra.job.config_name}/${now:%Y-%m-%d_%H-%M-%S} sweeper: params: - +model: entitlement, vit - +dataloader: s3iterabledataset, s3mapdataset, fsspec, mountpoint, mountpointcache + # Name of a model (valid options: "entitlement", "vit"). + +model: entitlement + # Kind of the dataloader (valid options: "fsspec", "s3iterabledataset", "mountpoint", "mountpointcache"). + +dataloader: fsspec, s3iterabledataset, mountpoint, mountpointcache + # Dataset name (corresponds to the name of a folder in S3); will be used to build an S3 URI + +dataset: 100k_496x387_images diff --git a/s3torchbenchmarking/conf/dcp.yaml b/s3torchbenchmarking/conf/dcp.yaml index 7da5665d..0aa5b3ba 100644 --- a/s3torchbenchmarking/conf/dcp.yaml +++ b/s3torchbenchmarking/conf/dcp.yaml @@ -1,11 +1,15 @@ defaults: - - hydra/callbacks: - - collate_results - - aws: - - s3 - - dynamodb # save run results to DynamoDB (see also conf/aws/dynamodb.yaml) -- comment me if not required + - hydra/callbacks/collate_results + - aws/dynamodb # save run results to DynamoDB -- comment me if not required - _self_ +# S3 bucket to use to save checkpoints. +# NOTE: a non-existing bucket will fail the benchmarks. +s3: + region: ??? # e.g., eu-west-1 + uri: ??? # e.g., s3://my-bucket/ +# Number of iterations for "saving" a model's checkpoint. +# NOTE: this does not affect model training, as no actual training occurs in these benchmarks. epochs: 4 hydra: @@ -14,8 +18,13 @@ hydra: dir: multirun/${hydra.job.config_name}/${now:%Y-%m-%d_%H-%M-%S} sweeper: params: + # Short name of a pre-trained model (from Hugging Face), listed in `models.py`. +model: vit-base, T0_3B - +backend: nccl, gloo # nccl == GPU, gloo == CPU - +world_size: 1, 2, 4, 8 # == total number of workers to use - +thread_count: 1, 2, 4, 8 + # Type of Torch distributed backend (valid options: "nccl", "gloo"). + +backend: nccl + # Number of workers. + +world_size: 4 + # Number of threads to use for saving the checkpoints. + +thread_count: 4 + # Checkpoint storage location (valid options: "disk", "s3"). +checkpoint.storage: disk, s3 diff --git a/s3torchbenchmarking/conf/lightning_checkpointing.yaml b/s3torchbenchmarking/conf/lightning_checkpointing.yaml index ebeae64c..51b8a04a 100644 --- a/s3torchbenchmarking/conf/lightning_checkpointing.yaml +++ b/s3torchbenchmarking/conf/lightning_checkpointing.yaml @@ -1,12 +1,16 @@ defaults: - - hydra/callbacks: - - collate_results - - aws: - - s3 - - dynamodb # save run results to DynamoDB (see also conf/aws/dynamodb.yaml) -- comment me if not required + - hydra/callbacks/collate_results + - aws/dynamodb # save run results to DynamoDB -- comment me if not required - _self_ +# S3 bucket to use to save checkpoints. +# NOTE: a non-existing bucket will fail the benchmarks. +s3: + region: ??? # e.g., eu-west-1 + uri: ??? # e.g., s3://my-bucket/ +# Number of iterations for "saving" a model's checkpoint. epochs: 5 +# Number of training steps between checkpoints. 
save_one_in: 1 hydra: @@ -15,5 +19,7 @@ hydra: dir: multirun/${hydra.job.config_name}/${now:%Y-%m-%d_%H-%M-%S} sweeper: params: - +model: vit-base, whisper, clip-vit, T0_3B, T0pp + # Short name of a pre-trained model (from Hugging Face), listed in `models.py`. + +model: clip-vit, T0_3B, T0pp + # Checkpoint storage location (valid options: "disk", "s3"). +checkpoint.storage: disk, s3 diff --git a/s3torchbenchmarking/conf/pytorch_checkpointing.yaml b/s3torchbenchmarking/conf/pytorch_checkpointing.yaml index ebeae64c..d8d9136b 100644 --- a/s3torchbenchmarking/conf/pytorch_checkpointing.yaml +++ b/s3torchbenchmarking/conf/pytorch_checkpointing.yaml @@ -1,12 +1,16 @@ defaults: - - hydra/callbacks: - - collate_results - - aws: - - s3 - - dynamodb # save run results to DynamoDB (see also conf/aws/dynamodb.yaml) -- comment me if not required + - hydra/callbacks/collate_results + - aws/dynamodb # save run results to DynamoDB -- comment me if not required - _self_ +# S3 bucket to use to save checkpoints. +# NOTE: a non-existing bucket will fail the benchmarks. +s3: + region: ??? # e.g., eu-west-1 + uri: ??? # e.g., s3://my-bucket/ +# Number of iterations for "saving" a model's checkpoint. epochs: 5 +# Number of training steps between checkpoints. save_one_in: 1 hydra: @@ -15,5 +19,7 @@ hydra: dir: multirun/${hydra.job.config_name}/${now:%Y-%m-%d_%H-%M-%S} sweeper: params: + # Short name of a pre-trained model (from Hugging Face), listed in `models.py`. +model: vit-base, whisper, clip-vit, T0_3B, T0pp + # Checkpoint storage location (valid options: "disk", "s3"). +checkpoint.storage: disk, s3 diff --git a/s3torchbenchmarking/pyproject.toml b/s3torchbenchmarking/pyproject.toml index e3fb993a..9b1a16de 100644 --- a/s3torchbenchmarking/pyproject.toml +++ b/s3torchbenchmarking/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools", "build"] +requires = ["setuptools"] build-backend = "setuptools.build_meta" [project] @@ -9,23 +9,20 @@ description = "Tools to run and compare benchmarks against various PyTorch conne requires-python = ">=3.8,<3.13" readme = "README.md" dependencies = [ - "torch >= 2.0.1, != 2.3.0, != 2.5.0", - "lightning >= 2.0", - "s3torchconnector", - "hydra-core", - "torchdata>=0.6.1, <=0.9.0", # we have dependency on deprecated DataPipes, which were removed in 0.10.0 - "torchvision", - "s3fs>=2024.6.1", - "transformers", - "numpy", - "psutil", - "pynvml", + "s3torchconnector[lightning,dcp]", "boto3", - "prefixed", "click", - "accelerate", + "hydra-core", "pandas", + "pillow", + "prefixed", + "psutil", + "pynvml", "requests", + "s3fs>=2024", # prevents "UserWarning: Your installed version of s3fs is very old" type of warnings + "torchdata<0.10.0", # we have dependency on deprecated DataPipes, which were removed in 0.10.0 + "torchvision", + "transformers", ] [project.optional-dependencies] diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/benchmark_utils.py b/s3torchbenchmarking/src/s3torchbenchmarking/benchmark_utils.py index ad768670..85739ea4 100644 --- a/s3torchbenchmarking/src/s3torchbenchmarking/benchmark_utils.py +++ b/s3torchbenchmarking/src/s3torchbenchmarking/benchmark_utils.py @@ -6,14 +6,12 @@ import time from collections import defaultdict from collections import deque -from dataclasses import dataclass from pathlib import Path -from typing import Dict, Optional, List +from typing import Dict, Optional, List, TypedDict import numpy as np import psutil import torch.cuda -from PIL import Image from pynvml import ( # type: ignore nvmlInit, 
nvmlDeviceGetUtilizationRates, @@ -52,12 +50,12 @@ def summarize(self) -> dict: } -@dataclass -class ExperimentResult: - elapsed_time: float +class ExperimentResult(TypedDict, total=False): + training_duration_s: float + epoch_durations_s: List[float] volume: int - checkpoint_times: Optional[List[float]] = None - utilization: Optional[Dict[str, Distribution]] = None + checkpoint_times: Optional[List[float]] + utilization: Dict[str, Distribution] class ResourceMonitor: @@ -116,23 +114,6 @@ def stop(self): self.monitor_thread.join() -class Transforms: - IMG_TRANSFORMS = v2.Compose( - [ - v2.ToImage(), - v2.ToDtype(torch.uint8, scale=True), - v2.RandomResizedCrop((224, 224), antialias=True), - v2.ToDtype(torch.float32, scale=True), - v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] - ) - - @staticmethod - def transform_image(data): - img = Image.open(data) - return Transforms.IMG_TRANSFORMS(img) - - def build_random_suffix() -> str: """Build a random suffix for use in filepaths or S3 URIs.""" return "".join(random.choices(string.ascii_letters, k=7)) diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/datagen.py b/s3torchbenchmarking/src/s3torchbenchmarking/datagen.py index fa5941f2..be6da48b 100644 --- a/s3torchbenchmarking/src/s3torchbenchmarking/datagen.py +++ b/s3torchbenchmarking/src/s3torchbenchmarking/datagen.py @@ -1,23 +1,25 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # // SPDX-License-Identifier: BSD -import abc + import io +import logging import os import tarfile import time +from abc import ABC, abstractmethod from dataclasses import dataclass from multiprocessing import Queue -from pathlib import Path from threading import Thread, Lock, Barrier -from typing import Any, Callable, Iterator, TypeVar, Tuple, Dict, List, Optional, Union +from typing import Callable, Iterator, TypeVar, Tuple, List, Optional, Union import boto3 # type: ignore import click import numpy as np import prefixed as pr # type: ignore -import yaml from PIL import Image +logger = logging.getLogger(__name__) + @dataclass class LabelledSample: @@ -29,8 +31,8 @@ class Sentinel: pass -class DataGenerator(abc.ABC): - @abc.abstractmethod +class DataGenerator(ABC): + @abstractmethod def create(self, idx_gen: Iterator[int]) -> Iterator[LabelledSample]: pass @@ -128,14 +130,6 @@ def parse_human_readable_bytes( if value is not None: return pr.Float(value.rstrip("b").rstrip("B")) - @staticmethod - def write_dataset_config(disambiguator: str, dataset_cfg: Dict[str, Any]): - current_dir = os.getcwd() - cfg_path = Path(current_dir) / "conf" / "dataset" / f"{disambiguator}.yaml" - with open(cfg_path, "w") as outfile: - yaml.dump(dataset_cfg, outfile, default_flow_style=False) - click.echo(f"Dataset Configuration created at: {cfg_path}") - @staticmethod def validate_image_format(ctx: click.Context, param: str, value: str): supported_formats = set(Image.registered_extensions().values()) @@ -242,13 +236,8 @@ def producer(generator: Iterator, barrier: Barrier, queue: Queue, identifier: in queue.put(Sentinel) -def consumer( - queue: Queue, - activity: Callable[[Union[LabelledSample, Sentinel]], None], - identifier: int, -): +def consumer(queue: Queue, activity: Callable[[Union[LabelledSample, Sentinel]], None]): while True: - # click.echo(f"Consumer running on thread {threading.current_thread().ident}") item: Union[LabelledSample, Sentinel] = queue.get() if item is Sentinel: # add signal back for other consumers @@ -314,9 +303,7 @@ def synthesize_dataset( s3_prefix: str, 
region: str, ): - """ - Synthesizes a dataset that will be used for s3torchbenchmarking and uploads it to an S3 bucket. - """ + """Synthesizes a dataset that will be used for s3torchbenchmarking and uploads it to an S3 bucket.""" num_workers = os.cpu_count() or 1 task_queue: Queue = Queue(num_workers) @@ -348,6 +335,7 @@ def synthesize_dataset( # kick off consumers and producers for worker in [*consumers, *producers]: worker.start() + # wait for all threads to finish. Note: order is important since we wait to drain all pending messages from # producers first. for worker in [*producers, *consumers]: @@ -355,16 +343,6 @@ def synthesize_dataset( fq_key = f"s3://{s3_bucket}/{disambiguator}/" click.echo(f"Dataset uploaded to: {fq_key}") - # generate hydra dataset config file - Utils.write_dataset_config( - disambiguator=disambiguator, - dataset_cfg={ - "prefix_uri": fq_key, - "region": region, - # TODO: extend this when introduce other sharding types - "sharding": "TAR" if shard_size else None, - }, - ) click.echo( f"Configure your experiment by setting the entry:\n\tdataset: {disambiguator}" diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/dataset/benchmark.py b/s3torchbenchmarking/src/s3torchbenchmarking/dataset/benchmark.py index b6602e8a..4e9e9e20 100644 --- a/s3torchbenchmarking/src/s3torchbenchmarking/dataset/benchmark.py +++ b/s3torchbenchmarking/src/s3torchbenchmarking/dataset/benchmark.py @@ -13,6 +13,7 @@ from torch.utils.data import DataLoader, Dataset, default_collate from torchdata.datapipes.utils import StreamWrapper # type: ignore +from s3torchbenchmarking.benchmark_utils import ExperimentResult from s3torchbenchmarking.models import ( Entitlement, ViT, @@ -26,11 +27,16 @@ @hydra.main(version_base=None) def run_experiment(config: DictConfig) -> dict: model = make_model(config) + + fully_qualified_uri = ( + "s3://" + config.s3.bucket.strip("/") + "/" + config.dataset.strip("/") + ) + dataset = make_dataset( kind=config.dataloader.kind, sharding=config.sharding, - prefix_uri=config.prefix_uri, - region=config.region, + prefix_uri=fully_qualified_uri, + region=config.s3.region, load_sample=model.load_sample, num_workers=config.dataloader.num_workers, ) @@ -40,12 +46,13 @@ def run_experiment(config: DictConfig) -> dict: batch_size=config.dataloader.batch_size, ) - result = model.train(dataloader, config.epochs) + result: ExperimentResult = model.train(dataloader, config.epochs) metrics = { - "throughput_mibs": [result.volume / result.elapsed_time], - "elapsed_time_s": [result.elapsed_time], - "utilization": {k: v.summarize() for k, v in result.utilization.items()}, + "throughput_mibs": result["volume"] / result["training_duration_s"], + "training_duration_s": result["training_duration_s"], + "epoch_durations_s": result["epoch_durations_s"], + "utilization": {k: v.summarize() for k, v in result["utilization"].items()}, } return {"metrics": metrics} diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/dcp/README.md b/s3torchbenchmarking/src/s3torchbenchmarking/dcp/README.md index aba26d7e..8dc028a7 100644 --- a/s3torchbenchmarking/src/s3torchbenchmarking/dcp/README.md +++ b/s3torchbenchmarking/src/s3torchbenchmarking/dcp/README.md @@ -1,165 +1,24 @@ ## PyTorch's Distributed Checkpoint (DCP) benchmarks -The `dcp` Python package holds all the logic to execute benchmarks for [PyTorch's Distributed Checkpointing][DCP] -feature against the `s3torchconnector` library. 
+The `dcp` Python package provides a suite of benchmarks designed to evaluate and measure the performance +of [PyTorch's Distributed Checkpointing (DCP)][DCP] feature in comparison to the `s3torchconnector` library. ### Purpose -These benchmarks are designed to: +These benchmarks focus on testing the "save" mechanism of PyTorch DCP (`torch.distributed.checkpoint.save`). The primary +objectives are to evaluate the `s3torchconnector` library's performance against other libraries and local storage +options, by measuring the following metrics: -1. Test the "save" mechanism of PyTorch DCP (`torch.distributed.checkpoint.save`); -2. Compare the performance of the s3torchconnector library against other libraries and local storage; -3. Measure throughput (in MiB/s) and save times (in seconds). - -### Usage - -> [!IMPORTANT] -> The benchmarks are designed to be run on a EC2 instance. - -Install the `s3torchbenchmarking` package with `pip` (see the [root README](../../../README.md) for instructions), -along with the `s3torchconnector[dcp]` extra; once installed, the DCP benchmarks can be run with: - -```shell -$ s3torch-benchmark-dcp -cd conf -cn dcp -``` - -The command must be executed from the package's root, where it can read from the `config/` directory; it will create a -`./multirun/` directory (at the location of execution), and store all benchmark results there. - -> [!WARNING] -> When saving on local disk, consider clearing the `path` specified in your config between runs to prevent disk space -> issues. - -#### Potential caveats - -If you encounter the following errors during installation, try the associated command: - -**Error**: - -``` -RuntimeError: Failed to import transformers.models.vit.modeling_vit because of the following error (look up to see its traceback): -operator torchvision::nms does not exist -``` - -**Try**: - -```shell -$ conda install -y pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia -``` - -**Error**: - -``` -TypeError: canonicalize_version() got an unexpected keyword argument 'strip_trailing_zero' -``` - -**Try**: - -```shell -$ pip install "setuptools<71" -``` +- Checkpoint saving throughput (in MiB/s); +- Checkpoint "corrected" save durations (in seconds), which exclude the influence of model load duration on the device. ### Configuration -The benchmark runs can be customized using the [`dcp.yaml`](../../../conf/dcp.yaml) file. This section outlines the key -configuration options and their impacts. - -#### Configuration Requirements - -All keys in the `dcp.yaml` file must be defined for a run to execute successfully. - -#### Key Configuration Options - -`epochs` +The benchmark runs can be customized through the [`dcp.yaml`](../../../conf/dcp.yaml) file. -- Specifies the number of iterations for "saving" a model's checkpoint. -- Note: This does not affect model training, as no actual training occurs in these benchmarks. - -`path` - -- Designates the directory for benchmark operations. -- If the specified directory doesn't exist, it will be created automatically. -- For optimal performance using an SSD filesystem, refer to the [`prepare_nvme.sh`](../../../utils/prepare_nvme.sh) - script. - -`hydra.sweeper.params` - -This section allows for multiple benchmark configurations: - -- The benchmark will run sequential jobs for each combination of the specified parameters. -- Available options include: - - `+model`: Choose from pre-trained models listed in [`models.py`](models.py). - - `+backend`: Select `nccl`, `gloo`, or both. 
- - `+world_size`: Defines the number of workers. - - `+thread_count`: Defines the number of threads to use for saving the checkpoints. - - `+checkpoint.storage`: Choose `s3`, `disk`, or both. - -#### Example Configuration - -```yaml -s3: - region: eu-west-1 - uri: s3://my-bucket -epochs: 3 -path: ./nvme/ - -hydra: - mode: MULTIRUN - sweeper: - params: - +model: vit-base,T0_3B - +backend: nccl,gloo - +world_size: 2,4 - +thread_count: 1 - +checkpoint.storage: s3,disk -``` - -This configuration will run benchmarks for all combinations of the specified models, backends, world sizes, and storage -options, totaling 16 (2×2×2×1×2) different benchmark scenarios. - -### Important notes - -- The benchmarks may take some time to complete, depending on the hardware and network configuration. -- For optimal results, it is recommended to run the benchmarks on a dedicated EC2 instance without other - resource-intensive processes. -- Ensure the specified S3 bucket exists in the given region and the EC2 user/role has read+write permissions. - -### Results - -Benchmark results are organized as follows: - -```shell -multirun/ -└── YYYY-MM-DD - └── HH-MM-SS - ├── 0 - │ ├── benchmark.log - │ └── results_small_nccl_2_2_s3.json - ├── 1 - │ ├── benchmark.log - │ └── results_small_nccl_2_2_disk.json - ├── 2 - │ ├── benchmark.log - │ └── results_small_nccl_4_2_s3.json - ├── 3 - │ ├── benchmark.log - │ └── results_small_nccl_4_2_disk.json - └── multirun.yaml -``` - -Each run creates a timestamped subdirectory. The `./multirun/` directory is managed by [Hydra](https://hydra.cc/). - -Result file names reflect the parameter combinations, e.g., - -``` -+model: vit-base -+backend: nccl -+world_size: 2 -+thread_count: 1 -+checkpoint.storage: s3 -``` - -will produce the file `results_vit-base_nccl_2_1_s3.json` (respecting parameters declaration order). +> [!IMPORTANT] +> A `+path` option is passed to the running script ([`run_dcp_benchmarks.sh`](../../../utils/run_dcp_benchmarks.sh)), +> and will be used only if `checkpoint.storage` key includes `disk`. 
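For orientation, the sketch below shows the kind of save path these benchmarks time, writing one checkpoint to local disk and one to S3. It is illustrative only: the region and bucket are placeholders, and the `S3StorageWriter(region, s3_uri)` call is assumed from the connector's documentation, so check it against your installed `s3torchconnector` version.

```python
# Illustrative sketch of the "save" path measured by these benchmarks, for a
# single worker. Assumptions (not taken from this repository): the bucket name
# and region are placeholders, and S3StorageWriter(region, s3_uri) is assumed
# from the s3torchconnector documentation; verify against your installed version.
import os

import torch
import torch.distributed as dist
import torch.distributed.checkpoint as dcp
from torch.distributed.checkpoint import FileSystemWriter

from s3torchconnector.dcp import S3StorageWriter

# Single-process "distributed" setup, so the example runs without torchrun.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

model = torch.nn.Linear(256, 256)  # stand-in for the benchmarked models
state_dict = {"model": model.state_dict()}

# Local-disk baseline (`+checkpoint.storage: disk`), e.g. on the NVMe mount.
dcp.save(state_dict, storage_writer=FileSystemWriter("./nvme/checkpoints"))

# S3 variant (`+checkpoint.storage: s3`).
dcp.save(
    state_dict,
    storage_writer=S3StorageWriter("eu-west-1", "s3://my-bucket/checkpoints/"),
)

dist.destroy_process_group()
```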
### References @@ -168,3 +27,5 @@ will produce the file `results_vit-base_nccl_2_1_s3.json` (respecting parameters - https://pytorch.org/tutorials/intermediate/ddp_tutorial.html [DCP]: https://pytorch.org/docs/stable/distributed.checkpoint.html + +[multirun]: https://hydra.cc/docs/tutorials/basic/running_your_app/multi-run/ diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/hydra_callback.py b/s3torchbenchmarking/src/s3torchbenchmarking/hydra_callback.py index fa09f1b8..caffb3f9 100644 --- a/s3torchbenchmarking/src/s3torchbenchmarking/hydra_callback.py +++ b/s3torchbenchmarking/src/s3torchbenchmarking/hydra_callback.py @@ -154,4 +154,4 @@ def _write_to_dynamodb(region: str, table_name: str, run: RunResults) -> None: table.put_item(Item=run_json) logger.info("✅ Put item into table successfully") except ClientError: - logger.error("Couldn't put item into table %s", table, exc_info=True) + logger.error("❌ Couldn't put item into table %s", table, exc_info=True) diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/models.py b/s3torchbenchmarking/src/s3torchbenchmarking/models.py index 577e0783..dce00bce 100644 --- a/s3torchbenchmarking/src/s3torchbenchmarking/models.py +++ b/s3torchbenchmarking/src/s3torchbenchmarking/models.py @@ -7,7 +7,8 @@ from abc import ABC, abstractmethod from functools import cached_property from io import IOBase -from typing import Optional, Any, Tuple, Union, Callable +from time import perf_counter +from typing import Optional, Any, Tuple, Union, Callable, List import lightning as L import torch @@ -122,11 +123,15 @@ def train_batch(self, batch_idx: int, data, target) -> Optional[Any]: def train(self, dataloader: DataLoader, epochs: int) -> ExperimentResult: """Train the model using given dataloader for number of epochs""" + + epoch_durations_s: List[float] = [] + with ResourceMonitor() as monitor: num_samples = 0 checkpoint_times = [] - start_time = time.perf_counter() + begin_training = perf_counter() for epoch in range(epochs): + begin_epoch = time.perf_counter() logger.info("Epoch #%i/%i", epoch, epochs - 1) for batch_idx, (data, target) in enumerate(dataloader): logger.debug("Batch #%i", batch_idx) @@ -134,14 +139,16 @@ def train(self, dataloader: DataLoader, epochs: int) -> ExperimentResult: num_samples += len(data) if result: checkpoint_times.append(result) - training_time = time.perf_counter() - start_time + epoch_durations_s.append(time.perf_counter() - begin_epoch) + training_duration_s = time.perf_counter() - begin_training - return ExperimentResult( - elapsed_time=training_time, - volume=num_samples, - checkpoint_times=checkpoint_times, - utilization=monitor.resource_data, - ) + return { + "training_duration_s": training_duration_s, + "epoch_durations_s": epoch_durations_s, + "volume": num_samples, + "checkpoint_times": checkpoint_times, + "utilization": monitor.resource_data, + } @abstractmethod def save(self, **kwargs): @@ -313,12 +320,12 @@ def train(self, dataloader: DataLoader, epochs: int) -> ExperimentResult: end_time = time.perf_counter() training_time = end_time - start_time - return ExperimentResult( - elapsed_time=training_time, - volume=sample_counting_cb.count, - checkpoint_times=profiling_checkpointer.save_times, - utilization=monitor.resource_data, - ) + return { + "training_duration_s": training_time, + "volume": sample_counting_cb.count, + "checkpoint_times": profiling_checkpointer.save_times, + "utilization": monitor.resource_data, + } def save(self, **kwargs): raise NotImplementedError( diff --git 
a/s3torchbenchmarking/utils/prepare_and_run_benchmark.sh b/s3torchbenchmarking/utils/prepare_and_run_benchmark.sh deleted file mode 100755 index 69edfdcf..00000000 --- a/s3torchbenchmarking/utils/prepare_and_run_benchmark.sh +++ /dev/null @@ -1,15 +0,0 @@ -DATALOADER=$1 -PATH_TO_STORE_DATASETS=$2 -BUCKET_NAME=$3 -REGION_NAME=$4 -RESULTS_BUCKET_NAME=$5 -RESULTS_REGION_NAME=$6 -RESULTS_PREFIX=$7 - -datasets=("100k_496x387_images_4Mb_shards" "100k_496x387_images_8Mb_shards" "100k_496x387_images_16Mb_shards" "100k_496x387_images_32Mb_shards" "100k_496x387_images_64Mb_shards" "100k_496x387_images_128Mb_shards" "100k_496x387_images_256Mb_shards" "10k_496x387_images") - -./utils/generate_datasets_files.sh "${PATH_TO_STORE_DATASETS}" "${BUCKET_NAME}" "${REGION_NAME}" "${datasets[@]}" -./utils/prepare_nvme.sh -rm -r -f ./multirun -./utils/run_dataloading_benchmarks.sh "${DATALOADER}" "${datasets[@]}" -python ./utils/upload_colated_results_to_s3.py "./multirun" "${RESULTS_BUCKET_NAME}" "${RESULTS_PREFIX}" "${DATALOADER}" diff --git a/s3torchbenchmarking/utils/prepare_ec2_instance.sh b/s3torchbenchmarking/utils/prepare_ec2_instance.sh deleted file mode 100755 index 7f530ae1..00000000 --- a/s3torchbenchmarking/utils/prepare_ec2_instance.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -# -# Script to prepare an EC2 instance for PyTorch benchmarks. Like other scripts within this directory, it is assumed -# that this is run from within the "s3-connector-for-pytorch/s3torchbenchmarking" directory. - -set -eou pipefail - -# Sanity check + install Mountpoint for Amazon S3 -if [[ -n $(which yum) ]]; then - sudo yum -y upgrade - - wget https://s3.amazonaws.com/mountpoint-s3-release/latest/x86_64/mount-s3.rpm - sudo yum install -y ./mount-s3.rpm && rm ./mount-s3.rpm -elif [[ -n $(which apt) ]]; then - sudo apt -y upgrade - - wget https://s3.amazonaws.com/mountpoint-s3-release/latest/x86_64/mount-s3.deb - sudo apt install -y ./mount-s3.deb && rm ./mount-s3.deb -fi - -# Install s3torchconnector and s3torchbenchmarking -pip install 's3torchconnector[lightning,dcp]' -pip install -e . diff --git a/s3torchbenchmarking/utils/prepare_nvme.sh b/s3torchbenchmarking/utils/prepare_nvme.sh index c8882024..3c688db0 100755 --- a/s3torchbenchmarking/utils/prepare_nvme.sh +++ b/s3torchbenchmarking/utils/prepare_nvme.sh @@ -1,16 +1,20 @@ #!/usr/bin/env bash # -# Mount an NVMe drive (by default, at `./nvme/`) relative to where this script is run. If a drive is already mounted at -# the specified location, clear its content. +# Mount an NVMe drive (by default, at `./nvme/`). Script assumes that it is run on a DLAMI-based EC2 instance. nvme_dir=${1:-"./nvme/"} # default value if ! 
mountpoint -q "$nvme_dir"; then rm -rf "$nvme_dir" - sudo mkfs -t xfs /dev/nvme1n1 mkdir -p "$nvme_dir" - sudo mount /dev/nvme1n1 "$nvme_dir" + + if grep -q 'NAME="Amazon Linux"' /etc/os-release; then + sudo mkfs -t xfs /dev/nvme1n1 + sudo mount /dev/nvme1n1 "$nvme_dir" + elif grep -q 'NAME="Ubuntu"' /etc/os-release; then + sudo /opt/aws/dlami/bin/nvme_ephemeral_drives.sh + sudo mount /dev/vg.01/lv_ephemeral "$nvme_dir" + fi + sudo chmod 777 "$nvme_dir" -else - rm -rf "${nvme_dir:?}"/* # https://www.shellcheck.net/wiki/SC2115 fi diff --git a/s3torchbenchmarking/utils/run_benchmarks.sh b/s3torchbenchmarking/utils/run_benchmarks.sh index 124a47e9..fd141182 100755 --- a/s3torchbenchmarking/utils/run_benchmarks.sh +++ b/s3torchbenchmarking/utils/run_benchmarks.sh @@ -1,16 +1,23 @@ #!/usr/bin/env bash # -# Template script to run other benchmarks (not to be used directly). +# Template script to run other benchmarks (not meant to be used directly). set -euo pipefail -scenario=$1 # name of the scenario -nvme_dir="./nvme/" # local path for saving checkpoints +while getopts "s:d:" opt; do + case $opt in + s) scenario=$OPTARG ;; # name of the scenario + d) nvme_dir=$OPTARG ;; # mount point dir for saving checkpoints (will use NVMe drive) + *) ;; + esac +done -shift +shift $((OPTIND - 1)) # remove all processed positional arguments from "$@" # Prepare NVMe drive mount -./utils/prepare_nvme.sh "$nvme_dir" +if [[ -n $nvme_dir ]]; then + ./utils/prepare_nvme.sh "$nvme_dir" +fi -# Run benchmarks; will write to DynamoDB table, if specified in the config +# Run benchmarks; will write to DynamoDB table, if specified in the config (in `conf/aws/dynamodb.yaml`) python ./src/s3torchbenchmarking/"$scenario"/benchmark.py -cd conf -cn "$scenario" +path="$nvme_dir" "$@" diff --git a/s3torchbenchmarking/utils/run_checkpoint_benchmarks.sh b/s3torchbenchmarking/utils/run_checkpoint_benchmarks.sh index 471dd2ad..8abb39e1 100755 --- a/s3torchbenchmarking/utils/run_checkpoint_benchmarks.sh +++ b/s3torchbenchmarking/utils/run_checkpoint_benchmarks.sh @@ -2,4 +2,4 @@ # # Run PyTorch Checkpointing benchmarks. -./utils/run_benchmarks.sh pytorch_checkpointing "$@" +./utils/run_benchmarks.sh -s pytorch_checkpointing -d ./nvme/ "$@" diff --git a/s3torchbenchmarking/utils/run_dataloading_benchmarks.sh b/s3torchbenchmarking/utils/run_dataloading_benchmarks.sh deleted file mode 100755 index 509374b0..00000000 --- a/s3torchbenchmarking/utils/run_dataloading_benchmarks.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# s3iterabledataset -# fsspec -# mountpoint -# mountpointcache - -# Check if the list of datasets is provided as an argument -if [ "$#" -lt 2 ]; then - echo "Usage: $0 [dataset2] [dataset3] ..." 
- exit 1 -fi - -DATALOADER=$1 -shift - -# Create an array from the remaining arguments (the datasets) -datasets=("$@") - -# work around for PyTorch's cuda clashing with installed locally from https://github.com/pytorch/pytorch/issues/119989 -unset LD_LIBRARY_PATH - -for dataset in "${datasets[@]}"; do - if [[ "$dataset" == *"shards"* ]]; then - s3torch-benchmark -cd conf -m -cn dataloading_sharded_vit "dataset=$dataset" "dataloader=$DATALOADER" - s3torch-benchmark -cd conf -m -cn dataloading_sharded_ent "dataset=$dataset" "dataloader=$DATALOADER" - else - s3torch-benchmark -cd conf -m -cn dataloading_unsharded_1epochs "dataset=$dataset" "dataloader=$DATALOADER" - s3torch-benchmark -cd conf -m -cn dataloading_unsharded_vit_10epochs "dataset=$dataset" "dataloader=$DATALOADER" - s3torch-benchmark -cd conf -m -cn dataloading_unsharded_ent_10epochs "dataset=$dataset" "dataloader=$DATALOADER" - fi -done diff --git a/s3torchbenchmarking/utils/run_dataset_benchmarks.sh b/s3torchbenchmarking/utils/run_dataset_benchmarks.sh index e9bb3983..fc508b61 100755 --- a/s3torchbenchmarking/utils/run_dataset_benchmarks.sh +++ b/s3torchbenchmarking/utils/run_dataset_benchmarks.sh @@ -2,5 +2,4 @@ # # Run dataset benchmarks. -# TODO: see if it can reuse the `run_benchmarks.sh` script template here -python ./src/s3torchbenchmarking/dataset/benchmark.py -cd conf -cn dataset "$@" +./utils/run_benchmarks.sh -s dataset -d ./nvme/ "$@" diff --git a/s3torchbenchmarking/utils/run_dcp_benchmarks.sh b/s3torchbenchmarking/utils/run_dcp_benchmarks.sh index 875b8384..d40cfb14 100755 --- a/s3torchbenchmarking/utils/run_dcp_benchmarks.sh +++ b/s3torchbenchmarking/utils/run_dcp_benchmarks.sh @@ -2,4 +2,4 @@ # # Run PyTorch’s Distributed Checkpointing (DCP) benchmarks. -./utils/run_benchmarks.sh dcp "$@" +./utils/run_benchmarks.sh -s dcp -d ./nvme/ "$@" diff --git a/s3torchbenchmarking/utils/run_lightning_benchmarks.sh b/s3torchbenchmarking/utils/run_lightning_benchmarks.sh index 60a8059e..ef285439 100755 --- a/s3torchbenchmarking/utils/run_lightning_benchmarks.sh +++ b/s3torchbenchmarking/utils/run_lightning_benchmarks.sh @@ -2,4 +2,4 @@ # # Run PyTorch Lightning Checkpointing benchmarks. 
-./utils/run_benchmarks.sh lightning_checkpointing "$@" +./utils/run_benchmarks.sh -s lightning_checkpointing -d ./nvme/ "$@" diff --git a/s3torchbenchmarking/utils/upload_colated_results_to_s3.py b/s3torchbenchmarking/utils/upload_colated_results_to_s3.py deleted file mode 100644 index 27661fd7..00000000 --- a/s3torchbenchmarking/utils/upload_colated_results_to_s3.py +++ /dev/null @@ -1,50 +0,0 @@ -import os -import boto3 -from botocore.exceptions import ClientError -import sys - -s3_client = boto3.client("s3") - - -def upload_file_to_s3(local_file_path: str, bucket_name: str, s3_file_key: str) -> None: - try: - s3_client.upload_file(local_file_path, bucket_name, s3_file_key) - print(f"Uploaded {local_file_path} to {bucket_name}/{s3_file_key}") - except ClientError as e: - print(f"Error uploading {local_file_path} to {bucket_name}/{s3_file_key}: {e}") - - -def traverse_folders( - folder_path: str, bucket_name: str, prefix: str, dataloader: str -) -> None: - for root, _, files in os.walk(folder_path): - for file in files: - if file == "collated_results.json": - local_file_path = os.path.join(root, file) - parent_folder = os.path.basename(os.path.dirname(local_file_path)) - s3_file_key = f"{prefix}/{dataloader}_{parent_folder}_{file}" - print(f"Uploading {local_file_path} to {bucket_name}/{s3_file_key}") - upload_file_to_s3(local_file_path, bucket_name, s3_file_key) - - -if __name__ == "__main__": - if len(sys.argv) != 5: - print("Usage: python script.py ROOT_FOLDER BUCKET_NAME FOLDER_PREFIX DS_PREFIX") - print( - "Example: python script.py ./multirun pytorch-benchmarks-results 20240810 s3iterabledataset" - ) - print("Note: ROOT_FOLDER is the root folder where the results are stored") - print( - "Note: BUCKET_NAME is the S3 bucket name where the results will be uploaded" - ) - print( - "Note: FOLDER_PREFIX is the prefix for the folder where the results are stored" - ) - print("Note: DS_PREFIX is the prefix for the dataset loader") - sys.exit(1) - - ROOT_FOLDER = sys.argv[1] - BUCKET_NAME = sys.argv[2] - FOLDER_PREFIX = sys.argv[3] - DS_PREFIX = sys.argv[4] - traverse_folders(ROOT_FOLDER, BUCKET_NAME, FOLDER_PREFIX, DS_PREFIX)