From 58de12a24ebff2c3399ebdba57a466e8c92646f4 Mon Sep 17 00:00:00 2001 From: Matthieu Dufour Date: Fri, 10 Jan 2025 16:13:46 +0000 Subject: [PATCH] chore(benchmarks): tidy up benchmark (#292) Delete leftover script files. Add minor change to Jupyter Notebook (`"ec2_metadata"` key in results table). Simplify pyproject.toml dependencies list. Change some example parameters in the Hydra config files, add clarifying comments. Rework the READMEs. Tune the utils/prepare_nvme.sh to work for both Amazon Linux and Ubuntu EC2 instances. Update global .gitignore. Delete utils/prepare_ec2_instance.sh, and add its content to the README. For dataset scenario, add training time measurement around epochs. Minor Python code improvements. --- .gitignore | 6 +- s3torchbenchmarking/README.md | 255 +++++++++--------- .../benchmark_results_aggregator.ipynb | 25 +- s3torchbenchmarking/conf/aws/s3.yaml | 5 - s3torchbenchmarking/conf/dataset.yaml | 34 ++- s3torchbenchmarking/conf/dcp.yaml | 25 +- .../conf/lightning_checkpointing.yaml | 18 +- .../conf/pytorch_checkpointing.yaml | 16 +- s3torchbenchmarking/pyproject.toml | 25 +- .../s3torchbenchmarking/benchmark_utils.py | 31 +-- .../src/s3torchbenchmarking/datagen.py | 44 +-- .../s3torchbenchmarking/dataset/benchmark.py | 19 +- .../src/s3torchbenchmarking/dcp/README.md | 165 +----------- .../src/s3torchbenchmarking/hydra_callback.py | 2 +- .../src/s3torchbenchmarking/models.py | 37 +-- .../utils/prepare_and_run_benchmark.sh | 15 -- .../utils/prepare_ec2_instance.sh | 23 -- s3torchbenchmarking/utils/prepare_nvme.sh | 16 +- s3torchbenchmarking/utils/run_benchmarks.sh | 19 +- .../utils/run_checkpoint_benchmarks.sh | 2 +- .../utils/run_dataloading_benchmarks.sh | 32 --- .../utils/run_dataset_benchmarks.sh | 3 +- .../utils/run_dcp_benchmarks.sh | 2 +- .../utils/run_lightning_benchmarks.sh | 2 +- .../utils/upload_colated_results_to_s3.py | 50 ---- 25 files changed, 315 insertions(+), 556 deletions(-) delete mode 100644 s3torchbenchmarking/conf/aws/s3.yaml delete mode 100755 s3torchbenchmarking/utils/prepare_and_run_benchmark.sh delete mode 100755 s3torchbenchmarking/utils/prepare_ec2_instance.sh delete mode 100755 s3torchbenchmarking/utils/run_dataloading_benchmarks.sh delete mode 100644 s3torchbenchmarking/utils/upload_colated_results_to_s3.py diff --git a/.gitignore b/.gitignore index 6b9126ed..e6b9f81d 100644 --- a/.gitignore +++ b/.gitignore @@ -60,8 +60,10 @@ venv.bak/ .dmypy.json dmypy.json -# Hydra (https://hydra.cc/) -multirun/ +# PyTorch benchmarks: Hydra, NVMe directory, and CSV results +s3torchbenchmarking/**/multirun/ +s3torchbenchmarking/**/nvme/ +s3torchbenchmarking/**/*.csv # Rust .gitignore (https://github.com/github/gitignore/blob/main/Rust.gitignore) -- cherry-picked ###################### diff --git a/s3torchbenchmarking/README.md b/s3torchbenchmarking/README.md index ddae9a4c..5a07164d 100644 --- a/s3torchbenchmarking/README.md +++ b/s3torchbenchmarking/README.md @@ -1,179 +1,180 @@ -# Benchmarking the S3 Connector for PyTorch +# s3torchbenchmarking -This directory contains a modular component for the experimental evaluation of the performance of the Amazon S3 Connector for -PyTorch. -The goal of this component is to be able to run performance benchmarks for PyTorch connectors in an easy-to-reproduce and -extensible fashion. This way, users can experiment with different settings and arrive at the optimal configuration for their workloads, -before committing to a setup. 
+This Python package houses a set of benchmarks for experimentally evaluating the performance of +the **Amazon S3 Connector for PyTorch** library. -By managing complex configuration space with [Hydra](https://hydra.cc/) we are able to define modular configuration pieces mapped to various -stages of the training pipeline. This approach allows one to mix and match configurations and measure the performance -impact to the end-to-end training process. +With the use of the [Hydra](https://hydra.cc/) framework, we are able to define modular configuration pieces mapped to +various stages of the training pipeline. This approach allows one to mix and match configurations and measure the +performance impact to the end-to-end training process. -There are **three scenarios** available: +**Four scenarios** are available: -- **Data loading benchmarks**: measure our connector against other Dataset classes (i.e., classes used to fetch and - index actual datasets); all save to S3. -- **PyTorch Lightning Checkpointing benchmarks**: measure our connector, using the PyTorch Lightning framework, against - the latter default implementation of checkpointing. -- **PyTorch’s Distributed Checkpointing (DCP) benchmarks**: measure our connector against PyTorch default distributed - checkpointing mechanism — learn more in [this dedicated README](src/s3torchbenchmarking/dcp/README.md). +1. **Dataset benchmarks** + - Compare our connector against other Dataset classes + - All scenarios save data to S3 + - Measure performance in data fetching and indexing +2. **PyTorch's Distributed Checkpointing (DCP) benchmarks** + - Assess our connector's performance versus PyTorch's default distributed checkpointing mechanism + - For detailed information, refer to the [dedicated DCP `README`](src/s3torchbenchmarking/dcp/README.md) +3. **PyTorch Lightning Checkpointing benchmarks** + - Evaluate our connector within the PyTorch Lightning framework + - Compare against PyTorch Lightning's default checkpointing implementation +4. **PyTorch Checkpointing benchmarks** + - TODO! -## Getting Started +## Getting started -The benchmarking code is available within the `src/s3torchbenchmarking` module. +The benchmarking code is located in the `src/s3torchbenchmarking` module. The scenarios are designed to be run on an EC2 +instance with one (or many) GPU(s). -The tests can be run locally, or you can launch an EC2 instance with a GPU (we used a [g5.2xlarge][g5.2xlarge]), -choosing the [AWS Deep Learning AMI GPU PyTorch 2.5 (Ubuntu 22.04)][dl-ami] as your AMI. +### EC2 instance setup (recommended) -First, activate the Conda env within this machine by running: +From your EC2 AWS Console, launch an instance with one (or many) GPU(s) (e.g., G5 instance type); we recommend using +an [AWS Deep Learning AMI (DLAMI)][dlami], such +as [AWS Deep Learning AMI GPU PyTorch 2.5 (Amazon Linux 2023)][dlami-pytorch]. + +> [!NOTE] +> Some benchmarks can be long-running. To avoid the shortcomings around expired AWS tokens, we recommend attaching a +> role to your EC2 instance with: +> +> - Full access to S3 +> - (Optional) Full access to DynamoDB — for writing run results +> +> See the [Running the benchmarks](#running-the-benchmarks) section for more details. + +For optimal results, it is recommended to run the benchmarks on a dedicated EC2 instance _without_ other +resource-intensive processes. 
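Before kicking off long-running scenarios, it is worth confirming that the instance's GPU(s) are actually visible to PyTorch (once the environment described in the next section is set up). The snippet below is a minimal, illustrative sanity check; it only assumes that `torch` is importable:

```python
# Minimal GPU visibility check (illustrative; assumes `torch` is importable).
import torch

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No CUDA device visible: NCCL-based scenarios will not run.")
```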
+ +### Creating a new Conda environment (env) + +> [!WARNING] +> While some DLAMIs provide a pre-configured Conda env (`source activate pytorch`), we have observed compatibility +> issues with the latest PyTorch versions (2.5.X) at the time of writing. We recommend creating a new one from scratch +> as detailed below. + +Once your instance is running, `ssh` into it, and create a new Conda env: ```shell -source activate pytorch +conda create -n pytorch-benchmarks python=3.12 +conda init ``` -If running locally you can optionally configure a Python venv: +Then, activate it (_you will need to log out and in again in the meantime, as signaled by `conda init`_): ```shell -python -m venv -source /bin/activate +source activate pytorch-benchmarks ``` -Then, `cd` to the `s3torchbenchmarking` directory, and run the `utils/prepare_ec2_instance.sh` script: the latter will -take care of updating the instance's packages (through either `yum` or `apt`), install Mountpoint for Amazon S3, and -install the required Python packages. +Finally, from within this directory, install the `s3torchbenchmarking` module: + +```shell +# `-e` so local modifications get picked up, if any +pip install -e . +``` > [!NOTE] -> Some errors may arise while trying to run the benchmarks; below are some workarounds to execute in such cases. - -- Error `RuntimeError: operator torchvision::nms does not exist` while trying the run the benchmarks: - ```shell - conda install -y pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia - ``` -- Error `TypeError: canonicalize_version() got an unexpected keyword argument 'strip_trailing_zero'` while trying to - install `s3torchbenchmarking` package: - ```shell - pip install "setuptools<71" - ``` +> For some scenarios, you may be required to install the [Mountpoint for Amazon S3][mountpoint-s3] file client: please +> refer to their README for instructions. ### (Pre-requisite) Configure AWS Credentials -The commands provided below (`datagen.py`, `benchmark.py`) rely on the -standard [AWS credential discovery mechanism][credentials]. Supplement the command as necessary to ensure the AWS -credentials are made available to the process, e.g., by setting the `AWS_PROFILE` environment variable. +The benchmarks and other commands provided below rely on the standard [AWS credential discovery mechanism][credentials]. +Supplement the command as necessary to ensure the AWS credentials are made available to the process, e.g., by setting +the `AWS_PROFILE` environment variable. -### Configuring the dataset +### Creating a dataset (optional; for "dataset" benchmarks only) -_Note: This is a one-time setup for each dataset configuration. The dataset configuration files, once created locally -and can be used in subsequent benchmarks, as long as the dataset on the S3 bucket is intact._ +You can use your own dataset for the benchmarks, or you can generate one on-the-fly using the `s3torch-datagen` command. -If you already have a dataset, you only need upload it to an S3 bucket and set up a YAML file under -`./conf/dataset/` in the following format: - -```yaml -# custom_dataset.yaml +Here are some sample dataset configurations that we ran our benchmarks against: -prefix_uri: s3://// -region: -sharding: TAR|null # if the samples have been packed into TAR archives. 
+```shell
+s3torch-datagen -n 100k --shard-size 128MiB --s3-bucket my-bucket --region us-east-1
 ```
 
-This dataset can then be referenced in an experiment with an entry like `dataset: custom_dataset` (note that we're
-omitting the *.yaml extension). This will result in running the benchmarks against this dataset. Some experiments have
-already been defined for reference - see `./conf/dataloading.yaml` or `./conf/sharding.yaml`.
+## Running the benchmarks
 
-_Note: Ensure the bucket is in the same region as the EC2 instance to eliminate network latency effects in your
-measurements._
+You can run the different benchmarks by editing their corresponding config files, then running one of these shell
+scripts (specifically, you must provide a value for all keys marked with `???`):
 
-Alternatively, you can use the `s3torch-datagen` command to procedurally generate an image dataset and upload it to
-Amazon S3. The script also creates a Hydra configuration file at the appropriate path.
+```shell
+# Dataset benchmarks
+vim ./conf/dataset.yaml            # 1. edit config
+./utils/run_dataset_benchmarks.sh  # 2. run scenario
 
-```
-$ s3torch-datagen --help
-Usage: s3torch-datagen [OPTIONS]
-
-  Synthesizes a dataset that will be used for benchmarking and uploads it to
-  an S3 bucket.
-
-Options:
-  -n, --num-samples FLOAT  Number of samples to generate. Can be supplied as
-                           an IEC or SI prefix. Eg: 1k, 2M. Note: these are
-                           case-sensitive notations.  [default: 1k]
-  --resolution TEXT        Resolution written in 'widthxheight' format
-                           [default: 496x387]
-  --shard-size TEXT        If supplied, the images are grouped into tar files
-                           of the given size. Size can be supplied as an IEC
-                           or SI prefix. Eg: 16Mib, 4Kb, 1Gib. Note: these are
-                           case-sensitive notations.
-  --s3-bucket TEXT         S3 Bucket name. Note: Ensure the credentials are
-                           made available either through environment variables
-                           or a shared credentials file.  [required]
-  --s3-prefix TEXT         Optional S3 Key prefix where the dataset will be
-                           uploaded. Note: a prefix will be autogenerated. eg:
-                           s3:///1k_256x256_16Mib_sharded/
-  --region TEXT            Region where the S3 bucket is hosted.  [default:
-                           us-east-1]
-  --help                   Show this message and exit.
+# PyTorch Checkpointing benchmarks
+vim ./conf/pytorch_checkpointing.yaml  # 1. edit config
+./utils/run_checkpoint_benchmarks.sh   # 2. run scenario
+
+# PyTorch Lightning Checkpointing benchmarks
+vim ./conf/lightning_checkpointing.yaml  # 1. edit config
+./utils/run_lightning_benchmarks.sh      # 2. run scenario
+
+# PyTorch’s Distributed Checkpointing (DCP) benchmarks
+vim ./conf/dcp.yaml            # 1. edit config
+./utils/run_dcp_benchmarks.sh  # 2. run scenario
 ```
 
-Here are some sample dataset configurations that we ran our benchmarks against:
+> [!NOTE]
+> Ensure the bucket is in the same region as the EC2 instance to eliminate network latency effects in your
+> measurements.
 
-- `-n 20k --resolution 496x387`
-- `-n 20k --resolution 496x387 --shard-size {4, 8, 16, 32, 64}MiB`
+Each of these scripts relies on Hydra config files, located under the [`conf`](conf) directory. You may edit them as
+you see fit to configure the runs; in particular, the parameters under `hydra.sweeper.params` will create as many jobs
+as the Cartesian product of their values (illustrated below).
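As a concrete illustration of the job expansion (the parameter names and values below simply mirror the examples from `conf/dcp.yaml`), this is how the basic Hydra sweeper turns comma-separated `hydra.sweeper.params` entries into individual jobs:

```python
# Illustration of how comma-separated sweeper params expand into jobs:
# one job per element of the Cartesian product of the value lists.
# The parameter names/values below are examples only (modelled on conf/dcp.yaml).
from itertools import product

params = {
    "+model": ["vit-base", "T0_3B"],
    "+backend": ["nccl"],
    "+world_size": ["4"],
    "+thread_count": ["4"],
    "+checkpoint.storage": ["disk", "s3"],
}

combos = list(product(*params.values()))
print(f"{len(combos)} jobs")  # 2 * 1 * 1 * 1 * 2 = 4 jobs
for i, combo in enumerate(combos):
    print(i, dict(zip(params, combo)))
```

Keeping the product small is the easiest way to keep the total benchmark time under control.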
-Example: +Also, as the scripts pass the inline parameters you give them to Hydra, you may override their behaviors this way: -``` -$ s3torch-datagen -n 20k \ - --resolution 496x387 \ - --shard-size 4MB \ - --s3-bucket swift-benchmark-dataset \ - --region eu-west-2 - -Generating data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1243.50it/s] -Uploading to S3: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 3378.87it/s] -Dataset uploaded to: s3://swift-benchmark-dataset/20k_496x387_images_4MB_shards/ -Dataset Configuration created at: ./conf/dataset/20k_496x387_images_4MB_shards.yaml -Configure your experiment by setting the entry: - dataset: 20k_496x387_images_4MB_shards -Alternatively, you can run specify it on the cmd-line when running the benchmark like so: - s3torch-benchmark -cd conf -m -cn 'dataset=20k_496x387_images_4MB_shards' +```shell +./utils/run_dataset_benchmarks.sh +disambiguator=some_key ``` ---- +## Getting the results -Finally, once the dataset and other configuration modules have been defined, you can kick off the benchmark by running: +### Scenario organization -```shell -# For data loading benchmarks: -$ . utils/run_dataset_benchmarks.sh +Benchmark results are organized as follows, inside a default `./multirun` directory (e.g.): + +``` +./multirun +└── dataset + └── 2024-12-20_13-42-27 + ├── 0 + │ ├── benchmark.log + │ └── job_results.json + ├── 1 + │ ├── benchmark.log + │ └── job_resutls.json + ├── multirun.yaml + └── run_results.json +``` -# For PyTorch Checkpointing benchmarks: -$ . utils/run_checkpoints_benchmarks.sh +Scenarios are organized at the top level, each in its own directory named after the scenario (e.g., `dataset`). Within +each scenario directory, you'll find individual run directories, automatically named by Hydra using the creation +timestamp (e.g., `2024-12-20_13-42-27`). -# For PyTorch Lightning Checkpointing benchmarks: -$ . utils/run_lighning_benchmarks.sh +Each run directory contains job subdirectories (e.g., `0`, `1`, etc.), corresponding to a specific subset of parameters. -# For PyTorch’s Distributed Checkpointing (DCP) benchmarks: -$ . utils/run_dcp_benchmarks.sh -``` +### Experiment reporting -_Note: For overriding any other benchmark parameters, see [Hydra Overrides][hydra-overrides]. You can also run -`s3torch-benchmark --hydra-help` to learn more._ +Experiments will report various metrics, such as throughput and processed time — the exact types vary per scenarios. +Results are stored in two locations: -Experiments will report various metrics, like throughput, processed time, etc. The results for individual jobs and runs -(one run will contain 1 to N jobs) will be written out to dedicated files, respectively `job_results.json` and -`run_results.json`, within their corresponding output directory (see the YAML config files). +1. In the job subdirectories: + - `benchmark.log`: Individual job logs (collected by Hydra) + - `job_results.json`: Individual job results +2. In the run directory: + - `multirun.yaml`: Global Hydra configuration for the run + - `run_results.json`: Comprehensive run results, including additional metadata -## Next Steps +If a DynamoDB table is defined in the [`conf/aws/dynamodb.yaml`](conf/aws/dynamodb.yaml) configuration file, results +will also be written to the specified table. 
-- Add more models (LLMs?) to monitor training performance. -- Support plugging in user-defined models and automatic discovery of the same. +[dlami]: https://docs.aws.amazon.com/dlami/ -[g5.2xlarge]: https://aws.amazon.com/ec2/instance-types/g5/ +[dlami-pytorch]: https://aws.amazon.com/releasenotes/aws-deep-learning-ami-gpu-pytorch-2-5-amazon-linux-2023/ -[dl-ami]: https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html +[mountpoint-s3]: https://github.com/awslabs/mountpoint-s3/tree/main [credentials]: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html diff --git a/s3torchbenchmarking/benchmark_results_aggregator.ipynb b/s3torchbenchmarking/benchmark_results_aggregator.ipynb index c06fb9c1..315be844 100644 --- a/s3torchbenchmarking/benchmark_results_aggregator.ipynb +++ b/s3torchbenchmarking/benchmark_results_aggregator.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "6522fc8a931ffbc3", "metadata": { "ExecuteTime": { @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "a371fc9062af6126", "metadata": { "ExecuteTime": { @@ -76,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "id": "e14b9efad6ae3ad6", "metadata": { "ExecuteTime": { @@ -127,6 +127,7 @@ " ),\n", " **metrics_averaged,\n", " \"config\": job_result[\"config\"],\n", + " \"ec2_metadata\": run_result[\"ec2_metadata\"],\n", " }\n", " rows.append(row)\n", "\n", @@ -143,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "id": "be008fb6acf09055", "metadata": { "ExecuteTime": { @@ -170,14 +171,18 @@ "source": [ "import pandas as pd\n", "\n", - "_data = transform(_run_results)\n", - "_table = pd.json_normalize(_data).set_index(\"version\")\n", + "_table = pd.DataFrame()\n", + "\n", + "if _run_results:\n", + " _data = transform(_run_results)\n", + " _table = pd.json_normalize(_data).set_index(\"version\")\n", + "\n", "_table" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "b4eed2752e6add17", "metadata": { "ExecuteTime": { @@ -191,7 +196,11 @@ "import random\n", "\n", "_suffix = \"\".join(random.choices(string.ascii_letters, k=5))\n", - "_table.to_csv(f\"benchmark_results_{_suffix}.csv\")" + "_filename = f\"benchmark_results_{_suffix}.csv\"\n", + "\n", + "if not _table.empty:\n", + " _table.to_csv(_filename)\n", + " print(f\"CSV written to {_filename}\")" ] } ], diff --git a/s3torchbenchmarking/conf/aws/s3.yaml b/s3torchbenchmarking/conf/aws/s3.yaml deleted file mode 100644 index 4340abf2..00000000 --- a/s3torchbenchmarking/conf/aws/s3.yaml +++ /dev/null @@ -1,5 +0,0 @@ -# @package _global_ -# S3 config; used for checkpoint storage -s3: - region: ??? - uri: ??? diff --git a/s3torchbenchmarking/conf/dataset.yaml b/s3torchbenchmarking/conf/dataset.yaml index 83ed5e07..69f65b43 100644 --- a/s3torchbenchmarking/conf/dataset.yaml +++ b/s3torchbenchmarking/conf/dataset.yaml @@ -1,19 +1,25 @@ defaults: - - hydra/callbacks: - - collate_results - - aws: - - s3 - - dynamodb # save run results to DynamoDB (see also conf/aws/dynamodb.yaml) -- comment me if not required + - hydra/callbacks/collate_results + - aws/dynamodb # save run results to DynamoDB -- comment me if not required - _self_ -prefix_uri: ??? # where the dataset are stored in S3 -region: ??? -sharding: False -epochs: 1 +# S3 bucket where the dataset is stored. +# NOTE: a non-existing bucket will fail the benchmarks. 
+s3: + region: ??? # e.g., eu-west-1 + bucket: ??? # e.g., my-bucket (*not* an S3 URI) +# Boolean flag to tell whether the dataset is sharded or not. +sharding: True +# Number of iterations for training a model. +epochs: 10 checkpoint: - save_one_in: 25 + # Number of training steps between checkpoints. + save_one_in: 0 + # Checkpoint storage location. destination: disk + # Path for checkpoint saving (local disk or S3 URI). uri: ./nvme/checkpoints/ + # S3 region. region: eu-west-2 hydra: @@ -22,5 +28,9 @@ hydra: dir: multirun/${hydra.job.config_name}/${now:%Y-%m-%d_%H-%M-%S} sweeper: params: - +model: entitlement, vit - +dataloader: s3iterabledataset, s3mapdataset, fsspec, mountpoint, mountpointcache + # Name of a model (valid options: "entitlement", "vit"). + +model: entitlement + # Kind of the dataloader (valid options: "fsspec", "s3iterabledataset", "mountpoint", "mountpointcache"). + +dataloader: fsspec, s3iterabledataset, mountpoint, mountpointcache + # Dataset name (corresponds to the name of a folder in S3); will be used to build an S3 URI + +dataset: 100k_496x387_images diff --git a/s3torchbenchmarking/conf/dcp.yaml b/s3torchbenchmarking/conf/dcp.yaml index 7da5665d..0aa5b3ba 100644 --- a/s3torchbenchmarking/conf/dcp.yaml +++ b/s3torchbenchmarking/conf/dcp.yaml @@ -1,11 +1,15 @@ defaults: - - hydra/callbacks: - - collate_results - - aws: - - s3 - - dynamodb # save run results to DynamoDB (see also conf/aws/dynamodb.yaml) -- comment me if not required + - hydra/callbacks/collate_results + - aws/dynamodb # save run results to DynamoDB -- comment me if not required - _self_ +# S3 bucket to use to save checkpoints. +# NOTE: a non-existing bucket will fail the benchmarks. +s3: + region: ??? # e.g., eu-west-1 + uri: ??? # e.g., s3://my-bucket/ +# Number of iterations for "saving" a model's checkpoint. +# NOTE: this does not affect model training, as no actual training occurs in these benchmarks. epochs: 4 hydra: @@ -14,8 +18,13 @@ hydra: dir: multirun/${hydra.job.config_name}/${now:%Y-%m-%d_%H-%M-%S} sweeper: params: + # Short name of a pre-trained model (from Hugging Face), listed in `models.py`. +model: vit-base, T0_3B - +backend: nccl, gloo # nccl == GPU, gloo == CPU - +world_size: 1, 2, 4, 8 # == total number of workers to use - +thread_count: 1, 2, 4, 8 + # Type of Torch distributed backend (valid options: "nccl", "gloo"). + +backend: nccl + # Number of workers. + +world_size: 4 + # Number of threads to use for saving the checkpoints. + +thread_count: 4 + # Checkpoint storage location (valid options: "disk", "s3"). +checkpoint.storage: disk, s3 diff --git a/s3torchbenchmarking/conf/lightning_checkpointing.yaml b/s3torchbenchmarking/conf/lightning_checkpointing.yaml index ebeae64c..51b8a04a 100644 --- a/s3torchbenchmarking/conf/lightning_checkpointing.yaml +++ b/s3torchbenchmarking/conf/lightning_checkpointing.yaml @@ -1,12 +1,16 @@ defaults: - - hydra/callbacks: - - collate_results - - aws: - - s3 - - dynamodb # save run results to DynamoDB (see also conf/aws/dynamodb.yaml) -- comment me if not required + - hydra/callbacks/collate_results + - aws/dynamodb # save run results to DynamoDB -- comment me if not required - _self_ +# S3 bucket to use to save checkpoints. +# NOTE: a non-existing bucket will fail the benchmarks. +s3: + region: ??? # e.g., eu-west-1 + uri: ??? # e.g., s3://my-bucket/ +# Number of iterations for "saving" a model's checkpoint. epochs: 5 +# Number of training steps between checkpoints. 
save_one_in: 1 hydra: @@ -15,5 +19,7 @@ hydra: dir: multirun/${hydra.job.config_name}/${now:%Y-%m-%d_%H-%M-%S} sweeper: params: - +model: vit-base, whisper, clip-vit, T0_3B, T0pp + # Short name of a pre-trained model (from Hugging Face), listed in `models.py`. + +model: clip-vit, T0_3B, T0pp + # Checkpoint storage location (valid options: "disk", "s3"). +checkpoint.storage: disk, s3 diff --git a/s3torchbenchmarking/conf/pytorch_checkpointing.yaml b/s3torchbenchmarking/conf/pytorch_checkpointing.yaml index ebeae64c..d8d9136b 100644 --- a/s3torchbenchmarking/conf/pytorch_checkpointing.yaml +++ b/s3torchbenchmarking/conf/pytorch_checkpointing.yaml @@ -1,12 +1,16 @@ defaults: - - hydra/callbacks: - - collate_results - - aws: - - s3 - - dynamodb # save run results to DynamoDB (see also conf/aws/dynamodb.yaml) -- comment me if not required + - hydra/callbacks/collate_results + - aws/dynamodb # save run results to DynamoDB -- comment me if not required - _self_ +# S3 bucket to use to save checkpoints. +# NOTE: a non-existing bucket will fail the benchmarks. +s3: + region: ??? # e.g., eu-west-1 + uri: ??? # e.g., s3://my-bucket/ +# Number of iterations for "saving" a model's checkpoint. epochs: 5 +# Number of training steps between checkpoints. save_one_in: 1 hydra: @@ -15,5 +19,7 @@ hydra: dir: multirun/${hydra.job.config_name}/${now:%Y-%m-%d_%H-%M-%S} sweeper: params: + # Short name of a pre-trained model (from Hugging Face), listed in `models.py`. +model: vit-base, whisper, clip-vit, T0_3B, T0pp + # Checkpoint storage location (valid options: "disk", "s3"). +checkpoint.storage: disk, s3 diff --git a/s3torchbenchmarking/pyproject.toml b/s3torchbenchmarking/pyproject.toml index e3fb993a..9b1a16de 100644 --- a/s3torchbenchmarking/pyproject.toml +++ b/s3torchbenchmarking/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools", "build"] +requires = ["setuptools"] build-backend = "setuptools.build_meta" [project] @@ -9,23 +9,20 @@ description = "Tools to run and compare benchmarks against various PyTorch conne requires-python = ">=3.8,<3.13" readme = "README.md" dependencies = [ - "torch >= 2.0.1, != 2.3.0, != 2.5.0", - "lightning >= 2.0", - "s3torchconnector", - "hydra-core", - "torchdata>=0.6.1, <=0.9.0", # we have dependency on deprecated DataPipes, which were removed in 0.10.0 - "torchvision", - "s3fs>=2024.6.1", - "transformers", - "numpy", - "psutil", - "pynvml", + "s3torchconnector[lightning,dcp]", "boto3", - "prefixed", "click", - "accelerate", + "hydra-core", "pandas", + "pillow", + "prefixed", + "psutil", + "pynvml", "requests", + "s3fs>=2024", # prevents "UserWarning: Your installed version of s3fs is very old" type of warnings + "torchdata<0.10.0", # we have dependency on deprecated DataPipes, which were removed in 0.10.0 + "torchvision", + "transformers", ] [project.optional-dependencies] diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/benchmark_utils.py b/s3torchbenchmarking/src/s3torchbenchmarking/benchmark_utils.py index ad768670..85739ea4 100644 --- a/s3torchbenchmarking/src/s3torchbenchmarking/benchmark_utils.py +++ b/s3torchbenchmarking/src/s3torchbenchmarking/benchmark_utils.py @@ -6,14 +6,12 @@ import time from collections import defaultdict from collections import deque -from dataclasses import dataclass from pathlib import Path -from typing import Dict, Optional, List +from typing import Dict, Optional, List, TypedDict import numpy as np import psutil import torch.cuda -from PIL import Image from pynvml import ( # type: ignore nvmlInit, 
nvmlDeviceGetUtilizationRates, @@ -52,12 +50,12 @@ def summarize(self) -> dict: } -@dataclass -class ExperimentResult: - elapsed_time: float +class ExperimentResult(TypedDict, total=False): + training_duration_s: float + epoch_durations_s: List[float] volume: int - checkpoint_times: Optional[List[float]] = None - utilization: Optional[Dict[str, Distribution]] = None + checkpoint_times: Optional[List[float]] + utilization: Dict[str, Distribution] class ResourceMonitor: @@ -116,23 +114,6 @@ def stop(self): self.monitor_thread.join() -class Transforms: - IMG_TRANSFORMS = v2.Compose( - [ - v2.ToImage(), - v2.ToDtype(torch.uint8, scale=True), - v2.RandomResizedCrop((224, 224), antialias=True), - v2.ToDtype(torch.float32, scale=True), - v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] - ) - - @staticmethod - def transform_image(data): - img = Image.open(data) - return Transforms.IMG_TRANSFORMS(img) - - def build_random_suffix() -> str: """Build a random suffix for use in filepaths or S3 URIs.""" return "".join(random.choices(string.ascii_letters, k=7)) diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/datagen.py b/s3torchbenchmarking/src/s3torchbenchmarking/datagen.py index fa5941f2..be6da48b 100644 --- a/s3torchbenchmarking/src/s3torchbenchmarking/datagen.py +++ b/s3torchbenchmarking/src/s3torchbenchmarking/datagen.py @@ -1,23 +1,25 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # // SPDX-License-Identifier: BSD -import abc + import io +import logging import os import tarfile import time +from abc import ABC, abstractmethod from dataclasses import dataclass from multiprocessing import Queue -from pathlib import Path from threading import Thread, Lock, Barrier -from typing import Any, Callable, Iterator, TypeVar, Tuple, Dict, List, Optional, Union +from typing import Callable, Iterator, TypeVar, Tuple, List, Optional, Union import boto3 # type: ignore import click import numpy as np import prefixed as pr # type: ignore -import yaml from PIL import Image +logger = logging.getLogger(__name__) + @dataclass class LabelledSample: @@ -29,8 +31,8 @@ class Sentinel: pass -class DataGenerator(abc.ABC): - @abc.abstractmethod +class DataGenerator(ABC): + @abstractmethod def create(self, idx_gen: Iterator[int]) -> Iterator[LabelledSample]: pass @@ -128,14 +130,6 @@ def parse_human_readable_bytes( if value is not None: return pr.Float(value.rstrip("b").rstrip("B")) - @staticmethod - def write_dataset_config(disambiguator: str, dataset_cfg: Dict[str, Any]): - current_dir = os.getcwd() - cfg_path = Path(current_dir) / "conf" / "dataset" / f"{disambiguator}.yaml" - with open(cfg_path, "w") as outfile: - yaml.dump(dataset_cfg, outfile, default_flow_style=False) - click.echo(f"Dataset Configuration created at: {cfg_path}") - @staticmethod def validate_image_format(ctx: click.Context, param: str, value: str): supported_formats = set(Image.registered_extensions().values()) @@ -242,13 +236,8 @@ def producer(generator: Iterator, barrier: Barrier, queue: Queue, identifier: in queue.put(Sentinel) -def consumer( - queue: Queue, - activity: Callable[[Union[LabelledSample, Sentinel]], None], - identifier: int, -): +def consumer(queue: Queue, activity: Callable[[Union[LabelledSample, Sentinel]], None]): while True: - # click.echo(f"Consumer running on thread {threading.current_thread().ident}") item: Union[LabelledSample, Sentinel] = queue.get() if item is Sentinel: # add signal back for other consumers @@ -314,9 +303,7 @@ def synthesize_dataset( s3_prefix: str, 
region: str, ): - """ - Synthesizes a dataset that will be used for s3torchbenchmarking and uploads it to an S3 bucket. - """ + """Synthesizes a dataset that will be used for s3torchbenchmarking and uploads it to an S3 bucket.""" num_workers = os.cpu_count() or 1 task_queue: Queue = Queue(num_workers) @@ -348,6 +335,7 @@ def synthesize_dataset( # kick off consumers and producers for worker in [*consumers, *producers]: worker.start() + # wait for all threads to finish. Note: order is important since we wait to drain all pending messages from # producers first. for worker in [*producers, *consumers]: @@ -355,16 +343,6 @@ def synthesize_dataset( fq_key = f"s3://{s3_bucket}/{disambiguator}/" click.echo(f"Dataset uploaded to: {fq_key}") - # generate hydra dataset config file - Utils.write_dataset_config( - disambiguator=disambiguator, - dataset_cfg={ - "prefix_uri": fq_key, - "region": region, - # TODO: extend this when introduce other sharding types - "sharding": "TAR" if shard_size else None, - }, - ) click.echo( f"Configure your experiment by setting the entry:\n\tdataset: {disambiguator}" diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/dataset/benchmark.py b/s3torchbenchmarking/src/s3torchbenchmarking/dataset/benchmark.py index b6602e8a..4e9e9e20 100644 --- a/s3torchbenchmarking/src/s3torchbenchmarking/dataset/benchmark.py +++ b/s3torchbenchmarking/src/s3torchbenchmarking/dataset/benchmark.py @@ -13,6 +13,7 @@ from torch.utils.data import DataLoader, Dataset, default_collate from torchdata.datapipes.utils import StreamWrapper # type: ignore +from s3torchbenchmarking.benchmark_utils import ExperimentResult from s3torchbenchmarking.models import ( Entitlement, ViT, @@ -26,11 +27,16 @@ @hydra.main(version_base=None) def run_experiment(config: DictConfig) -> dict: model = make_model(config) + + fully_qualified_uri = ( + "s3://" + config.s3.bucket.strip("/") + "/" + config.dataset.strip("/") + ) + dataset = make_dataset( kind=config.dataloader.kind, sharding=config.sharding, - prefix_uri=config.prefix_uri, - region=config.region, + prefix_uri=fully_qualified_uri, + region=config.s3.region, load_sample=model.load_sample, num_workers=config.dataloader.num_workers, ) @@ -40,12 +46,13 @@ def run_experiment(config: DictConfig) -> dict: batch_size=config.dataloader.batch_size, ) - result = model.train(dataloader, config.epochs) + result: ExperimentResult = model.train(dataloader, config.epochs) metrics = { - "throughput_mibs": [result.volume / result.elapsed_time], - "elapsed_time_s": [result.elapsed_time], - "utilization": {k: v.summarize() for k, v in result.utilization.items()}, + "throughput_mibs": result["volume"] / result["training_duration_s"], + "training_duration_s": result["training_duration_s"], + "epoch_durations_s": result["epoch_durations_s"], + "utilization": {k: v.summarize() for k, v in result["utilization"].items()}, } return {"metrics": metrics} diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/dcp/README.md b/s3torchbenchmarking/src/s3torchbenchmarking/dcp/README.md index aba26d7e..8dc028a7 100644 --- a/s3torchbenchmarking/src/s3torchbenchmarking/dcp/README.md +++ b/s3torchbenchmarking/src/s3torchbenchmarking/dcp/README.md @@ -1,165 +1,24 @@ ## PyTorch's Distributed Checkpoint (DCP) benchmarks -The `dcp` Python package holds all the logic to execute benchmarks for [PyTorch's Distributed Checkpointing][DCP] -feature against the `s3torchconnector` library. 
+The `dcp` Python package provides a suite of benchmarks designed to evaluate and measure the performance +of [PyTorch's Distributed Checkpointing (DCP)][DCP] feature in comparison to the `s3torchconnector` library. ### Purpose -These benchmarks are designed to: +These benchmarks focus on testing the "save" mechanism of PyTorch DCP (`torch.distributed.checkpoint.save`). The primary +objectives are to evaluate the `s3torchconnector` library's performance against other libraries and local storage +options, by measuring the following metrics: -1. Test the "save" mechanism of PyTorch DCP (`torch.distributed.checkpoint.save`); -2. Compare the performance of the s3torchconnector library against other libraries and local storage; -3. Measure throughput (in MiB/s) and save times (in seconds). - -### Usage - -> [!IMPORTANT] -> The benchmarks are designed to be run on a EC2 instance. - -Install the `s3torchbenchmarking` package with `pip` (see the [root README](../../../README.md) for instructions), -along with the `s3torchconnector[dcp]` extra; once installed, the DCP benchmarks can be run with: - -```shell -$ s3torch-benchmark-dcp -cd conf -cn dcp -``` - -The command must be executed from the package's root, where it can read from the `config/` directory; it will create a -`./multirun/` directory (at the location of execution), and store all benchmark results there. - -> [!WARNING] -> When saving on local disk, consider clearing the `path` specified in your config between runs to prevent disk space -> issues. - -#### Potential caveats - -If you encounter the following errors during installation, try the associated command: - -**Error**: - -``` -RuntimeError: Failed to import transformers.models.vit.modeling_vit because of the following error (look up to see its traceback): -operator torchvision::nms does not exist -``` - -**Try**: - -```shell -$ conda install -y pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia -``` - -**Error**: - -``` -TypeError: canonicalize_version() got an unexpected keyword argument 'strip_trailing_zero' -``` - -**Try**: - -```shell -$ pip install "setuptools<71" -``` +- Checkpoint saving throughput (in MiB/s); +- Checkpoint "corrected" save durations (in seconds), which exclude the influence of model load duration on the device. ### Configuration -The benchmark runs can be customized using the [`dcp.yaml`](../../../conf/dcp.yaml) file. This section outlines the key -configuration options and their impacts. - -#### Configuration Requirements - -All keys in the `dcp.yaml` file must be defined for a run to execute successfully. - -#### Key Configuration Options - -`epochs` +The benchmark runs can be customized through the [`dcp.yaml`](../../../conf/dcp.yaml) file. -- Specifies the number of iterations for "saving" a model's checkpoint. -- Note: This does not affect model training, as no actual training occurs in these benchmarks. - -`path` - -- Designates the directory for benchmark operations. -- If the specified directory doesn't exist, it will be created automatically. -- For optimal performance using an SSD filesystem, refer to the [`prepare_nvme.sh`](../../../utils/prepare_nvme.sh) - script. - -`hydra.sweeper.params` - -This section allows for multiple benchmark configurations: - -- The benchmark will run sequential jobs for each combination of the specified parameters. -- Available options include: - - `+model`: Choose from pre-trained models listed in [`models.py`](models.py). - - `+backend`: Select `nccl`, `gloo`, or both. 
- - `+world_size`: Defines the number of workers. - - `+thread_count`: Defines the number of threads to use for saving the checkpoints. - - `+checkpoint.storage`: Choose `s3`, `disk`, or both. - -#### Example Configuration - -```yaml -s3: - region: eu-west-1 - uri: s3://my-bucket -epochs: 3 -path: ./nvme/ - -hydra: - mode: MULTIRUN - sweeper: - params: - +model: vit-base,T0_3B - +backend: nccl,gloo - +world_size: 2,4 - +thread_count: 1 - +checkpoint.storage: s3,disk -``` - -This configuration will run benchmarks for all combinations of the specified models, backends, world sizes, and storage -options, totaling 16 (2×2×2×1×2) different benchmark scenarios. - -### Important notes - -- The benchmarks may take some time to complete, depending on the hardware and network configuration. -- For optimal results, it is recommended to run the benchmarks on a dedicated EC2 instance without other - resource-intensive processes. -- Ensure the specified S3 bucket exists in the given region and the EC2 user/role has read+write permissions. - -### Results - -Benchmark results are organized as follows: - -```shell -multirun/ -└── YYYY-MM-DD - └── HH-MM-SS - ├── 0 - │ ├── benchmark.log - │ └── results_small_nccl_2_2_s3.json - ├── 1 - │ ├── benchmark.log - │ └── results_small_nccl_2_2_disk.json - ├── 2 - │ ├── benchmark.log - │ └── results_small_nccl_4_2_s3.json - ├── 3 - │ ├── benchmark.log - │ └── results_small_nccl_4_2_disk.json - └── multirun.yaml -``` - -Each run creates a timestamped subdirectory. The `./multirun/` directory is managed by [Hydra](https://hydra.cc/). - -Result file names reflect the parameter combinations, e.g., - -``` -+model: vit-base -+backend: nccl -+world_size: 2 -+thread_count: 1 -+checkpoint.storage: s3 -``` - -will produce the file `results_vit-base_nccl_2_1_s3.json` (respecting parameters declaration order). +> [!IMPORTANT] +> A `+path` option is passed to the running script ([`run_dcp_benchmarks.sh`](../../../utils/run_dcp_benchmarks.sh)), +> and will be used only if `checkpoint.storage` key includes `disk`. 
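For orientation, the sketch below shows the kind of save path these benchmarks time, writing one checkpoint to local disk and one to S3. It is illustrative only: the region and bucket are placeholders, and the `S3StorageWriter(region, s3_uri)` call is assumed from the connector's documentation, so check it against your installed `s3torchconnector` version.

```python
# Illustrative sketch of the "save" path measured by these benchmarks, for a
# single worker. Assumptions (not taken from this repository): the bucket name
# and region are placeholders, and S3StorageWriter(region, s3_uri) is assumed
# from the s3torchconnector documentation; verify against your installed version.
import os

import torch
import torch.distributed as dist
import torch.distributed.checkpoint as dcp
from torch.distributed.checkpoint import FileSystemWriter

from s3torchconnector.dcp import S3StorageWriter

# Single-process "distributed" setup, so the example runs without torchrun.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

model = torch.nn.Linear(256, 256)  # stand-in for the benchmarked models
state_dict = {"model": model.state_dict()}

# Local-disk baseline (`+checkpoint.storage: disk`), e.g. on the NVMe mount.
dcp.save(state_dict, storage_writer=FileSystemWriter("./nvme/checkpoints"))

# S3 variant (`+checkpoint.storage: s3`).
dcp.save(
    state_dict,
    storage_writer=S3StorageWriter("eu-west-1", "s3://my-bucket/checkpoints/"),
)

dist.destroy_process_group()
```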
### References @@ -168,3 +27,5 @@ will produce the file `results_vit-base_nccl_2_1_s3.json` (respecting parameters - https://pytorch.org/tutorials/intermediate/ddp_tutorial.html [DCP]: https://pytorch.org/docs/stable/distributed.checkpoint.html + +[multirun]: https://hydra.cc/docs/tutorials/basic/running_your_app/multi-run/ diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/hydra_callback.py b/s3torchbenchmarking/src/s3torchbenchmarking/hydra_callback.py index fa09f1b8..caffb3f9 100644 --- a/s3torchbenchmarking/src/s3torchbenchmarking/hydra_callback.py +++ b/s3torchbenchmarking/src/s3torchbenchmarking/hydra_callback.py @@ -154,4 +154,4 @@ def _write_to_dynamodb(region: str, table_name: str, run: RunResults) -> None: table.put_item(Item=run_json) logger.info("✅ Put item into table successfully") except ClientError: - logger.error("Couldn't put item into table %s", table, exc_info=True) + logger.error("❌ Couldn't put item into table %s", table, exc_info=True) diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/models.py b/s3torchbenchmarking/src/s3torchbenchmarking/models.py index 577e0783..dce00bce 100644 --- a/s3torchbenchmarking/src/s3torchbenchmarking/models.py +++ b/s3torchbenchmarking/src/s3torchbenchmarking/models.py @@ -7,7 +7,8 @@ from abc import ABC, abstractmethod from functools import cached_property from io import IOBase -from typing import Optional, Any, Tuple, Union, Callable +from time import perf_counter +from typing import Optional, Any, Tuple, Union, Callable, List import lightning as L import torch @@ -122,11 +123,15 @@ def train_batch(self, batch_idx: int, data, target) -> Optional[Any]: def train(self, dataloader: DataLoader, epochs: int) -> ExperimentResult: """Train the model using given dataloader for number of epochs""" + + epoch_durations_s: List[float] = [] + with ResourceMonitor() as monitor: num_samples = 0 checkpoint_times = [] - start_time = time.perf_counter() + begin_training = perf_counter() for epoch in range(epochs): + begin_epoch = time.perf_counter() logger.info("Epoch #%i/%i", epoch, epochs - 1) for batch_idx, (data, target) in enumerate(dataloader): logger.debug("Batch #%i", batch_idx) @@ -134,14 +139,16 @@ def train(self, dataloader: DataLoader, epochs: int) -> ExperimentResult: num_samples += len(data) if result: checkpoint_times.append(result) - training_time = time.perf_counter() - start_time + epoch_durations_s.append(time.perf_counter() - begin_epoch) + training_duration_s = time.perf_counter() - begin_training - return ExperimentResult( - elapsed_time=training_time, - volume=num_samples, - checkpoint_times=checkpoint_times, - utilization=monitor.resource_data, - ) + return { + "training_duration_s": training_duration_s, + "epoch_durations_s": epoch_durations_s, + "volume": num_samples, + "checkpoint_times": checkpoint_times, + "utilization": monitor.resource_data, + } @abstractmethod def save(self, **kwargs): @@ -313,12 +320,12 @@ def train(self, dataloader: DataLoader, epochs: int) -> ExperimentResult: end_time = time.perf_counter() training_time = end_time - start_time - return ExperimentResult( - elapsed_time=training_time, - volume=sample_counting_cb.count, - checkpoint_times=profiling_checkpointer.save_times, - utilization=monitor.resource_data, - ) + return { + "training_duration_s": training_time, + "volume": sample_counting_cb.count, + "checkpoint_times": profiling_checkpointer.save_times, + "utilization": monitor.resource_data, + } def save(self, **kwargs): raise NotImplementedError( diff --git 
a/s3torchbenchmarking/utils/prepare_and_run_benchmark.sh b/s3torchbenchmarking/utils/prepare_and_run_benchmark.sh deleted file mode 100755 index 69edfdcf..00000000 --- a/s3torchbenchmarking/utils/prepare_and_run_benchmark.sh +++ /dev/null @@ -1,15 +0,0 @@ -DATALOADER=$1 -PATH_TO_STORE_DATASETS=$2 -BUCKET_NAME=$3 -REGION_NAME=$4 -RESULTS_BUCKET_NAME=$5 -RESULTS_REGION_NAME=$6 -RESULTS_PREFIX=$7 - -datasets=("100k_496x387_images_4Mb_shards" "100k_496x387_images_8Mb_shards" "100k_496x387_images_16Mb_shards" "100k_496x387_images_32Mb_shards" "100k_496x387_images_64Mb_shards" "100k_496x387_images_128Mb_shards" "100k_496x387_images_256Mb_shards" "10k_496x387_images") - -./utils/generate_datasets_files.sh "${PATH_TO_STORE_DATASETS}" "${BUCKET_NAME}" "${REGION_NAME}" "${datasets[@]}" -./utils/prepare_nvme.sh -rm -r -f ./multirun -./utils/run_dataloading_benchmarks.sh "${DATALOADER}" "${datasets[@]}" -python ./utils/upload_colated_results_to_s3.py "./multirun" "${RESULTS_BUCKET_NAME}" "${RESULTS_PREFIX}" "${DATALOADER}" diff --git a/s3torchbenchmarking/utils/prepare_ec2_instance.sh b/s3torchbenchmarking/utils/prepare_ec2_instance.sh deleted file mode 100755 index 7f530ae1..00000000 --- a/s3torchbenchmarking/utils/prepare_ec2_instance.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -# -# Script to prepare an EC2 instance for PyTorch benchmarks. Like other scripts within this directory, it is assumed -# that this is run from within the "s3-connector-for-pytorch/s3torchbenchmarking" directory. - -set -eou pipefail - -# Sanity check + install Mountpoint for Amazon S3 -if [[ -n $(which yum) ]]; then - sudo yum -y upgrade - - wget https://s3.amazonaws.com/mountpoint-s3-release/latest/x86_64/mount-s3.rpm - sudo yum install -y ./mount-s3.rpm && rm ./mount-s3.rpm -elif [[ -n $(which apt) ]]; then - sudo apt -y upgrade - - wget https://s3.amazonaws.com/mountpoint-s3-release/latest/x86_64/mount-s3.deb - sudo apt install -y ./mount-s3.deb && rm ./mount-s3.deb -fi - -# Install s3torchconnector and s3torchbenchmarking -pip install 's3torchconnector[lightning,dcp]' -pip install -e . diff --git a/s3torchbenchmarking/utils/prepare_nvme.sh b/s3torchbenchmarking/utils/prepare_nvme.sh index c8882024..3c688db0 100755 --- a/s3torchbenchmarking/utils/prepare_nvme.sh +++ b/s3torchbenchmarking/utils/prepare_nvme.sh @@ -1,16 +1,20 @@ #!/usr/bin/env bash # -# Mount an NVMe drive (by default, at `./nvme/`) relative to where this script is run. If a drive is already mounted at -# the specified location, clear its content. +# Mount an NVMe drive (by default, at `./nvme/`). Script assumes that it is run on a DLAMI-based EC2 instance. nvme_dir=${1:-"./nvme/"} # default value if ! 
mountpoint -q "$nvme_dir"; then rm -rf "$nvme_dir" - sudo mkfs -t xfs /dev/nvme1n1 mkdir -p "$nvme_dir" - sudo mount /dev/nvme1n1 "$nvme_dir" + + if grep -q 'NAME="Amazon Linux"' /etc/os-release; then + sudo mkfs -t xfs /dev/nvme1n1 + sudo mount /dev/nvme1n1 "$nvme_dir" + elif grep -q 'NAME="Ubuntu"' /etc/os-release; then + sudo /opt/aws/dlami/bin/nvme_ephemeral_drives.sh + sudo mount /dev/vg.01/lv_ephemeral "$nvme_dir" + fi + sudo chmod 777 "$nvme_dir" -else - rm -rf "${nvme_dir:?}"/* # https://www.shellcheck.net/wiki/SC2115 fi diff --git a/s3torchbenchmarking/utils/run_benchmarks.sh b/s3torchbenchmarking/utils/run_benchmarks.sh index 124a47e9..fd141182 100755 --- a/s3torchbenchmarking/utils/run_benchmarks.sh +++ b/s3torchbenchmarking/utils/run_benchmarks.sh @@ -1,16 +1,23 @@ #!/usr/bin/env bash # -# Template script to run other benchmarks (not to be used directly). +# Template script to run other benchmarks (not meant to be used directly). set -euo pipefail -scenario=$1 # name of the scenario -nvme_dir="./nvme/" # local path for saving checkpoints +while getopts "s:d:" opt; do + case $opt in + s) scenario=$OPTARG ;; # name of the scenario + d) nvme_dir=$OPTARG ;; # mount point dir for saving checkpoints (will use NVMe drive) + *) ;; + esac +done -shift +shift $((OPTIND - 1)) # remove all processed positional arguments from "$@" # Prepare NVMe drive mount -./utils/prepare_nvme.sh "$nvme_dir" +if [[ -n $nvme_dir ]]; then + ./utils/prepare_nvme.sh "$nvme_dir" +fi -# Run benchmarks; will write to DynamoDB table, if specified in the config +# Run benchmarks; will write to DynamoDB table, if specified in the config (in `conf/aws/dynamodb.yaml`) python ./src/s3torchbenchmarking/"$scenario"/benchmark.py -cd conf -cn "$scenario" +path="$nvme_dir" "$@" diff --git a/s3torchbenchmarking/utils/run_checkpoint_benchmarks.sh b/s3torchbenchmarking/utils/run_checkpoint_benchmarks.sh index 471dd2ad..8abb39e1 100755 --- a/s3torchbenchmarking/utils/run_checkpoint_benchmarks.sh +++ b/s3torchbenchmarking/utils/run_checkpoint_benchmarks.sh @@ -2,4 +2,4 @@ # # Run PyTorch Checkpointing benchmarks. -./utils/run_benchmarks.sh pytorch_checkpointing "$@" +./utils/run_benchmarks.sh -s pytorch_checkpointing -d ./nvme/ "$@" diff --git a/s3torchbenchmarking/utils/run_dataloading_benchmarks.sh b/s3torchbenchmarking/utils/run_dataloading_benchmarks.sh deleted file mode 100755 index 509374b0..00000000 --- a/s3torchbenchmarking/utils/run_dataloading_benchmarks.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# s3iterabledataset -# fsspec -# mountpoint -# mountpointcache - -# Check if the list of datasets is provided as an argument -if [ "$#" -lt 2 ]; then - echo "Usage: $0 [dataset2] [dataset3] ..." 
- exit 1 -fi - -DATALOADER=$1 -shift - -# Create an array from the remaining arguments (the datasets) -datasets=("$@") - -# work around for PyTorch's cuda clashing with installed locally from https://github.com/pytorch/pytorch/issues/119989 -unset LD_LIBRARY_PATH - -for dataset in "${datasets[@]}"; do - if [[ "$dataset" == *"shards"* ]]; then - s3torch-benchmark -cd conf -m -cn dataloading_sharded_vit "dataset=$dataset" "dataloader=$DATALOADER" - s3torch-benchmark -cd conf -m -cn dataloading_sharded_ent "dataset=$dataset" "dataloader=$DATALOADER" - else - s3torch-benchmark -cd conf -m -cn dataloading_unsharded_1epochs "dataset=$dataset" "dataloader=$DATALOADER" - s3torch-benchmark -cd conf -m -cn dataloading_unsharded_vit_10epochs "dataset=$dataset" "dataloader=$DATALOADER" - s3torch-benchmark -cd conf -m -cn dataloading_unsharded_ent_10epochs "dataset=$dataset" "dataloader=$DATALOADER" - fi -done diff --git a/s3torchbenchmarking/utils/run_dataset_benchmarks.sh b/s3torchbenchmarking/utils/run_dataset_benchmarks.sh index e9bb3983..fc508b61 100755 --- a/s3torchbenchmarking/utils/run_dataset_benchmarks.sh +++ b/s3torchbenchmarking/utils/run_dataset_benchmarks.sh @@ -2,5 +2,4 @@ # # Run dataset benchmarks. -# TODO: see if it can reuse the `run_benchmarks.sh` script template here -python ./src/s3torchbenchmarking/dataset/benchmark.py -cd conf -cn dataset "$@" +./utils/run_benchmarks.sh -s dataset -d ./nvme/ "$@" diff --git a/s3torchbenchmarking/utils/run_dcp_benchmarks.sh b/s3torchbenchmarking/utils/run_dcp_benchmarks.sh index 875b8384..d40cfb14 100755 --- a/s3torchbenchmarking/utils/run_dcp_benchmarks.sh +++ b/s3torchbenchmarking/utils/run_dcp_benchmarks.sh @@ -2,4 +2,4 @@ # # Run PyTorch’s Distributed Checkpointing (DCP) benchmarks. -./utils/run_benchmarks.sh dcp "$@" +./utils/run_benchmarks.sh -s dcp -d ./nvme/ "$@" diff --git a/s3torchbenchmarking/utils/run_lightning_benchmarks.sh b/s3torchbenchmarking/utils/run_lightning_benchmarks.sh index 60a8059e..ef285439 100755 --- a/s3torchbenchmarking/utils/run_lightning_benchmarks.sh +++ b/s3torchbenchmarking/utils/run_lightning_benchmarks.sh @@ -2,4 +2,4 @@ # # Run PyTorch Lightning Checkpointing benchmarks. 
-./utils/run_benchmarks.sh lightning_checkpointing "$@" +./utils/run_benchmarks.sh -s lightning_checkpointing -d ./nvme/ "$@" diff --git a/s3torchbenchmarking/utils/upload_colated_results_to_s3.py b/s3torchbenchmarking/utils/upload_colated_results_to_s3.py deleted file mode 100644 index 27661fd7..00000000 --- a/s3torchbenchmarking/utils/upload_colated_results_to_s3.py +++ /dev/null @@ -1,50 +0,0 @@ -import os -import boto3 -from botocore.exceptions import ClientError -import sys - -s3_client = boto3.client("s3") - - -def upload_file_to_s3(local_file_path: str, bucket_name: str, s3_file_key: str) -> None: - try: - s3_client.upload_file(local_file_path, bucket_name, s3_file_key) - print(f"Uploaded {local_file_path} to {bucket_name}/{s3_file_key}") - except ClientError as e: - print(f"Error uploading {local_file_path} to {bucket_name}/{s3_file_key}: {e}") - - -def traverse_folders( - folder_path: str, bucket_name: str, prefix: str, dataloader: str -) -> None: - for root, _, files in os.walk(folder_path): - for file in files: - if file == "collated_results.json": - local_file_path = os.path.join(root, file) - parent_folder = os.path.basename(os.path.dirname(local_file_path)) - s3_file_key = f"{prefix}/{dataloader}_{parent_folder}_{file}" - print(f"Uploading {local_file_path} to {bucket_name}/{s3_file_key}") - upload_file_to_s3(local_file_path, bucket_name, s3_file_key) - - -if __name__ == "__main__": - if len(sys.argv) != 5: - print("Usage: python script.py ROOT_FOLDER BUCKET_NAME FOLDER_PREFIX DS_PREFIX") - print( - "Example: python script.py ./multirun pytorch-benchmarks-results 20240810 s3iterabledataset" - ) - print("Note: ROOT_FOLDER is the root folder where the results are stored") - print( - "Note: BUCKET_NAME is the S3 bucket name where the results will be uploaded" - ) - print( - "Note: FOLDER_PREFIX is the prefix for the folder where the results are stored" - ) - print("Note: DS_PREFIX is the prefix for the dataset loader") - sys.exit(1) - - ROOT_FOLDER = sys.argv[1] - BUCKET_NAME = sys.argv[2] - FOLDER_PREFIX = sys.argv[3] - DS_PREFIX = sys.argv[4] - traverse_folders(ROOT_FOLDER, BUCKET_NAME, FOLDER_PREFIX, DS_PREFIX)