diff --git a/CALL_FOR_SUBMISSIONS.md b/CALL_FOR_SUBMISSIONS.md new file mode 100644 index 000000000..ecc7840e7 --- /dev/null +++ b/CALL_FOR_SUBMISSIONS.md @@ -0,0 +1,3 @@ +# MLCommons™ AlgoPerf: Call for Submissions + +🚧 **Coming soon!** 🚧 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 38867b369..025cb6d30 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,28 @@ -# Contributing +# MLCommons™ AlgoPerf: Contributing + +## Table of Contents + +- [Setup](#setup) + - [Setting up a Linux VM on GCP](#setting-up-a-linux-vm-on-gcp) + - [Installing GPU Drivers](#installing-gpu-drivers) + - [Authentication for Google Cloud Container Registry](#authentication-for-google-cloud-container-registry) +- [Installation](#installation) +- [Docker workflows](#docker-workflows) + - [Pre-built Images on Google Cloud Container Registry](#pre-built-images-on-google-cloud-container-registry) + - [Trigger rebuild and push of maintained images](#trigger-rebuild-and-push-of-maintained-images) + - [Trigger build and push of images on other branch](#trigger-build-and-push-of-images-on-other-branch) + - [GCP Data and Experiment Integration](#gcp-data-and-experiment-integration) + - [Downloading Data from GCP](#downloading-data-from-gcp) + - [Saving Experiments to GCP](#saving-experiments-to-gcp) + - [Getting Information from a Container](#getting-information-from-a-container) + - [Mounting Local Repository](#mounting-local-repository) +- [Submitting PRs](#submitting-prs) +- [Testing](#testing) + - [Style Testing](#style-testing) + - [Unit and integration tests](#unit-and-integration-tests) + - [Regression tests](#regression-tests) + +We invite everyone to look through our rules and codebase and submit issues and pull requests, e.g. for rules changes, clarifications, or any bugs you might encounter. If you are interested in contributing to the work of the working group and influencing the benchmark's design decisions, please [join the weekly meetings](https://mlcommons.org/en/groups/research-algorithms/) and consider becoming a member of the working group. The best way to contribute to MLCommons is to get involved with one of our many project communities. You can find more information about getting involved with MLCommons [here](https://mlcommons.org/en/get-involved/#getting-started). @@ -8,29 +32,25 @@ To get started contributing code, you or your organization needs to sign the MLC MLCommons project work is tracked with issue trackers and pull requests. Modify the project in your own fork and issue a pull request once you want other developers to take a look at what you have done and discuss the proposed changes. Ensure that cla-bot and other checks pass for your pull requests. -# Table of Contents -- [Setup](#setup) -- [Installation](#installation) -- [Docker workflows](#docker-workflows) -- [Submitting PRs](#submitting-prs) -- [Testing](#testing) +## Setup +### Setting up a Linux VM on GCP -# Setup -## Setting up a Linux VM on GCP If you want to run containers on GCP VMs or store and retrieve Docker images from the Google Cloud Container Registry, please read ahead. If you'd like to use a Linux VM, you will have to install the correct GPU drivers and the NVIDIA Docker toolkit. We recommend using the Deep Learning on Linux image. Further instructions are based on that image. ### Installing GPU Drivers + You can use `scripts/cloud-startup.sh` as a startup script for the VM. This will automate the installation of the NVIDIA GPU drivers and the NVIDIA Docker toolkit.
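For reference, a VM created from the Deep Learning on Linux image can be given this startup script at creation time. The sketch below is only an illustration of how that might look with the gcloud CLI; the instance name, zone, machine type, image family, and disk size are assumptions and should be adapted to the benchmarking hardware you are targeting.

```bash
# Illustration only: instance name, zone, machine type, image family, and disk
# size below are assumptions -- adjust them to your project and quota.
gcloud compute instances create algoperf-vm \
  --zone=us-central1-a \
  --machine-type=n1-standard-64 \
  --accelerator=type=nvidia-tesla-v100,count=8 \
  --maintenance-policy=TERMINATE \
  --image-family=common-cu113 \
  --image-project=deeplearning-platform-release \
  --boot-disk-size=2000GB \
  --metadata-from-file=startup-script=scripts/cloud-startup.sh
```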
### Authentication for Google Cloud Container Registry + To access the Google Cloud Container Registry, you will have to authenticate to the repository whenever you use Docker. Use the gcloud credential helper as documented [here](https://cloud.google.com/artifact-registry/docs/docker/pushing-and-pulling#cred-helper). +## Installation -# Installation If you have not installed the package and dependencies yet, see [Installation](./README.md#installation). To use the development tools such as `pytest` or `pylint` use the `dev` option: @@ -42,39 +62,34 @@ pre-commit install To get an installation with the requirements for all workloads and development, use the argument `[full_dev]`. +## Docker workflows +We recommend developing in our Docker image to ensure a consistent environment between developing, testing and scoring submissions. -# Docker workflows -We recommend developing in our Docker image to ensure a consistent environment between developing, testing and scoring submissions. +To get started see also: -To get started see: -- [Installation with Docker](./README.md#docker) +- [Installation with Docker](./README.md#docker) - [Running a submission inside a Docker Container](./GETTING_STARTED.md#run-your-submission-in-a-docker-container) -Other resources: -- [Pre-built Images on Google Cloud Container Registry](#pre-built-images-on-google-cloud-container-registry) -- [GCP Data and Experiment Integration](#gcp-integration) - - [Downloading Data from GCP](#downloading-data-from-gcp) - - [Saving Experiments Results to GCP](#saving-experiments-to-gcp) -- [Getting Information from a Container](#getting-information-from-a-container) -- [Mounting local repository](#mounting-local-repository) +### Pre-built Images on Google Cloud Container Registry - -## Pre-built Images on Google Cloud Container Registry If you want to maintain or use images stored on our Google Cloud Container Registry, read this section. You will have to use an authentication helper to set up permissions to access the repository: -``` + +```bash ARTIFACT_REGISTRY_URL=us-central1-docker.pkg.dev gcloud auth configure-docker $ARTIFACT_REGISTRY_URL ``` To pull the latest prebuilt image: -``` +```bash docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/<image_name> ``` + -The naming convention for `image_name` is `algoperf__`. +The naming convention for `image_name` is `algoperf_<framework>_<branch>`. Currently maintained images on the repository are: + - `algoperf_jax_main` - `algoperf_pytorch_main` - `algoperf_both_main` @@ -82,32 +97,40 @@ Currently maintained images on the repository are: - `algoperf_pytorch_dev` - `algoperf_both_dev` -To reference the pulled image you will have to use the full `image_path`, e.g. +To reference the pulled image you will have to use the full `image_path`, e.g. `us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main`. ### Trigger rebuild and push of maintained images + To build and push all images (`pytorch`, `jax`, `both`) on maintained branches (`dev`, `main`), run: -``` + +```bash bash docker/build_docker_images.sh -b <branch> ``` #### Trigger build and push of images on other branch + You can also use the above script to build images from a different branch. + 1. Push the branch to the `mlcommons/algorithmic-efficiency` repository. 2.
Run - ``` + + ```bash bash docker/build_docker_images.sh -b <branch> ``` -## GCP Data and Experiment Integration -The Docker entrypoint script can transfer data to and from +### GCP Data and Experiment Integration + +The Docker entrypoint script can transfer data to and from our GCP buckets on our internal GCP project. If -you are an approved contributor you can get access to these resources to automatically download the datasets and upload experiment results. +you are an approved contributor you can get access to these resources to automatically download the datasets and upload experiment results. You can use these features by setting the `--internal_contributor` flag to 'true' for the Docker entrypoint script. ### Downloading Data from GCP + To run a Docker container that will only download data (if it is not found on the host): -``` + +```bash docker run -t -d \ -v $HOME/data/:/data/ \ -v $HOME/experiment_runs/:/experiment_runs \ @@ -120,15 +143,18 @@ docker run -t -d \ --keep_container_alive \ --internal_contributor true ``` + If `keep_container_alive` is `true`, the main process on the container will persist after finishing the data download. -This run command is useful if you are developing or debugging. +This run command is useful if you are developing or debugging. ### Saving Experiments to GCP + If you set the internal collaborator mode to `true`, experiments will also be automatically uploaded to our GCP bucket under `gs://mlcommons-runs/<experiment_name>` ``` To enter a bash session in the container -``` + +```bash docker exec -it <container_id> /bin/bash ``` -## Mounting Local Repository +### Mounting Local Repository + Rebuilding the Docker image can become tedious if you are making frequent changes to the code. -To have changes in your local copy of the algorithmic-efficiency repo be reflected inside the container you can mount the local repository with the `-v` flag. +To have changes in your local copy of the algorithmic-efficiency repo be reflected inside the container, you can mount the local repository with the `-v` flag. + +```bash docker run -t -d \ -v $HOME/data/:/data/ \ -v $HOME/experiment_runs/:/experiment_runs \ @@ -178,33 +210,40 @@ docker run -t -d \ --keep_container_alive true ``` -# Submitting PRs +## Submitting PRs + New PRs will be merged into the `dev` branch by default, given that they pass the presubmits. -# Testing +## Testing + We run tests with GitHub Actions, configured in the [.github/workflows](https://github.com/mlcommons/algorithmic-efficiency/tree/main/.github/workflows) folder. -## Style Testing +### Style Testing + We run yapf and linting tests on PRs. You can view and fix offending errors with these instructions. To run the below commands, use the versions installed via `pip install -e '.[dev]'`. To automatically fix formatting errors, run the following (*WARNING:* this will edit your code, so it is suggested to make a git commit first!): + ```bash yapf -i -r -vv -p algorithmic_efficiency baselines datasets reference_algorithms tests *.py ``` To sort all import orderings, run the following: + ```bash isort . ``` To just print out all offending import orderings, run the following: + ```bash isort . --check --diff ``` To print out all offending pylint issues, run the following: + ```bash pylint algorithmic_efficiency pylint baselines @@ -218,16 +257,20 @@ pylint tests We run unit tests and integration tests as part of the GitHub Actions workflows as well.
You can also use `python tests/reference_algorithm_tests.py` to run a single model update and two model evals for each workload using the reference algorithm in `reference_algorithms/target_setting_algorithms/`. -## Regression tests +### Regression tests + We also have regression tests available in [.github/workflows/regression_tests.yml](https://github.com/mlcommons/algorithmic-efficiency/tree/main/.github/workflows/regression_tests.yml) that can be run semi-automatically. -The regression tests are shorter end-to-end submissions run in a containerized environment across all 8 workloads, in both the jax and pytorch frameworks. +The regression tests are shorter end-to-end submissions run in a containerized environment across all 8 workloads, in both the JAX and PyTorch frameworks. The regression tests run on self-hosted runners and are triggered for pull requests that target the main branch. Typically these PRs will be from the `dev` branch, so the tests will run containers based on images built from the `dev` branch. To run a regression test: + 1. Build and upload the latest Docker images from the `dev` branch. - ``` + + ```bash bash ~/algorithmic-efficiency/docker/build_docker_images.sh -b dev ``` + 2. Turn on the self-hosted runner. 3. Run the self-hosted runner application for the runner to accept jobs. 4. Open a pull request into `main` to trigger the workflow. diff --git a/getting_started.md b/GETTING_STARTED.md similarity index 89% rename from getting_started.md rename to GETTING_STARTED.md index 96e58edab..1369f5cc7 100644 --- a/getting_started.md +++ b/GETTING_STARTED.md @@ -1,52 +1,68 @@ -# Getting Started +# MLCommons™ AlgoPerf: Getting Started -Table of Contents: -- [Set up and installation](#set-up-and-installation) +## Table of Contents + +- [Set up and installation](#set-up-and-installation) - [Download the data](#download-the-data) - [Develop your submission](#develop-your-submission) + - [Set up your directory structure (Optional)](#set-up-your-directory-structure-optional) + - [Coding your submission](#coding-your-submission) - [Run your submission](#run-your-submission) - - [Docker](#run-your-submission-in-a-docker-container) + - [Pytorch DDP](#pytorch-ddp) + - [Run your submission in a Docker container](#run-your-submission-in-a-docker-container) + - [Docker Tips](#docker-tips) - [Score your submission](#score-your-submission) +- [Good Luck](#good-luck) ## Set up and installation + To get started you will have to make a few decisions and install the repository along with its dependencies. Specifically: + 1. Decide if you would like to develop your submission in either PyTorch or JAX. -2. Set up your workstation or VM. We recommend to use a setup similar to the [benchmarking hardware](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#benchmarking-hardware). +2. Set up your workstation or VM. We recommend using a setup similar to the [benchmarking hardware](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#benchmarking-hardware). The specs on the benchmarking machines are: - 8 V100 GPUs - 240 GB of RAM - - 2 TB in storage (for datasets). + - 2 TB of storage (for datasets). + 3. Install the algorithmic package and dependencies, see [Installation](./README.md#installation). ## Download the data -The workloads in this benchmark use 6 different datasets across 8 workloads. You may choose to download some or all of the datasets as you are developing your submission, but your submission will be scored across all 8 workloads.
For instructions on obtaining and setting up the datasets see [datasets/README](https://github.com/mlcommons/algorithmic-efficiency/blob/main/datasets/README.md#dataset-setup). +The workloads in this benchmark use 6 different datasets across 8 workloads. You may choose to download some or all of the datasets as you are developing your submission, but your submission will be scored across all 8 workloads. For instructions on obtaining and setting up the datasets see [datasets/README](https://github.com/mlcommons/algorithmic-efficiency/blob/main/datasets/README.md#dataset-setup). ## Develop your submission + To develop a submission you will write a Python module containing your optimizer algorithm. Your optimizer must implement a set of predefined API methods for the initialization and update steps. ### Set up your directory structure (Optional) + Make a submissions subdirectory to store your submission modules, e.g. `algorithmic-efficiency/submissions/my_submissions`. ### Coding your submission + You can find examples of submission modules under `algorithmic-efficiency/baselines` and `algorithmic-efficiency/reference_algorithms`. \ A submission for the external ruleset will consist of a submission module and a tuning search space definition. + 1. Copy the template submission module `submissions/template/submission.py` into your submissions directory, e.g. in `algorithmic-efficiency/my_submissions`. 2. Implement at least the methods in the template submission module. Feel free to use helper functions and/or modules as you see fit. Make sure you adhere to the competition rules. Check out the guidelines for [allowed submissions](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#allowed-submissions), [disallowed submissions](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#disallowed-submissions) and pay special attention to the [software dependencies rule](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#software-dependencies). 3. Add a tuning configuration, e.g. a `tuning_search_space.json` file, to your submission directory. For the tuning search space you can either: 1. Define the set of feasible points by defining a value for "feasible_points" for the hyperparameters: + ```JSON { "learning_rate": { "feasible_points": 0.999 } } ``` + For a complete example see [tuning_search_space.json](https://github.com/mlcommons/algorithmic-efficiency/blob/main/reference_algorithms/target_setting_algorithms/imagenet_resnet/tuning_search_space.json). 2. Define a range of values for quasirandom sampling by specifying `min`, `max`, and `scaling` keys for the hyperparameter: + + ```JSON { "weight_decay": { "min": 5e-3, @@ -55,14 +71,15 @@ A submission for the external ruleset will consist of a submission module and a } } ``` - For a complete example see [tuning_search_space.json](https://github.com/mlcommons/algorithmic-efficiency/blob/main/baselines/nadamw/tuning_search_space.json). + For a complete example see [tuning_search_space.json](https://github.com/mlcommons/algorithmic-efficiency/blob/main/baselines/nadamw/tuning_search_space.json).
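Since the tuning file is plain JSON, a quick way to catch formatting mistakes (such as trailing commas) before launching a run is to validate it locally; the path below assumes the directory layout suggested above and is only an example.

```bash
# Validate and pretty-print the tuning search space file (path is an example).
python3 -m json.tool submissions/my_submissions/tuning_search_space.json
```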
## Run your submission From your virtual environment or interactively running Docker container, run your submission with `submission_runner.py`: -**JAX**: to score your submission on a workload, from the algorithmic-efficency directory run: +**JAX**: to score your submission on a workload, from the algorithmic-efficiency directory run: + ```bash python3 submission_runner.py \ --framework=jax \ @@ -73,7 +90,8 @@ python3 submission_runner.py \ --tuning_search_space= ``` -**Pytorch**: to score your submission on a workload, from the algorithmic-efficency directory run: +**Pytorch**: to score your submission on a workload, from the algorithmic-efficiency directory run: + ```bash python3 submission_runner.py \ --framework=pytorch \ @@ -84,14 +102,18 @@ python3 submission_runner.py \ --tuning_search_space= ``` -#### Pytorch DDP -We recommend using PyTorch's [Distributed Data Parallel (DDP)](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) -when using multiple GPUs on a single node. You can initialize ddp with torchrun. +### Pytorch DDP + +We recommend using PyTorch's [Distributed Data Parallel (DDP)](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) +when using multiple GPUs on a single node. You can initialize DDP with `torchrun`. For example, on a single host with 8 GPUs simply replace `python3` in the above command by: + ```bash torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 --standalone --nnodes=1 --nproc_per_node=N_GPUS ``` + So the complete command is: + ```bash torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 \ --standalone \ --nnodes=1 \ @@ -109,17 +131,18 @@ torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 \ ### Run your submission in a Docker container The container entrypoint script provides the following flags: + - `--dataset` dataset: can be 'imagenet', 'fastmri', 'librispeech', 'criteo1tb', 'wmt', or 'ogbg'. Setting this flag will download data if `~/data/` does not exist on the host machine. Required for running a submission. - `--framework` framework: can be either 'pytorch' or 'jax'. If you just want to download data, this flag is required for `-d imagenet` since we have two versions of data for imagenet. This flag is also required for running a submission. - `--submission_path` submission_path: path to submission file on container filesystem. If this flag is set, the container will run a submission, so it is required for running a submission. - `--tuning_search_space` tuning_search_space: path to file containing tuning search space on container filesystem. Required for running a submission. - `--experiment_name` experiment_name: name of experiment. Required for running a submission. - `--workload` workload: can be 'imagenet_resnet', 'imagenet_jax', 'librispeech_deepspeech', 'librispeech_conformer', 'ogbg', 'wmt', 'fastmri' or 'criteo1tb'. Required for running a submission. - `--max_global_steps` max_global_steps: maximum number of steps to run the workload for. Optional. - `--keep_container_alive` : can be true or false. If `true`, the container will not be killed automatically. This is useful for developing or debugging. - To run the Docker container that will run the submission runner, run: + ```bash docker run -t -d \ -v $HOME/data/:/data/ \ @@ -136,32 +159,37 @@ docker run -t -d \ --workload <workload> \ --keep_container_alive <keep_container_alive> ``` + This will print the container ID to the terminal.
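As an example, a filled-in invocation might look like the following. The image path, dataset/workload choice, and submission paths are placeholders for illustration only (reusing the AdamW baseline paths from the README), not a prescribed configuration.

```bash
# Example only: image path, dataset/workload, and submission paths are
# illustrative -- substitute your own values.
docker run -t -d \
  -v $HOME/data/:/data/ \
  -v $HOME/experiment_runs/:/experiment_runs \
  -v $HOME/experiment_runs/logs:/logs \
  --gpus all \
  --ipc=host \
  us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main \
  --dataset ogbg \
  --framework jax \
  --submission_path baselines/adamw/jax/submission.py \
  --tuning_search_space baselines/adamw/tuning_search_space.json \
  --experiment_name my_first_experiment \
  --workload ogbg \
  --keep_container_alive true
```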
-#### Docker Tips #### +#### Docker Tips To find the container IDs of running containers -``` + +```bash docker ps ``` To see output of the entrypoint script -``` + +```bash docker logs <container_id> ``` To enter a bash session in the container -``` + +```bash docker exec -it <container_id> /bin/bash ``` -## Score your submission +## Score your submission + To produce a performance profile and performance table: + ```bash python3 scoring/score_submission.py --experiment_path=<experiment_path> --output_dir=<output_dir> ``` -We provide the scores and performance profiles for the baseline algorithms in the "Baseline Results" section in [Benchmarking Neural Network Training Algorithms](https://arxiv.org/abs/2306.07179). - +We provide the scores and performance profiles for the baseline algorithms in the "Baseline Results" section in [Benchmarking Neural Network Training Algorithms](https://arxiv.org/abs/2306.07179). -## Good Luck! +## Good Luck diff --git a/README.md b/README.md index 6ffbab6f7..dd0d7fe3e 100644 --- a/README.md +++ b/README.md @@ -22,20 +22,38 @@ [MLCommons Algorithmic Efficiency](https://mlcommons.org/en/groups/research-algorithms/) is a benchmark and competition measuring neural network training speedups due to algorithmic improvements in both training algorithms and models. This repository holds the [competition rules](RULES.md) and the benchmark code to run it. For a detailed description of the benchmark design, see our [paper](https://arxiv.org/abs/2306.07179). -# Table of Contents +## Table of Contents + +- [Table of Contents](#table-of-contents) - [Installation](#installation) - - [Python Virtual Environment](#python-virtual-environment) - - [Docker](#docker) + - [Python virtual environment](#python-virtual-environment) + - [Docker](#docker) + - [Building Docker Image](#building-docker-image) + - [Running Docker Container (Interactive)](#running-docker-container-interactive) + - [Running Docker Container (End-to-end)](#running-docker-container-end-to-end) + - [Using Singularity/Apptainer instead of Docker](#using-singularityapptainer-instead-of-docker) - [Getting Started](#getting-started) + - [Running a workload](#running-a-workload) + - [JAX](#jax) + - [Pytorch](#pytorch) - [Rules](#rules) - [Contributing](#contributing) -- [Diclaimers](#disclaimers) -- [FAQS](#faqs) -- [Citing AlgoPerf Benchmark](#citing-algoperf-benchmark) +- [Shared data pipelines between JAX and PyTorch](#shared-data-pipelines-between-jax-and-pytorch) +- [Setup and Platform](#setup-and-platform) + - [My machine only has one GPU. How can I use this repo?](#my-machine-only-has-one-gpu-how-can-i-use-this-repo) + - [How do I run this on my SLURM cluster?](#how-do-i-run-this-on-my-slurm-cluster) + - [How can I run this on my AWS/GCP/Azure cloud project?](#how-can-i-run-this-on-my-awsgcpazure-cloud-project) +- [Submissions](#submissions) + - [Can submission be structured using multiple files?](#can-submission-be-structured-using-multiple-files) + - [Can I install custom dependencies?](#can-i-install-custom-dependencies) + - [How can I know if my code can be run on benchmarking hardware?](#how-can-i-know-if-my-code-can-be-run-on-benchmarking-hardware) + - [Are we allowed to use our own hardware to self-report the results?](#are-we-allowed-to-use-our-own-hardware-to-self-report-the-results) + ## Installation + You can install this package and dependencies in a [python virtual environment](#python-virtual-environment) or use a [Docker/Singularity/Apptainer container](#docker) (recommended).
*TL;DR to install the JAX version for GPU run:* @@ -53,10 +71,13 @@ You can install this package and dependences in a [python virtual environment](# pip3 install -e '.[pytorch_gpu]' -f 'https://download.pytorch.org/whl/torch_stable.html' pip3 install -e '.[full]' ``` -## Python virtual environment + +### Python virtual environment + Note: Python minimum requirement >= 3.8 To set up a virtual environment and install this repository + 1. Create new environment, e.g. via `conda` or `virtualenv` ```bash @@ -89,17 +110,21 @@ or all workloads at once via ```bash pip3 install -e '.[full]' ``` + -## Docker +### Docker + We recommend using a Docker container to ensure a similar environment to our scoring and testing environments. Alternatively, a Singularity/Apptainer container can also be used (see instructions below). -**Prerequisites for NVIDIA GPU set up**: You may have to install the NVIDIA Container Toolkit so that the containers can locate the NVIDIA drivers and GPUs. +**Prerequisites for NVIDIA GPU setup**: You may have to install the NVIDIA Container Toolkit so that the containers can locate the NVIDIA drivers and GPUs. See instructions [here](https://github.com/NVIDIA/nvidia-docker). -### Building Docker Image +#### Building Docker Image + 1. Clone this repository ```bash @@ -107,17 +132,21 @@ See instructions [here](https://github.com/NVIDIA/nvidia-docker). ``` 2. Build Docker Image + ```bash cd algorithmic-efficiency/docker docker build -t <docker_image_name> . --build-arg framework=<framework> ``` + The `framework` flag can be either `pytorch`, `jax` or `both`. Specifying the framework will install the framework-specific dependencies. The `docker_image_name` is arbitrary. +#### Running Docker Container (Interactive) -### Running Docker Container (Interactive) To use the Docker container as an interactive virtual environment, you can run a container mounted to your local data and code directories and execute the `bash` program. This may be useful if you are in the process of developing a submission. -1. Run detached Docker Container. The `container_id` will be printed if the container is running successfully. + +1. Run a detached Docker container. The `container_id` will be printed if the container is run successfully. + ```bash docker run -t -d \ -v $HOME/data/:/data/ \ @@ -142,36 +171,47 @@ To use the Docker container as an interactive virtual environment, you can run a docker exec -it <container_id> /bin/bash ``` -### Running Docker Container (End-to-end) +#### Running Docker Container (End-to-end) + To run a submission end-to-end in a containerized environment see [Getting Started Document](./GETTING_STARTED.md#run-your-submission-in-a-docker-container). ### Using Singularity/Apptainer instead of Docker + Since many compute clusters don't allow the usage of Docker due to security concerns and instead encourage the use of [Singularity/Apptainer](https://github.com/apptainer/apptainer) (formerly Singularity, now called Apptainer), we also provide instructions on how to build an Apptainer container based on the Dockerfile provided here.
To convert the Dockerfile into an Apptainer definition file, we will use [spython](https://github.com/singularityhub/singularity-cli): + ```bash pip3 install spython cd algorithmic-efficiency/docker spython recipe Dockerfile &> Singularity.def ``` + Now we can build the Apptainer image by running + ```bash singularity build --fakeroot <singularity_image_name>.sif Singularity.def ``` + To start a shell session with GPU support (by using the `--nv` flag), we can run + ```bash singularity shell --nv <singularity_image_name>.sif ``` + Similarly to Docker, Apptainer allows you to bind specific paths on the host system and the container by specifying the `--bind` flag, as explained [here](https://docs.sylabs.io/guides/3.7/user-guide/bind_paths_and_mounts.html). -# Getting Started +## Getting Started + For instructions on developing and scoring your own algorithm in the benchmark see [Getting Started Document](./GETTING_STARTED.md). -## Running a workload + +### Running a workload + To run a submission directly by running a Docker container, see [Getting Started Document](./GETTING_STARTED.md#run-your-submission-in-a-docker-container). From your virtual environment or interactively running Docker container run: -**JAX** +#### JAX ```bash python3 submission_runner.py \ @@ -183,7 +223,7 @@ python3 submission_runner.py \ --tuning_search_space=baselines/adamw/tuning_search_space.json ``` -**Pytorch** +#### Pytorch ```bash python3 submission_runner.py \ @@ -194,6 +234,7 @@ python3 submission_runner.py \ --submission_path=baselines/adamw/jax/submission.py \ --tuning_search_space=baselines/adamw/tuning_search_space.json ``` +
Using Pytorch DDP (Recommended) @@ -207,12 +248,14 @@ torchrun --standalone --nnodes=1 --nproc_per_node=N_GPUS ``` where `N_GPUS` is the number of available GPUs on the node. To only see output from the first process, you can run the following to redirect the output from processes 1-7 to a log file: + ```bash torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 --standalone --nnodes=1 --nproc_per_node=8 ``` So the complete command is for example: -``` + +```bash torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 --standalone --nnodes=1 --nproc_per_node=8 \ submission_runner.py \ --framework=pytorch \ @@ -222,13 +265,15 @@ submission_runner.py \ --submission_path=baselines/adamw/jax/submission.py \ --tuning_search_space=baselines/adamw/tuning_search_space.json ``` +
+## Rules -# Rules The rules for the MLCommons Algorithmic Efficiency benchmark can be found in the separate [rules document](RULES.md). Suggestions, clarifications and questions can be raised via pull requests. -# Contributing +## Contributing + If you are interested in contributing to the work of the working group, feel free to [join the weekly meetings](https://mlcommons.org/en/groups/research-algorithms/) or open issues. See our [CONTRIBUTING.md](CONTRIBUTING.md) for MLCommons contributing guidelines and setup and workflow instructions. @@ -241,6 +286,11 @@ The JAX and PyTorch versions of the Criteo, FastMRI, Librispeech, OGBG, and WMT Since we use PyTorch's [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel) implementation, there is one Python process for each device. Depending on the hardware and the settings of the cluster, running a TensorFlow input pipeline in each Python process can lead to errors, since too many threads are created in each process. See [this PR thread](https://github.com/mlcommons/algorithmic-efficiency/pull/85) for more details. While this issue might not affect all setups, we currently implement a different strategy: we only run the TensorFlow input pipeline in one Python process (with `rank == 0`), and [broadcast](https://pytorch.org/docs/stable/distributed.html#torch.distributed.broadcast) the batches to all other devices. This introduces an additional communication overhead for each batch. See the [implementation for the WMT workload](https://github.com/mlcommons/algorithmic-efficiency/blob/main/algorithmic_efficiency/workloads/wmt/wmt_pytorch/workload.py#L215-L288) as an example. +## Pytorch Conformer CUDA OOM + +The Conformer PyTorch workload may run out of memory in its current state. Please set the `submission_runner.py` flag `reduce_pytorch_max_split_size` to `True` as a temporary workaround if you encounter this issue. This will set `max_split_size_mb:256`. Note that this will adversely impact the performance of the submission on this workload. See the [tracking issue](https://github.com/mlcommons/algorithmic-efficiency/issues/497). + + # FAQS ## Setup and Platform diff --git a/RULES.md b/RULES.md index 873cc1786..d74525244 100644 --- a/RULES.md +++ b/RULES.md @@ -1,10 +1,12 @@ # MLCommons™ AlgoPerf: Benchmark Rules -**Version:** 0.0.16 *(Last updated 28 April 2023)* +**Version:** 0.0.18 *(Last updated 03 October 2023)* > **TL;DR** New training algorithms and models can make neural net training faster. > We need a rigorous training time benchmark that measures time to result given a fixed hardware configuration and stimulates algorithmic progress. We propose a [Training Algorithm Track](#training-algorithm-track) and a [Model Track](#model-track) in order to help disentangle optimizer improvements and model architecture improvements. This two-track structure lets us enforce a requirement that new optimizers work well on multiple models and that new models aren't highly specific to particular training hacks.
+## Table of Contents + - [Introduction](#introduction) - [Training Algorithm Track](#training-algorithm-track) - [Submissions](#submissions) @@ -23,9 +25,6 @@ - [Defining target performance](#defining-target-performance) - [Benchmark score using performance profiles](#benchmark-score-using-performance-profiles) - [Benchmark Procedure](#benchmark-procedure) - - [Multiple Submission](#multiple-submission) - - [Licensing](#licensing) - - [Awards and prize money](#awards-and-prize-money) - [Model Track](#model-track) ## Introduction @@ -47,9 +46,11 @@ Submissions to the Training Algorithm Track can be entered under two separate ru The intention is that a training algorithm submission will be broadly applicable and useful without customization to the specific [workload](#workloads) (model, dataset, loss function). We want to discourage detecting the particular workload and doing something highly specific that isn't generally useful. In order to further discourage submissions that overfit to the particular [fixed benchmark workloads](#fixed-workloads), submissions will also be evaluated on [held-out workloads](#randomized-workloads) specified after the submission deadline. +For a description of how to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark, see the [Call for submissions](CALL_FOR_SUBMISSIONS.md), which details the entire competition process. + ### Submissions -A valid submission is a piece of code that defines all of the submission functions and is able to train all benchmark workloads on the [benchmarking hardware](#benchmarking-hardware) (defined in the [Scoring](#scoring) section). Both the validation set and the test set performance will be checked regularly during training (see the [Evaluation during training](#evaluation-during-training) section). Training halts when the workload-specific [target errors](#defining-target-performance) for the validation and test sets have been reached. For each workload, the training time to reach the *test* set target error is used as input to the [scoring process](#scoring) for the submission. Submissions using [external tuning](#external-tuning-ruleset) will be tuned independently for each workload using a single workload-agnostic search space for their specified hyperparameters. The tuning trials are selected based on the time to reach the *validation* target, but only their training times to reach the *test* target will be used for scoring. Submissions under either tuning ruleset may always self-tune while on the clock. +A valid submission is a piece of code that defines all of the submission functions and is able to train all benchmark workloads on the [benchmarking hardware](#benchmarking-hardware) (defined in the [Scoring](#scoring) section). Both the validation set and the test set performance will be checked regularly during training (see the [Evaluation during training](#evaluation-during-training) section); however, only the validation performance is relevant for scoring. Training halts when the workload-specific [target errors](#defining-target-performance) for the validation and test sets have been reached. For each workload, only the training time to reach the *validation* set target error is used as input to the [scoring process](#scoring) for the submission. Submissions using [external tuning](#external-tuning-ruleset) will be tuned independently for each workload using a single workload-agnostic search space for their specified hyperparameters.
The tuning trials are selected based on the time to reach the *validation* target. Submissions under either tuning ruleset may always self-tune while on the clock. #### Specification @@ -355,17 +356,17 @@ Tuning will be substantially different for the [external](#external-tuning-rules For each workload, the hyperparameters are tuned using $O=20$ tuning **trials**. To estimate the variance of the results, this tuning will be repeated for $S=5$ **studies**, for a total of $S\cdot O = 100$ different hyperparameter settings. The submitters will provide a workload-agnostic search space and the working group will then return $100$ hyperparameter settings obtained using [(quasi)random search](https://arxiv.org/abs/1706.03200). The working group will also randomly partition these $100$ trials into $5$ studies of $20$ trials each. In lieu of independent samples from a search space, submissions can instead supply a fixed list of $20$ hyperparameter points that will be sampled without replacement. -In each trial, the tuning trial with the fastest training time to achieve the *validation target* is determined among the $O=20$ hyperparameter settings. For scoring, however, we use the training time to reach the *test targets* of those $5$ selected runs. The median of these $5$ per-study training times will be the final training time for the submission on this workload and is used in the scoring procedure (see the "[Scoring submissions](#scoring)" section). In other words: We use the *validation performance* for tuning and selecting the best hyperparameter but use the *test performance* when measuring the training speed. Runs that do not reach the target performance of the evaluation metric have an infinite time. Submissions are always free to perform additional self-tuning while being timed. +In each study, the tuning trial with the fastest training time to achieve the *validation target* is determined among the $O=20$ hyperparameter settings. For scoring, we use the training time these $5$ selected runs required to reach the *validation target*. The median of these $5$ per-study training times will be the final training time for the submission on this workload and is used in the scoring procedure (see the "[Scoring submissions](#scoring)" section). Runs that do not reach the target performance of the evaluation metric have an infinite time. Submissions are always free to perform additional self-tuning while being timed. #### Self-tuning ruleset Submissions to this ruleset are not allowed to have user-defined hyperparameters. This ruleset allows both submissions that use the same hyperparameters for all workloads, including the randomized ones (e.g. Adam with default parameters), as well as submissions that perform inner-loop tuning during their training run (e.g. SGD with line searches). -Submissions will run on one instance of the [benchmarking hardware](#benchmarking-hardware). As always, submissions are allowed to perform inner-loop tuning (e.g. for their learning rate) but the tuning efforts will be part of their score. A submission will run *S=5* times and its score will be the median time to reach the target evaluation metric value on the test set. To account for the lack of external tuning, submissions have a longer time budget to reach the target performance. Compared to the [external tuning ruleset](#external-tuning-ruleset), the `max_runtime` is tripled. Runs that do not reach the target performance of the evaluation metric within this allotted time budget have an infinite time.
+Submissions will run on one instance of the [benchmarking hardware](#benchmarking-hardware). As always, submissions are allowed to perform inner-loop tuning (e.g. for their learning rate) but the tuning efforts will be part of their score. A submission will run *S=5* times and its score will be the median time to reach the target evaluation metric value on the validation set. To account for the lack of external tuning, submissions have a longer time budget to reach the target performance. Compared to the [external tuning ruleset](#external-tuning-ruleset), the `max_runtime` is tripled. Runs that do not reach the target performance of the evaluation metric within this allotted time budget have an infinite time. ### Workloads -For the purposes of the Training Algorithm Track, we consider a workload the combination of a `dataset`, `model`, `loss_fn`, along with a target that is defined over some evaluation metric. E.g., ResNet50 on ImageNet using the cross-entropy loss until a target error of 34.6% on the test set has been reached, would constitute a workload. The evaluation metric, in this example the misclassification error rate, is directly implied by the dataset/task. +For the purposes of the Training Algorithm Track, we consider a workload to be the combination of a `dataset`, `model`, and `loss_fn`, along with a target that is defined over some evaluation metric. E.g., ResNet50 on ImageNet using the cross-entropy loss until a target error of 22.6% on the validation set has been reached would constitute a workload. The evaluation metric, in this example the misclassification error rate, is directly implied by the dataset/task. Submissions will be scored based on their performance on the [fixed workload](#fixed-workloads). However, submissions must additionally perform reasonably well on a set of [held-out workloads](#randomized-workloads) in order for their score on the fixed workload to count (for full details see the [Scoring](#scoring) section). These held-out workloads will be generated after the submission deadline, but their randomized generating process is publicly available with the call for submissions (see "[Randomized workloads](#randomized-workloads)" section).
The remaining three workloads (*WMT*, *Criteo 1TB*, and *OGBG*) form the qualification set. There are no [randomized workloads](#randomized-workloads) in the qualification set. The qualification set of workloads aims to have a combined runtime of roughly 24 hours on the [benchmarking hardware](#benchmarking-hardware). @@ -408,9 +409,9 @@ For the [external tuning ruleset](#external-tuning-ruleset), we will only use $1 ### Scoring -Submissions will be scored based on their required training time to reach the target performance on the test set of each workload. This target performance metric can be the same as the loss function but might also be a different workload-specific metric such as the error rate or BLEU score. The target performance was defined using four standard training algorithms, see the "[Defining target performance](#defining-target-performance)" section for more details. The training time of a submission includes the compilation times for computation graphs and ops that could happen just-in-time during training; all our benchmarks should be fast enough to compile so as not to dramatically impact overall performance. The overall ranking is then determined by summarizing the performances across all [fixed workloads](#fixed-workloads), using [performance profiles](#benchmark-score-using-performance-profiles), as explained below. +Submissions will be scored based on their required training time to reach the target performance on the validation set of each workload. This target performance metric can be the same as the loss function but might also be a different workload-specific metric such as the error rate or BLEU score. The target performance was defined using four standard training algorithms, see the "[Defining target performance](#defining-target-performance)" section for more details. The training time of a submission includes the compilation times for computation graphs and ops that could happen just-in-time during training; all our benchmarks should be fast enough to compile so as not to dramatically impact overall performance. The overall ranking is then determined by summarizing the performances across all [fixed workloads](#fixed-workloads), using [performance profiles](#benchmark-score-using-performance-profiles), as explained below. -While the training time to the *test set* target is used for scoring, we use the training time to the *validation set* target for tuning. This is only relevant for submissions in the [external tuning ruleset](#external-tuning-ruleset) but is also enforced for self-reported results (i.e. submissions in the self-reported ruleset must also reach the validation target in time but only the time to the test target is used for scoring). Submitters must select the hyperparameter setting that reached the *validation* target the fastest, irrespective of its training time to achieve the *test* target. This ensures a fair and practical procedure. +The training time until the target performance on the test set was reached is not used in the scoring procedure but might be used for additional analysis of the competition results. #### Benchmarking hardware @@ -429,7 +430,7 @@ Both [tuning rulesets](#tuning) will use the same target performances. The runti We will aggregate the training times of a submission on all fixed workloads using [Performance Profiles](http://www.argmin.net/2018/03/26/performance-profiles/) (originally from [Dolan and Moré](https://arxiv.org/abs/cs/0102001)). 
Below we surface several relevant definitions from their work for easier readability, before explaining how we integrate the performance profiles to reach a scalar benchmark score that will be used for ranking submissions. -*Notation:* We have a set $\mathcal{S} = \{s_1, s_2, \dots, s_k\}$ of in total $k$ submissions that we evaluate on a set of $n$ fixed workloads: $\mathcal{W} = \{w_1, w_2, \dots, w_n\}$. For each submission $s$ and each workload $w$ we have a training time score $t_{s,w} \in [0,\infty)$. This is the time it took the submission to reach the test target performance on this particular workload. +*Notation:* We have a set $\mathcal{S} = \{s_1, s_2, \dots, s_k\}$ of in total $k$ submissions that we evaluate on a set of $n$ fixed workloads: $\mathcal{W} = \{w_1, w_2, \dots, w_n\}$. For each submission $s$ and each workload $w$ we have a training time score $t_{s,w} \in [0,\infty)$. This is the time it took the submission to reach the validation target performance on this particular workload. ##### Computing performance ratios @@ -465,10 +466,10 @@ The integral is normalized by the total integration area, with higher benchmark For the benchmark score, we compute and integrate the performance profiles using the training times of only the fixed workloads. But we use the submission's performance on the held-out workloads to penalize submissions. Specifically, if a submission is unable to train a held-out workload, we score the submission on the corresponding fixed workload as if that submission did not reach the target. In other words, for a submission to receive a finite training time on a fixed workload, it needs to: -- Reach the validation and test target on the fixed workload within the maximum runtime. -- Reach the validation and test target fixed workload within 4x of the fastest submission. -- Reach the validation and test target on the held-out workload (corresponding to the fixed workload) within the maximum runtime. -- Reach the validation and test target on the held-out workload (corresponding to the fixed workload) within 4x of the fastest submission. To determine the fastest submission on a held-out workload, we only consider submissions that reached the target on the corresponding fixed workload. This protects us against extremely fast submissions that only work on a specific held-out workload and are useless as general algorithms. +- Reach the validation target on the fixed workload within the maximum runtime. +- Reach the validation target on the fixed workload within 4x of the fastest submission. +- Reach the validation target on the held-out workload (corresponding to the fixed workload) within the maximum runtime. +- Reach the validation target on the held-out workload (corresponding to the fixed workload) within 4x of the fastest submission. To determine the fastest submission on a held-out workload, we only consider submissions that reached the target on the corresponding fixed workload. This protects us against extremely fast submissions that only work on a specific held-out workload and are useless as general algorithms. Only if all four requirements are met does the submission get a finite score. Otherwise, a submission will receive a training time of infinity. @@ -483,35 +484,7 @@ For a given workload $\bar{w}$, we define the "speedup of a submission $\bar{s}$ ### Benchmark Procedure -#### Multiple Submission - -Our benchmark allows multiple submissions by the same submitter.
However, we would like to prevent submitters from circumventing the purpose of the benchmark by, for example, submitting dozens of copies of the same submission with slightly different hyperparameters. Such a bulk submission would result in an unfair advantage on the randomized workloads and is not in the spirit of the benchmark. - -We encourage multiple submissions if they differ substantially. A spirit jury will be responsible for judging whether the submissions are substantially different. This jury will apply stricter scrutiny to submitters with a larger number of submissions. In this context, a submitter refers to an individual (not the general institution or research group they belong to). The total number of submissions by a submitter is the sum of submissions they contributed to. - -##### Requesting Additional Baselines - -Submitters can both contribute and request additional baseline algorithms. This includes existing algorithms with different search spaces or learning rate schedules. These baselines will not be eligible for winning the competition or prize money. - -#### Licensing - -Submitting to the benchmark requires the following legal considerations: - -- A signed [Contributor License Agreement (CLA) "Corporate CLA"](https://mlcommons.org/en/policies/) of MLCommons. -- *Either* membership in MLCommons *or* a signed [non-member test agreement](https://mlcommons.org/en/policies/). -- A signed trademark license agreement, either the member or the non-member version, as appropriate). These license agreements are available upon request to [support@mlcommons.org](mailto:support@mlcommons.org). - -We furthermore require all submissions to be made available open source after the submission deadline under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0). - -#### Awards and prize money - -An awards committee will award a prize for the "*Best Performance*" in each ruleset as well as a "*Jury Award*". The prize for the best-performing submission will take into account the [benchmark score](#benchmark-score-using-performance-profiles) on the full benchmark. The "*Jury Award*" will favor more out-of-the-box ideas that show great potential, even though the method may not be of practical value with the current landscape of models, software, etc. - -The prize money for "*Best Performance*" in a ruleset is $20,000 each. The winner of the "*Jury Award*" will be awarded $10,000. We reserve the right to split the prize money and distribute it among multiple submissions. - -The chairs of the MLCommons Algorithms Working Group (presently *George Dahl* and *Frank Schneider*) and their institutions (currently *Google Inc.* and the *University of Tübingen*) are ineligible to receive prize money. In addition, all individuals serving on the awards committee and their institutions are ineligible to win prize money. A submission with at least one ineligible submitter may still win an award, but the prize money will then be awarded to the top-ranked submission that is eligible for prize money. - -Submitters may self-report the results of their submissions as long as they follow the benchmark protocol (e.g. use the time to reach the validation target for tuning, use the hyperparameter samples provided by the working group, etc.). The working group will independently verify the self-reported submissions with the highest scores. Only verified results are eligible to win the benchmark and be awarded prize money. 
+For a description of how to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark, see the [Call for submissions](CALL_FOR_SUBMISSIONS.md), which details the entire competition process. ## Model Track diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md new file mode 100644 index 000000000..227d6128b --- /dev/null +++ b/SUBMISSION_PROCESS_RULES.md @@ -0,0 +1,171 @@ +# MLCommons™ AlgoPerf: Submission Process Rules + +**Version:** 0.0.3 *(Last updated 10 October 2023)* + +## Table of Contents + +- [Basics](#basics) +- [Schedule](#schedule) + - [Dates](#dates) + - [Version freeze](#version-freeze) + - [Submission deadline](#submission-deadline) +- [Submission](#submission) + - [Register an intent to submit](#register-an-intent-to-submit) + - [How to submit](#how-to-submit) + - [Submission repository](#submission-repository) + - [Licensing](#licensing) + - [Multiple Submission](#multiple-submission) +- [Scoring](#scoring) + - [Self-reporting scores](#self-reporting-scores) + - [Verifying scores](#verifying-scores) + - [Sampling held-out workloads and hyperparameters](#sampling-held-out-workloads-and-hyperparameters) + - [Leaderboard](#leaderboard) +- [Spirit jury \& challenging submissions](#sprit-jury--challenging-submissions) +- [Awards and prize money](#awards-and-prize-money) + - [Awards committee](#awards-committee) +- [Ineligibility and conflict of interest](#ineligibility-and-conflict-of-interest) + +## Basics + +This document contains the submission process rules for the AlgoPerf: Training Algorithms Benchmark. It describes the process of submitting a new training algorithm and details how it will be scored. This process applies to both the external tuning ruleset and the self-tuning ruleset, although, for all intents and purposes, they are two separate competitions with separate leaderboards. + +Three additional documents complement this document: + +- [**Benchmark rules**](RULES.md): While the submission process rules detail the *logistical* aspects of submitting to the AlgoPerf: Training Algorithms Benchmark, the [rules document](RULES.md) describes the *scientific* rules of the competition. This includes, for example, how tuning is performed in each ruleset, what types of submissions are allowed, or how the benchmark score is computed. +- [**AlgoPerf paper**](https://arxiv.org/abs/2306.07179): The paper titled ["Benchmarking Neural Network Training Algorithms"](https://arxiv.org/abs/2306.07179) motivates the need for the benchmark, explains the rules, and justifies the specific design choices of the AlgoPerf: Training Algorithms Benchmark. Additionally, it evaluates baseline submissions, constructed using various optimizers like Adam, Shampoo, or SAM, on the benchmark, demonstrating the feasibility but also the difficulty of the benchmark. +- [**Benchmark codebase**](https://github.com/mlcommons/algorithmic-efficiency): The codebase implements the rules, provides exact specifications of the workloads, and will ultimately be used to score submissions. + +## Schedule + +### Dates + +- **Publication of the call for submission: 17. October 2023 (08:00 AM UTC)** +- Registration deadline to express non-binding intent to submit: 15. December 2023 (08:00 AM UTC) +- Version freeze for the benchmark codebase: 17. January 2024 (08:00 AM UTC) +- **Submission deadline: 15. February 2024 (08:00 AM UTC)** +- Sampling the held-out workloads and hyperparameters: 16. February 2024 (08:00 AM UTC) +- Deadline for specifying the submission batch sizes for held-out workloads: 28.
+- Deadline for self-reporting results: 10 April 2024 (08:00 AM UTC)
+- **[extra tentative] Announcement of all results: 22 May 2024 (08:00 AM UTC)**
+
+The presented dates are subject to change, and adjustments may be made by the [MLCommons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/).
+
+### Version freeze
+
+The benchmark codebase is subject to change after the call for submissions is published. For example, if submitters encounter bugs or API limitations while interacting with the codebase, they have the option to issue a bug report. This might lead to modifications of the benchmark codebase even after the publication of the call for submissions.
+
+To ensure that all submitters can develop their submissions based on the same code that will be utilized for scoring, we will freeze the package versions of the codebase dependencies before the submission deadline. By doing so, we level the playing field for everyone involved, ensuring fairness and consistency in the assessment of submissions. We will also try to minimize changes to the benchmark codebase as much as possible.
+
+### Submission deadline
+
+By the submission deadline, all submissions need to be available as a *public* repository with the appropriate license (see the [Licensing section](#licensing)). No changes to the submission code are allowed after the submission deadline (with the notable exception of specifying the batch size for the - at that point unknown - held-out workloads). Once the submission deadline has passed, the working group will publish a list of all submitted algorithms, along with their associated repositories. Anyone has the right to challenge a submission, i.e. to request a review by the spirit jury to determine whether a submission violates the rules of the competition; see the [Spirit jury section](#spirit-jury--challenging-submissions).
+
+Directly after the submission deadline, all randomized aspects of the competition are fixed. This includes sampling the held-out workloads from the set of randomized workloads, as well as sampling the hyperparameters for each submission in the external tuning ruleset (for more details, see the [Sampling held-out workloads and hyperparameters section](#sampling-held-out-workloads-and-hyperparameters)). After that, submitters can ascertain the appropriate batch size of their submission on each held-out workload and self-report scores on either the qualification set or the full benchmarking set of workloads, including both fixed and held-out workloads (see the [Self-reporting scores section](#self-reporting-scores)).
+
+## Submission
+
+For a guide on the technical steps and details on how to write a submission, please refer to the [**Getting started document**](GETTING_STARTED.md). Additionally, the folders [/reference_algorithms](/reference_algorithms/) and [/baselines](/baselines/) provide example submissions that can serve as templates for creating new submissions.
+
+In the following, we describe the logistical steps required to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark.
+
+### Register an intent to submit
+
+All submitters need to register an intent to submit before the registration deadline. This registration is mandatory, i.e. required for all submissions, but not binding, i.e. you don't have to submit a registered submission. This registration is necessary to estimate the number of submissions and to provide support for potential submitters.
+
+To register an intent to submit, please fill out this [online form](https://forms.gle/iY1bUhwSjj1JZ4fa9) with the following information:
+
+- Name of the submission (e.g. the name of the algorithm, or any other arbitrary identifier).
+- Ruleset under which the submission will be scored.
+- Name, email, and affiliations of all submitters associated with this submission.
+- Interest in compute support.
+
+The submission will be issued a unique **submission ID** that will be used throughout the submission process.
+
+### How to submit
+
+Submitters have the flexibility to submit their training algorithm anytime between the registration of the submission and the submission deadline. To submit, please write an email to the [MLCommons Algorithms Working Group](mailto:algorithms@mlcommons.org) with the subject "[Submission] *submission_ID*" and the following information:
+
+- Submission ID.
+- URL of the associated *public* GitHub repository.
+- If applicable, a list of all changes to the names, emails, or affiliations compared to the registration of the submission.
+- A digital version of all relevant licensing documents (see the [Licensing section](#licensing)).
+
+#### Submission repository
+
+The *public* GitHub repository needs to be a clone of the frozen `main` branch of the [benchmark codebase](https://github.com/mlcommons/algorithmic-efficiency). All elements of the original codebase, except for the `/submission` directory, need to be unaltered from the original benchmark code. In particular, the repository must use the same [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0) as the benchmark codebase. Once the submission deadline has passed, modifications of the submission repository's code are generally prohibited. The sole exception to this rule is the definition of the batch sizes for the held-out workloads.
+
+Any software dependencies required by the submission need to be defined in a `requirements.txt` file within the `/submission` directory. This file needs to be `pip`-readable, i.e. installable via `pip install -r requirements.txt`. To comply with the rules, submissions are not allowed to modify the package versions of the benchmarking codebase's software dependencies, e.g. by using a different version of PyTorch or JAX (see the [disallowed submissions section](RULES.md#disallowed-submissions)).
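+
+As a purely illustrative sketch (the package name and version below are placeholders, not actual requirements of any submission), the dependency file and its installation could look like this:
+
+```bash
+# Illustrative only: create a pip-readable requirements.txt inside /submission.
+# The listed package is a placeholder; versions of the benchmark's own
+# dependencies (e.g. PyTorch, JAX) must not be changed.
+cat > submission/requirements.txt << 'EOF'
+some-extra-dependency==1.2.3
+EOF
+
+# The file must be installable via pip:
+pip install -r submission/requirements.txt
+```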
+
+#### Licensing
+
+Submitting to the AlgoPerf: Training Algorithms Benchmark requires the following legal considerations:
+
+- A signed [Contributor License Agreement (CLA) "Corporate CLA"](https://mlcommons.org/en/policies/) of MLCommons.
+- *Either* a membership in MLCommons *or* a signed [non-member test agreement](https://mlcommons.org/en/policies/).
+- A signed trademark license agreement, either the member or the non-member version, as appropriate. These license agreements are available upon request to [support@mlcommons.org](mailto:support@mlcommons.org).
+
+We furthermore require all submissions to be made available open source at the submission deadline under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0).
+
+### Multiple Submissions
+
+Our benchmark allows multiple submissions by the same submitter(s). However, we would like to prevent submitters from circumventing the purpose of the benchmark by, for example, submitting dozens of copies of the same submission with slightly different hyperparameters. Such a bulk submission would result in an unfair advantage on the randomized workloads and is not in the spirit of the benchmark.
+
+Submitters may submit algorithms marked as *baselines*. These might include existing algorithms with different search spaces or learning rate schedules. These baseline algorithms are not eligible for winning the competition or prize money, but they are also not required to be "substantially different" from other submissions by the same submitters.
+
+## Scoring
+
+### Self-reporting scores
+
+Submitters are expected to self-report scores on the full benchmark set before the deadline for self-reporting results. Reporting the scores involves providing all unmodified logs that the benchmarking codebase automatically generates in a separate `/results` directory within the `/submission` folder. For submissions competing in the external tuning ruleset, this includes all the logs of the tuning trials using the [hyperparameter samples provided by the working group](#sampling-held-out-workloads-and-hyperparameters). Note that while the tuning runs can be performed on non-competition hardware, they still need to show that the "winning hyperparameter configuration" in each study was selected according to the [tuning rules](/RULES.md#external-tuning-ruleset), i.e. the fastest hyperparameter configuration to reach the validation target. Additionally, the logs of the "winning hyperparameter configuration" (or each trial, in the self-tuning ruleset) in each of the five studies need to be computed on the competition hardware to allow wall-clock runtime comparisons.
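+
+As a rough illustration (all paths below are placeholders), the scoring utilities in `/scoring` read one `eval_measurements.csv` per trial and workload, so the unmodified experiment directory can simply be copied into the submission's `/results` directory:
+
+```bash
+# Illustrative sketch only; the experiment directory name is a placeholder.
+# The scoring utilities (see scoring/scoring_utils.py) expect the layout
+#   <experiment_dir>/<workload>/trial_<n>/eval_measurements.csv
+cp -r "$HOME/experiments/my_submission/." submission/results/
+```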
+
+Submitters unable to self-fund scoring costs can instead self-report only on the [qualification set of workloads](/RULES.md#qualification-set), which excludes some of the most expensive workloads. Based on this performance on the qualification set, the working group will provide - as funding allows - compute to evaluate and score the most promising submissions. Additionally, we encourage researchers to reach out to the [working group](mailto:algorithms@mlcommons.org) to find potential collaborators with the resources to run larger, more comprehensive experiments for both developing and scoring submissions.
+
+#### Verifying scores
+
+The working group will independently verify the scores of the highest-scoring submissions in each ruleset. Results that have been verified by the working group will be clearly marked on the leaderboard.
+
+### Sampling held-out workloads and hyperparameters
+
+After the submission deadline has passed and all submission code is frozen, the working group will sample a specific instance of held-out workloads from the set of randomized workloads. Additionally, every submission in the external tuning ruleset will receive its specific set of 5x20 hyperparameter values, grouped by study. This set of hyperparameter values is sampled from the search space provided by the submitters.
+
+The sampling code for the held-out workloads and the hyperparameters is publicly available (**TODO link to both functions!**). Both sampling functions take as input a random seed, which will be provided by a trusted third party after the submission deadline.
+
+### Leaderboard
+
+The announcement of the results will contain two separate leaderboards, one for the self-tuning and one for the external tuning ruleset. All valid submissions will be ranked by the benchmark score, taking into account all workloads, including the held-out ones. The leaderboard will clearly mark scores that were verified by the working group.
+
+## Spirit jury & challenging submissions
+
+The spirit jury, consisting of selected active members of the working group, will be responsible for deciding whether a submission violates the "spirit of the rules". Submitters with specific concerns about a particular submission can request a review by the spirit jury to determine whether that submission violates the rules of the competition. To challenge a submission, please write an email to the [MLCommons Algorithms Working Group](mailto:algorithms@mlcommons.org) with the subject "[Challenge] *submission_name*". The email needs to link to the challenged submission and include a detailed description of why the submission should be reviewed. This request must be made reasonably in advance of the results announcement deadline to allow the spirit jury sufficient time to conduct a thorough review.
+
+The spirit jury may then hear the justifications of the submitters, inspect the code, and also ask the submitters to explain how the submission was produced, for example, by disclosing their intermediate experiments. Examples of cases that might be reviewed by the spirit jury include multiple similar submissions by the same submitter or extensive workload-specific tuning.
+
+## Awards and prize money
+
+An awards committee will award a prize for the "*Best Performance*" in each ruleset as well as an "*Innovative Submission Award*". The prize for the best-performing submission will take into account the [benchmark score](RULES.md#benchmark-score-using-performance-profiles) on the full benchmark. The "*Innovative Submission Award*" will favor more out-of-the-box ideas that show great potential, even though the method may not be of practical value with the current landscape of models, software, etc.
+
+The prize money for "*Best Performance*" in a ruleset is $20,000 each. The winner of the "*Innovative Submission Award*" will be awarded $10,000. We reserve the right to split the prize money and distribute it among multiple submissions.
+
+If a submission is ineligible to win prize money, it can still win an award. The prize money will then go to the highest-ranking eligible submission.
+
+### Awards committee
+
+The awards committee will be responsible for awarding prize money to submissions. The committee will try to reach a consensus on how to award prize money and will settle disagreements by majority vote, if necessary.
+
+**TODO Who is on the Awards committee?**
+
+## Ineligibility and conflict of interest
+
+To ensure a fair process and avoid conflicts of interest, some individuals and institutions are ineligible to win prize money. This includes:
+
+- The chairs of the MLCommons Algorithms Working Group (presently *George Dahl* and *Frank Schneider*) and their associated institutions (currently *Google Inc.* and the *University of Tübingen*).
+- All individuals serving on the awards committee and their associated institutions.
+
+A submission with at least one participating ineligible entity may still win an award, but the prize money will then be given to the top-ranked submission that does not contain ineligible entities.
+
+Additionally, we require members of the spirit jury to abstain from being involved in a review if:
+
+- They are part of the reviewed submission.
+- The reviewed submission contains individuals from their institution.
+
+The spirit jury can still reach a decision as long as at least one member of the jury is free of conflicts of interest.
diff --git a/algorithmic_efficiency/workloads/fastmri/workload.py b/algorithmic_efficiency/workloads/fastmri/workload.py index 4677dc2bb..d1d07e70e 100644 --- a/algorithmic_efficiency/workloads/fastmri/workload.py +++ b/algorithmic_efficiency/workloads/fastmri/workload.py @@ -19,14 +19,14 @@ def has_reached_validation_target(self, eval_result: float) -> bool: @property def validation_target_value(self) -> float: - return 0.7344 + return 0.726999 def has_reached_test_target(self, eval_result: float) -> bool: return eval_result['test/ssim'] > self.test_target_value @property def test_target_value(self) -> float: - return 0.741652 + return 0.744254 @property def loss_type(self) -> spec.LossType: diff --git a/datasets/README.md b/datasets/README.md index 5ff0e18a7..586895022 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -28,7 +28,7 @@ make sure the data directory is mounted to a directory on your host with -v flag. If you are following instructions from the README you will have used the `-v $HOME/data:/data` flag in the `docker run` command. This will mount the `$HOME/data` directory to the `/data` directory in the container. -In this case set --data_dir to `\data`. +In this case set --data_dir to `/data`. ```bash DATA_DIR='/data' ``` diff --git a/reference_algorithms/target_setting_algorithms/ogbg/tuning_search_space.json b/reference_algorithms/target_setting_algorithms/ogbg/tuning_search_space.json index 0ca3b935d..0f365a183 100644 --- a/reference_algorithms/target_setting_algorithms/ogbg/tuning_search_space.json +++ b/reference_algorithms/target_setting_algorithms/ogbg/tuning_search_space.json @@ -9,11 +9,6 @@ 0.9449369031171744 ] }, - "beta2": { - "feasible_points": [ - 0.9978504782314613 - ] - }, "warmup_steps": { "feasible_points": [ 3000 diff --git a/scoring/score_submission.py b/scoring/score_submission.py index 42a605dac..e8a6ac010 100644 --- a/scoring/score_submission.py +++ b/scoring/score_submission.py @@ -5,8 +5,7 @@ from absl import logging import scoring_utils -from algorithmic_efficiency import workloads -import scoring +from scoring import scoring flags.DEFINE_string( 'experiment_path', diff --git a/scoring/scoring.py b/scoring/scoring.py index 12aae1357..dba254233 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -40,6 +40,12 @@ WORKLOADS = workloads_registry.WORKLOADS WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)' BASE_WORKLOADS_DIR = 'algorithmic_efficiency/workloads/' +# These global variables have to be set according to the current set of +# workloads and rules for the scoring to be correct. +# We do not use the workload registry since it contains test and development +# workloads as well. +NUM_WORKLOADS = 8 +NUM_TRIALS = 5 MIN_EVAL_METRICS = [ 'ce_loss', @@ -47,9 +53,10 @@ 'ctc_loss', 'wer', 'l1_loss', + 'loss', ] -MAX_EVAL_METRICS = ['average_precision', 'ssim', 'accuracy', 'bleu_score'] +MAX_EVAL_METRICS = ['mean_average_precision', 'ssim', 'accuracy', 'bleu'] def generate_eval_cols(metrics): @@ -128,14 +135,14 @@ def get_index_that_reaches_target(workload_df, op = operator.le if is_minimized else operator.ge validation_target_reached = validation_series.apply( lambda x: op(x, validation_target)) - - target_reached = pd.Series(validation_target_reached[0]) + target_reached = pd.Series(validation_target_reached) # Remove trials that never reach the target target_reached = target_reached[target_reached.apply(np.any)] - # If we have no trials that have reached the target, return -1. 
Else, return - # the eval index of the earliest point the target is reached. - if target_reached.empty: + # If less than 3 trials reach the target, the submission will be scored as + # missing the target on this workload; return -1. Else, return the eval index + # of the earliest point the target is reached. + if len(target_reached) < 3: return -1, -1 else: index_reached = target_reached.apply(np.argmax) @@ -287,7 +294,7 @@ def compute_performance_profiles(results, np.log10(min_tau), np.log10(max_tau), num=num_points, base=10.0) def rho(r, tau): - return (r <= tau).sum(axis=1) / len(r.columns) + return (r <= tau).sum(axis=1) / NUM_WORKLOADS perf_df = pd.concat([rho(df, tau) for tau in points], axis=1) diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index 37db73dd4..1a15db2f5 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -1,10 +1,14 @@ import json import os import re +import warnings from absl import logging import pandas as pd +from scoring.scoring import NUM_TRIALS +from scoring.scoring import NUM_WORKLOADS + TRIAL_LINE_REGEX = '(.*) --- Tuning run (\d+)/(\d+) ---' METRICS_LINE_REGEX = '(.*) Metrics: ({.*})' TRIAL_DIR_REGEX = 'trial_(\d+)' @@ -103,8 +107,7 @@ def get_trials_df_dict(logfile): """ trials_dict = get_trials_dict(logfile) trials_df_dict = {} - for trial in trials_dict.keys(): - metrics = trials_dict[trial] + for trial, metrics in trials_dict.items(): trials_df_dict[trial] = pd.DataFrame(metrics) return trials_df_dict @@ -156,6 +159,10 @@ def get_experiment_df(experiment_dir): """ df = pd.DataFrame() workload_dirs = os.listdir(experiment_dir) + num_workloads = len(workload_dirs) + if num_workloads != NUM_WORKLOADS: + warnings.warn(f'There should be {NUM_WORKLOADS} workloads but there are ' + f'{num_workloads}.') for workload in workload_dirs: data = { 'workload': workload, @@ -164,6 +171,7 @@ def get_experiment_df(experiment_dir): t for t in os.listdir(os.path.join(experiment_dir, workload)) if re.match(TRIAL_DIR_REGEX, t) ] + workload_df = pd.DataFrame() for trial in trial_dirs: eval_measurements_filepath = os.path.join( experiment_dir, @@ -173,7 +181,7 @@ def get_experiment_df(experiment_dir): ) try: trial_df = pd.read_csv(eval_measurements_filepath) - except FileNotFoundError as e: + except FileNotFoundError: logging.info(f'Could not read {eval_measurements_filepath}') continue data['trial'] = trial @@ -181,5 +189,10 @@ def get_experiment_df(experiment_dir): values = trial_df[column].to_numpy() data[column] = values trial_df = pd.DataFrame([data]) - df = pd.concat([df, trial_df], ignore_index=True) + workload_df = pd.concat([workload_df, trial_df], ignore_index=True) + num_trials = len(workload_df) + if num_trials != NUM_TRIALS: + warnings.warn(f'There should be {NUM_TRIALS} trials for workload ' + f'{workload} but there are only {num_trials}.') + df = pd.concat([df, workload_df], ignore_index=True) return df diff --git a/scoring/test_scoring_utils.py b/scoring/test_scoring_utils.py index b766a04d7..fbb21958c 100644 --- a/scoring/test_scoring_utils.py +++ b/scoring/test_scoring_utils.py @@ -1,8 +1,11 @@ from absl.testing import absltest -import scoring_utils -TEST_LOGFILE = 'test_data/adamw_fastmri_jax_04-18-2023-13-10-58.log' -TEST_DIR = 'test_data/experiment_dir' +from scoring import scoring_utils +from scoring.scoring import NUM_TRIALS +from scoring.scoring import NUM_WORKLOADS + +TEST_LOGFILE = 'scoring/test_data/adamw_fastmri_jax_04-18-2023-13-10-58.log' +TEST_DIR = 'scoring/test_data/experiment_dir' NUM_EVALS = 18 @@ -14,8 
+17,7 @@ def test_get_trials_dict(self): def test_get_trials_df_dict(self): trials_dict = scoring_utils.get_trials_df_dict(TEST_LOGFILE) - for trial in trials_dict: - df = trials_dict[trial] + for df in trials_dict.values(): self.assertEqual(len(df.index), NUM_EVALS) def test_get_trials_df(self): @@ -24,7 +26,18 @@ def test_get_trials_df(self): self.assertEqual(len(df.at['1', column]), NUM_EVALS) def test_get_experiment_df(self): - df = scoring_utils.get_experiment_df(TEST_DIR) + _ = scoring_utils.get_experiment_df(TEST_DIR) + self.assertWarnsRegex( + Warning, + f'There should be {NUM_WORKLOADS} workloads but there are 1.', + scoring_utils.get_experiment_df, + TEST_DIR) + self.assertWarnsRegex( + Warning, + f'There should be {NUM_TRIALS} trials for workload mnist_jax but there ' + 'are only 1.', + scoring_utils.get_experiment_df, + TEST_DIR) if __name__ == '__main__': diff --git a/submission_runner.py b/submission_runner.py index 656599a42..d92732145 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -149,6 +149,9 @@ None, 'Value of rng seed. If None, a random seed will' 'be generated from hardware.') +flags.DEFINE_boolean('set_pytorch_max_split_size', + False, + 'If true, set pytorch max_split_size_mb to 256') FLAGS = flags.FLAGS USE_PYTORCH_DDP, RANK, DEVICE, N_GPUS = pytorch_setup() @@ -602,6 +605,9 @@ def main(_): if FLAGS.workload == 'librispeech_conformer': os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '0.85' + if FLAGS.set_pytorch_max_split_size: + os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256' + # Extend path according to framework. workload_metadata['workload_path'] = os.path.join( BASE_WORKLOADS_DIR,