Merge pull request #843 from mlcommons/dev

Dev -> main
mlcommons · Feb 11, 2025 · 9653f18 · 9653f18
2 parents bf61255 + 5c4c07d
commit 9653f18
Show file tree

Hide file tree

Showing 223 changed files with 2,108 additions and 1,300 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -7,10 +7,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: 3.11.10
         cache: 'pip' # Cache pip dependencies\.
         cache-dependency-path: '**/setup.py'
     - name: Install Modules and Run
@@ -25,10 +25,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: 3.11.10
         cache: 'pip' # Cache pip dependencies\.
         cache-dependency-path: '**/setup.py'
     - name: Install Modules and Run
@@ -42,10 +42,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: 3.11.10
         cache: 'pip' # Cache pip dependencies\.
         cache-dependency-path: '**/setup.py'
     - name: Install Modules and Run
@@ -59,10 +59,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: 3.11.10
         cache: 'pip' # Cache pip dependencies\.
         cache-dependency-path: '**/setup.py'
     - name: Install Modules and Run
@@ -77,10 +77,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: 3.11.10
         cache: 'pip' # Cache pip dependencies\.
         cache-dependency-path: '**/setup.py'
     - name: Install Modules and Run
@@ -96,10 +96,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: 3.11.10
         cache: 'pip' # Cache pip dependencies\.
         cache-dependency-path: '**/setup.py'
     - name: Install Modules and Run
@@ -113,10 +113,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: 3.11.10
         cache: 'pip' # Cache pip dependencies\.
         cache-dependency-path: '**/setup.py'
     - name: Install Modules and Run
@@ -130,10 +130,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: 3.11.10
         cache: 'pip' # Cache pip dependencies\.
         cache-dependency-path: '**/setup.py'
     - name: Install Modules and Run
@@ -148,10 +148,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: 3.11.10
         cache: 'pip' # Cache pip dependencies\.
         cache-dependency-path: '**/setup.py'
     - name: Install Modules and Run
@@ -166,10 +166,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: 3.11.10
         cache: 'pip' # Cache pip dependencies\.
         cache-dependency-path: '**/setup.py'
     - name: Install Modules and Run
@@ -184,10 +184,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: 3.11.10
         cache: 'pip' # Cache pip dependencies\.
         cache-dependency-path: '**/setup.py'
     - name: Install pytest
@@ -199,7 +199,7 @@ jobs:
         pip install .[pytorch_cpu]
     - name: Run pytest tests
       run: |
-        pytest -vx tests/version_test.py
+        pytest -vx tests/test_version.py
         pytest -vx tests/test_num_params.py
         pytest -vx tests/test_param_shapes.py
         pytest -vx tests/test_param_types.py
@@ -208,10 +208,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: 3.11.10
         cache: 'pip' # Cache pip dependencies\.
         cache-dependency-path: '**/setup.py'
     - name: Install pytest

diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -7,17 +7,17 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v2
       with:
-        python-version: 3.9
+        python-version: 3.11.10
     - name: Install pylint
       run: |
         python -m pip install --upgrade pip
         pip install pylint==2.16.1
     - name: Run pylint
       run: |
-        pylint algorithmic_efficiency
+        pylint algoperf
         pylint reference_algorithms
         pylint prize_qualification_baselines
         pylint submission_runner.py
@@ -27,14 +27,14 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v2
       with:
-        python-version: 3.9
+        python-version: 3.11.10
     - name: Install isort
       run: |
         python -m pip install --upgrade pip
-        pip install isort
+        pip install isort==5.12.0
     - name: Run isort
       run: |
         isort . --check --diff
@@ -43,14 +43,14 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11.10
       uses: actions/setup-python@v2
       with:
-        python-version: 3.9
+        python-version: 3.11.10
     - name: Install yapf
       run: |
         python -m pip install --upgrade pip
-        pip install yapf==0.32
+        pip install yapf==0.32 toml
     - name: Run yapf
       run: |
         yapf . --diff --recursive
diff --git a/.github/workflows/regression_tests_variants.yml b/.github/workflows/regression_tests_variants.yml
@@ -72,7 +72,7 @@ jobs:
       run: |
         docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }}
         docker run  -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb_resnet -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
-  criteo_resnet_pytorch:
+  criteo_embed_init_pytorch:
     runs-on: self-hosted
     needs: build_and_push_pytorch_docker_image
     steps:

diff --git a/.gitignore b/.gitignore
@@ -12,8 +12,8 @@ makefile
 *.swp
 */data/
 *events.out.tfevents*
-algorithmic_efficiency/workloads/librispeech_conformer/data_dir
-algorithmic_efficiency/workloads/librispeech_conformer/work_dir
+algoperf/workloads/librispeech_conformer/data_dir
+algoperf/workloads/librispeech_conformer/work_dir
 *.flac
 *.npy
 *.csv
@@ -23,4 +23,6 @@ wandb/
 scoring/plots/
 
 !scoring/test_data/experiment_dir/study_0/mnist_jax/trial_0/eval_measurements.csv
-!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv
+!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv
+
+algoperf/_version.py
diff --git a/README.md b/README.md
@@ -6,12 +6,12 @@
 </p>
 
 <p align="center">
-  <a href="https://arxiv.org/abs/2306.07179" target="_blank">Paper (arXiv)</a> •
-  <a href="/CALL_FOR_SUBMISSIONS.md">Call for Submissions</a> •
-  <a href="/GETTING_STARTED.md">Getting Started</a> •
-  <a href="/COMPETITION_RULES.md">Competition Rules</a> •
-  <a href="/DOCUMENTATION.md">Documentation</a> •
-  <a href="/CONTRIBUTING.md">Contributing</a>
+  <a href="https://github.com/mlcommons/submissions_algorithms">Leaderboard</a> •
+  <a href="/docs/GETTING_STARTED.md">Getting Started</a> •
+  <a href="https://github.com/mlcommons/submissions_algorithms">Submit</a> •
+  <a href="/docs/DOCUMENTATION.md">Documentation</a> •
+  <a href="/docs/CONTRIBUTING.md">Contributing</a> •
+  <a href="https://arxiv.org/abs/2306.07179" target="_blank">Benchmark</a>/<a href="https://openreview.net/forum?id=CtM5xjRSfm" target="_blank">Results</a> Paper
 </p>
 
 [![CI](https://github.com/mlcommons/algorithmic-efficiency/actions/workflows/CI.yml/badge.svg)](https://github.com/mlcommons/algorithmic-efficiency/actions/workflows/CI.yml)
@@ -22,19 +22,21 @@
 
 ---
 
-> *AlgoPerf* is a suite of benchmarks and competitions to measure neural network training speedups due to algorithmic improvements in both training algorithms and models. This is the repository for the *AlgoPerf: Training Algorithms benchmark* and its associated competition. It is developed by the [MLCommons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/). This repository holds the [**competition rules**](/COMPETITION_RULES.md), the [**technical documentation**](/DOCUMENTATION.md) of the benchmark, [**getting started guides**](/GETTING_STARTED.md), and the benchmark code. For a detailed description of the benchmark design, see our [**paper**](https://arxiv.org/abs/2306.07179).
-
+> This is the repository for the *AlgoPerf: Training Algorithms benchmark* measuring neural network training speedups due to algorithmic improvements.
+> It is developed by the [MLCommons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/).
+> This repository holds the benchmark code, the benchmark's [**technical documentation**](/docs/DOCUMENTATION.md) and [**getting started guides**](/docs/GETTING_STARTED.md). For a detailed description of the benchmark design, see our [**introductory paper**](https://arxiv.org/abs/2306.07179); for the results of the inaugural competition, see our [**results paper**](https://openreview.net/forum?id=CtM5xjRSfm).
+>
+> **See our [AlgoPerf Leaderboard](https://github.com/mlcommons/submissions_algorithms) for the latest results of the benchmark and to submit your algorithm.**
 ---
 
 > [!IMPORTANT]
-> The results of the inaugural AlgoPerf: Training Algorithms benchmark competition have been announced. See the [MLCommons blog post](https://mlcommons.org/2024/08/mlc-algoperf-benchmark-competition/) for an overview and the [results page](https://mlcommons.org/benchmarks/algorithms/) for more details on the results. We are currently preparing an in-depth analysis of the results in the form of a paper and plan the next iteration of the benchmark competition.
+> For future iterations of the AlgoPerf: Training Algorithms benchmark competition, we are switching to a rolling leaderboard, making a few changes to the competition rules, and running all selected submissions on our hardware. **To submit your algorithm to the next iteration of the benchmark, please see our [How to Submit](#how-to-submit) section and the [submission repository](https://github.com/mlcommons/submissions_algorithms) which hosts the up-to-date AlgoPerf leaderboard.**
 
 ## Table of Contents <!-- omit from toc -->
 
 - [Installation](#installation)
 - [Getting Started](#getting-started)
-- [Call for Submissions](#call-for-submissions)
-  - [Competition Rules](#competition-rules)
+- [How to Submit](#how-to-submit)
   - [Technical Documentation of the Benchmark \& FAQs](#technical-documentation-of-the-benchmark--faqs)
 - [Contributing](#contributing)
 - [License](#license)
@@ -45,9 +47,9 @@
 > [!TIP]
 > **If you have any questions about the benchmark competition or you run into any issues, please feel free to contact us.** Either [file an issue](https://github.com/mlcommons/algorithmic-efficiency/issues), ask a question on [our Discord](https://discord.gg/5FPXK7SMt6) or [join our weekly meetings](https://mlcommons.org/en/groups/research-algorithms/).
 
-You can install this package and dependencies in a [Python virtual environment](/GETTING_STARTED.md#python-virtual-environment) or use a [Docker/Singularity/Apptainer container](/GETTING_STARTED.md#docker) (recommended).
+You can install this package and dependencies in a [Python virtual environment](/docs/GETTING_STARTED.md#python-virtual-environment) or use a [Docker/Singularity/Apptainer container](/docs/GETTING_STARTED.md#docker) (recommended).
 We recommend using a Docker container (or alternatively, a Singularity/Apptainer container) to ensure a similar environment to our scoring and testing environments.
-Both options are described in detail in the [**Getting Started**](/GETTING_STARTED.md) document.
+Both options are described in detail in the [**Getting Started**](/docs/GETTING_STARTED.md) document.
 
 *TL;DR to install the Jax version for GPU run:*
 
@@ -67,7 +69,7 @@ pip3 install -e '.[full]'
 
 ## Getting Started
 
-For detailed instructions on developing and scoring your own algorithm in the benchmark see the [Getting Started](/GETTING_STARTED.md) document.
+For detailed instructions on developing your own algorithm in the benchmark see the [Getting Started](/docs/GETTING_STARTED.md) document.
 
 *TL;DR running a JAX workload:*
 
@@ -93,23 +95,19 @@ python3 submission_runner.py \
     --tuning_search_space=reference_algorithms/paper_baselines/adamw/tuning_search_space.json
 ```
 
-## Call for Submissions
-
-The [Call for Submissions](/CALL_FOR_SUBMISSIONS.md) announces the first iteration of the AlgoPerf: Training Algorithms competition based on the benchmark by the same name. This document also contains the schedule and key dates for the competition.
-
-### Competition Rules
+## How to Submit
 
-The competition rules for the *AlgoPerf: Training Algorithms* competition can be found in the separate [**Competition Rules**](/COMPETITION_RULES.md) document.
+Once you have developed your training algorithm, you can submit it to the benchmark by creating a pull request to the [submission repository](https://github.com/mlcommons/submissions_algorithms), which hosts the AlgoPerf leaderboard. The AlgoPerf working group will review your PR. Based on our available resources and the perceived potential of the method, it may be selected for a free evaluation. If selected, we will run your algorithm on our hardware and update the leaderboard with the results.
 
 ### Technical Documentation of the Benchmark & FAQs
 
-We provide additional technical documentation of the benchmark and answer frequently asked questions in a separate [**Documentation**](/DOCUMENTATION.md) page. Suggestions, clarifications and questions can be raised via pull requests, creating an issue, or by sending an email to the [working group](mailto:[email protected]).
+We provide technical documentation of the benchmark and answer frequently asked questions in a separate [**Documentation**](/docs/DOCUMENTATION.md) page. This includes which types of submissions are allowed. Please ensure that your submission is compliant with these rules before submitting. Suggestions, clarifications and questions can be raised via pull requests, creating an issue, or by sending an email to the [working group](mailto:[email protected]).
 
 ## Contributing
 
 We invite everyone to look through our rules, documentation, and codebase and submit issues and pull requests, e.g. for rules changes, clarifications, or any bugs you might encounter. If you are interested in contributing to the work of the working group and influence the benchmark's design decisions, please [join the weekly meetings](https://mlcommons.org/en/groups/research-algorithms/) and consider becoming a member of the working group.
 
-Our [**Contributing**](/CONTRIBUTING.md) document provides further MLCommons contributing guidelines and additional setup and workflow instructions.
+Our [**Contributing**](/docs/CONTRIBUTING.md) document provides further MLCommons contributing guidelines and additional setup and workflow instructions.
 
 ## License
 
@@ -134,3 +132,19 @@ If you are using the *AlgoPerf benchmark*, its codebase, baselines, or workloads
   eprint        = {2306.07179},
 }
 ```
+
+If you use the results from the first *AlgoPerf competition*, please consider citing the results paper, as well as the relevant submissions:
+
+> [Kasimbeg, Schneider, Eschenhagen, et al.<br/>
+> **Accelerating neural network training: An analysis of the AlgoPerf competition**<br/>
+> ICLR 2025](https://openreview.net/forum?id=CtM5xjRSfm)
+
+```bibtex
+@inproceedings{Kasimbeg2025AlgoPerfResults,
+title           = {Accelerating neural network training: An analysis of the {AlgoPerf} competition},
+author          = {Kasimbeg, Priya and Schneider, Frank and Eschenhagen, Runa and Bae, Juhan and Sastry, Chandramouli Shama and Saroufim, Mark and Boyuan, Feng and Wright, Less and Yang, Edward Z. and Nado, Zachary and Medapati, Sourabh and Hennig, Philipp and Rabbat, Michael and Dahl, George E.},
+booktitle       = {The Thirteenth International Conference on Learning Representations},
+year            = {2025},
+url             = {https://openreview.net/forum?id=CtM5xjRSfm}
+}
+```
diff --git a/algoperf/__init__.py b/algoperf/__init__.py
@@ -0,0 +1,5 @@
+"""Algorithmic Efficiency."""
+
+from ._version import version as __version__
+
+__all__ = ["__version__"]
diff --git a/algorithmic_efficiency/checkpoint_utils.py → algoperf/checkpoint_utils.py b/algorithmic_efficiency/checkpoint_utils.py → algoperf/checkpoint_utils.py
@@ -16,8 +16,8 @@
 from tensorflow.io import gfile  # pytype: disable=import-error
 import torch
 
-from algorithmic_efficiency import spec
-from algorithmic_efficiency.pytorch_utils import pytorch_setup
+from algoperf import spec
+from algoperf.pytorch_utils import pytorch_setup
 
 _, _, DEVICE, _ = pytorch_setup()
 CheckpointReturn = Tuple[spec.OptimizerState,
@@ -231,7 +231,7 @@ def save_checkpoint(framework: str,
         target=checkpoint_state,
         step=global_step,
         overwrite=True,
-        keep=np.Inf if save_intermediate_checkpoints else 1)
+        keep=np.inf if save_intermediate_checkpoints else 1)
   else:
     if not save_intermediate_checkpoints:
       checkpoint_files = gfile.glob(