From 6f60b7ce60a84b6fae57c056e25e5bfa4a376073 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Sun, 31 Mar 2024 04:20:45 +0000 Subject: [PATCH 1/3] add resnet_variant tests w nadamw --- .../workflows/regression_tests_variants.yml | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/.github/workflows/regression_tests_variants.yml b/.github/workflows/regression_tests_variants.yml index ef1585d0d..f1482042e 100644 --- a/.github/workflows/regression_tests_variants.yml +++ b/.github/workflows/regression_tests_variants.yml @@ -72,7 +72,7 @@ jobs: run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb_resnet -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false - criteo_resnet_pytorch: + criteo_resnet_embed_init_pytorch: runs-on: self-hosted needs: build_and_push_pytorch_docker_image steps: @@ -80,6 +80,22 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb_embed_init -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false - - + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb_embed_init -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + imagenet_resnet_silu_pytorch: + runs-on: self-hosted + needs: build_and_push_pytorch_docker_image + steps: + - uses: actions/checkout@v2 + - name: Run containerized workload + run: | + docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d imagenet -f pytorch -s reference_algorithms/paper_baselines/nadamw/pytorch/submission.py -w imagenet_resnet_silu -t reference_algorithms/paper_baselines/nadamw/tuning_search_space.json -e tests/regression_tests/nadamw -m 10 -c False -o True -r false + imagenet_resnet_gelu_pytorch: + runs-on: self-hosted + needs: build_and_push_pytorch_docker_image + steps: + - uses: actions/checkout@v2 + - name: Run containerized workload + run: | + docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d imagenet -f pytorch -s reference_algorithms/paper_baselines/nadamw/pytorch/submission.py -w imagenet_resnet_gelu -t reference_algorithms/paper_baselines/nadamw/tuning_search_space.json -e tests/regression_tests/nadamw -m 10 -c False -o True -r false \ No newline at end of file From c433aeda761581790bc47c56b65a7b9bfcebc2f3 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Sun, 31 Mar 2024 04:40:30 +0000 Subject: [PATCH 2/3] temporarily disable other regression tests --- .github/workflows/regression_tests_variants.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/regression_tests_variants.yml b/.github/workflows/regression_tests_variants.yml index f1482042e..b32a91e86 100644 --- a/.github/workflows/regression_tests_variants.yml +++ b/.github/workflows/regression_tests_variants.yml @@ -3,7 +3,7 @@ name: Containerized Regression Tests for Workload Variants on: pull_request: branches: - - 'main' + - 'dev' # temp disable jobs: build_and_push_jax_docker_image: From d49247a8374bcc0cbc38733d8b970f27b65e96d1 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Mon, 1 Apr 2024 19:18:16 +0000 Subject: [PATCH 3/3] enable variant regression tests --- .github/workflows/regression_tests.yml | 2 +- .github/workflows/regression_tests_variants.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/regression_tests.yml b/.github/workflows/regression_tests.yml index cb8595f58..d7b831dbd 100644 --- a/.github/workflows/regression_tests.yml +++ b/.github/workflows/regression_tests.yml @@ -3,7 +3,7 @@ name: Containerized Regression Tests on: pull_request: branches: - - 'main' + - 'dev' # temp disable jobs: build_and_push_jax_docker_image: diff --git a/.github/workflows/regression_tests_variants.yml b/.github/workflows/regression_tests_variants.yml index b32a91e86..7ee3216ff 100644 --- a/.github/workflows/regression_tests_variants.yml +++ b/.github/workflows/regression_tests_variants.yml @@ -3,7 +3,7 @@ name: Containerized Regression Tests for Workload Variants on: pull_request: branches: - - 'dev' # temp disable + - 'main' jobs: build_and_push_jax_docker_image: