From c71a250ebb75e64f92d3b0c0f4ea63f1abf03161 Mon Sep 17 00:00:00 2001 From: Ean Garvey <87458719+monorimet@users.noreply.github.com> Date: Tue, 7 Jan 2025 11:25:43 -0600 Subject: [PATCH 01/35] (shortfin-sd) Interleave workers and their fibers by device. (#587) This enables proper "filling" of multi-device topologies that populate each device evenly, rather than block allocating work to one device at a time. Co-authored-by: Ean Garvey --- shortfin/python/shortfin_apps/sd/components/service.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/shortfin/python/shortfin_apps/sd/components/service.py b/shortfin/python/shortfin_apps/sd/components/service.py index 9b09632a6..814916bee 100644 --- a/shortfin/python/shortfin_apps/sd/components/service.py +++ b/shortfin/python/shortfin_apps/sd/components/service.py @@ -76,8 +76,10 @@ def __init__( self.workers = [] self.fibers = [] self.idle_fibers = set() - for idx, device in enumerate(self.sysman.ls.devices): - for i in range(self.workers_per_device): + # For each worker index we create one on each device, and add their fibers to the idle set. + # This roughly ensures that the first picked fibers are distributed across available devices. + for i in range(self.workers_per_device): + for idx, device in enumerate(self.sysman.ls.devices): worker = sysman.ls.create_worker(f"{name}-inference-{device.name}-{i}") self.workers.append(worker) for idx, device in enumerate(self.sysman.ls.devices): From e2cbcb4cb220299d4e49a923b352465d3f4e6416 Mon Sep 17 00:00:00 2001 From: Vinayak Dev <104419489+vinayakdsci@users.noreply.github.com> Date: Wed, 8 Jan 2025 00:00:23 +0530 Subject: [PATCH 02/35] Add user instructions for converting safetensors to gguf (#772) Adds a note to [llama_serving.md](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/llama_serving.md) that instructs the user how to convert a collection of `.safetensor` weight files to a single `.gguf` file that can be used in the instructions that follow. --- docs/shortfin/llm/user/llama_serving.md | 10 ++++++++++ docs/user_guide.md | 6 ++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/docs/shortfin/llm/user/llama_serving.md b/docs/shortfin/llm/user/llama_serving.md index cc2c959b4..6abe399db 100644 --- a/docs/shortfin/llm/user/llama_serving.md +++ b/docs/shortfin/llm/user/llama_serving.md @@ -87,6 +87,16 @@ LLama3.1 8b f16 model. python -m sharktank.utils.hf_datasets llama3_8B_fp16 --local-dir $EXPORT_DIR ``` +> [!NOTE] +> If you have the model weights as a collection of `.safetensors` files (downloaded from HuggingFace Model Hub, for example), you can use the `convert_hf_to_gguf.py` script from the [llama.cpp repository](https://github.com/ggerganov/llama.cpp) to convert them to a single `.gguf` file. +> ```bash +> export WEIGHTS_DIR=/path/to/safetensors/weights_directory/ +> git clone --depth 1 https://github.com/ggerganov/llama.cpp.git +> cd llama.cpp +> python3 convert_hf_to_gguf.py $WEIGHTS_DIR --outtype f16 --outfile $EXPORT_DIR/.gguf +> ``` +> Now this GGUF file can be used in the instructions ahead. 
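The `.safetensors` checkpoint referenced in the note above is normally fetched with HuggingFace's `huggingface-cli`, which the user guide touched later in this series also points to. A minimal sketch of the combined download-and-convert flow, assuming a recent `huggingface_hub` is installed; the repository name and output filename below are illustrative placeholders, not part of the patch:

```bash
# Download the safetensors weights (repository name is a placeholder).
export WEIGHTS_DIR=/path/to/safetensors/weights_directory/
huggingface-cli download meta-llama/Llama-3.1-8B-Instruct --local-dir "$WEIGHTS_DIR"

# Convert them to a single f16 GGUF file with llama.cpp's converter, as in the note above
# (the output filename is a placeholder).
git clone --depth 1 https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
python3 convert_hf_to_gguf.py "$WEIGHTS_DIR" --outtype f16 --outfile "$EXPORT_DIR/model.gguf"
```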
+ ### Define environment variables We'll first define some environment variables that are shared between the diff --git a/docs/user_guide.md b/docs/user_guide.md index a53e5af01..5197d04ea 100644 --- a/docs/user_guide.md +++ b/docs/user_guide.md @@ -78,8 +78,10 @@ To get started with SDXL, please follow the [SDXL User Guide](../shortfin/python ### Llama 3.1 -To get started with Llama 3.1, please follow the [Llama User Guide](shortfin/llm/user/llama_serving.md). +To get started with Llama 3.1, please follow the [Llama User Guide][1]. * Once you've set up the Llama server in the guide above, we recommend that you use [SGLang Frontend](https://sgl-project.github.io/frontend/frontend.html) by following the [Using `shortfin` with `sglang` guide](shortfin/llm/user/shortfin_with_sglang_frontend_language.md) * If you would like to deploy LLama on a Kubernetes cluster we also provide a simple set of instructions and deployment configuration to do so [here](shortfin/llm/user/llama_serving_on_kubernetes.md). -* Finally, if you'd like to leverage the instructions above to run against a different variant of Llama 3.1, it's supported. However, you will need to generate a gguf dataset for that variant. In order to do this leverage the [HuggingFace](https://huggingface.co/)'s [`huggingface-cli`](https://huggingface.co/docs/huggingface_hub/en/guides/cli) in combination with [llama.cpp](https://github.com/ggerganov/llama.cpp)'s convert_hf_to_gguf.py. In future releases, we plan to streamline these instructions to make it easier for users to compile their own models from HuggingFace. +* Finally, if you'd like to leverage the instructions above to run against a different variant of Llama 3.1, it's supported. However, you will need to generate a gguf dataset for that variant (explained in the [user guide][1]). In future releases, we plan to streamline these instructions to make it easier for users to compile their own models from HuggingFace. + +[1]: shortfin/llm/user/llama_serving.md From 3bf4faf6ad53ae838492822b578ca09d5b8f581a Mon Sep 17 00:00:00 2001 From: Max191 <44243577+Max191@users.noreply.github.com> Date: Tue, 7 Jan 2025 15:20:59 -0500 Subject: [PATCH 03/35] [tuner] Remove legacy tuner path (#756) This PR removes the old code paths for the tuner following the large refactoring done in https://github.com/nod-ai/shark-ai/pull/606 and https://github.com/nod-ai/shark-ai/pull/704. - The old compilation and benchmarking logic for models and dispatches are now gone, and the unified `benchmark` and `compile` functions should be used instead. - Much of the dispatch parsing logic is removed, and dispatch parsing is now done by 2 DispatchParser implementations for contraction ops and convolution ops. - The example tuning clients in `tuner/examples` for dispatches and punet are removed, since they use the old path. The new example to follow is in `tuner/examples/test`. - The candidate generation `tune` function is removed, and the `generate_configs_and_td_specs` should be used to generate candidates instead. - Many utility functions, structs, and struct fields are now removed, since they are no longer used. - All tests testing the old path are removed. 
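The bullets above describe the replacement workflow; the sketch below shows how it is now driven from the command line, mirroring the usage string added to `candidate_gen.py` and the example client under `tuner/examples/test` in this patch. The candidate counts and HIP target shown are illustrative placeholders.

```bash
# Generate candidate configurations as transform-dialect specs
# (replaces the removed `tune()` entry point).
python -m tuner.candidate_gen mmt_benchmark.mlir -o spec_dir -l 1024

# Run the example tuning client, which uses the unified compile/benchmark flow
# (candidate counts and target are placeholders).
python -m examples.test double_mmt.mlir mmt_benchmark.mlir \
    --test_num_dispatch_candidates=5 \
    --test_num_model_candidates=3 \
    --test_hip_target=gfx942 \
    --num-candidates=30
```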
--------- Signed-off-by: Max Dawkins --- tuner/examples/dispatch/.gitignore | 3 - tuner/examples/dispatch/README.md | 35 - tuner/examples/dispatch/__init__.py | 5 - tuner/examples/dispatch/__main__.py | 9 - tuner/examples/dispatch/compile_dispatch.sh | 18 - tuner/examples/dispatch/config_epilog.mlir | 12 - tuner/examples/dispatch/config_prolog.mlir | 32 - tuner/examples/dispatch/dispatch_tuner.py | 147 --- tuner/examples/dispatch/mmt.mlir | 11 - tuner/examples/punet/.gitignore | 3 - tuner/examples/punet/README.md | 46 - tuner/examples/punet/__init__.py | 5 - tuner/examples/punet/__main__.py | 9 - tuner/examples/punet/mmt.mlir | 11 - tuner/examples/punet/punet_autotune.py | 194 ---- tuner/examples/test/README.md | 3 +- tuner/examples/test/tuner_test.py | 1 - tuner/tuner/candidate_gen.py | 589 ++-------- tuner/tuner/candidate_gen_test.py | 508 --------- tuner/tuner/common.py | 17 +- tuner/tuner/common_test.py | 44 +- tuner/tuner/dispatch_constraints.py | 13 +- tuner/tuner/dispatch_constraints_test.py | 12 +- tuner/tuner/dispatch_parser.py | 409 ------- tuner/tuner/dispatch_parser_test.py | 154 --- tuner/tuner/libtuner.py | 1072 +------------------ tuner/tuner/libtuner_test.py | 326 ------ 27 files changed, 169 insertions(+), 3519 deletions(-) delete mode 100644 tuner/examples/dispatch/.gitignore delete mode 100644 tuner/examples/dispatch/README.md delete mode 100644 tuner/examples/dispatch/__init__.py delete mode 100644 tuner/examples/dispatch/__main__.py delete mode 100755 tuner/examples/dispatch/compile_dispatch.sh delete mode 100644 tuner/examples/dispatch/config_epilog.mlir delete mode 100644 tuner/examples/dispatch/config_prolog.mlir delete mode 100644 tuner/examples/dispatch/dispatch_tuner.py delete mode 100644 tuner/examples/dispatch/mmt.mlir delete mode 100644 tuner/examples/punet/.gitignore delete mode 100644 tuner/examples/punet/README.md delete mode 100644 tuner/examples/punet/__init__.py delete mode 100644 tuner/examples/punet/__main__.py delete mode 100644 tuner/examples/punet/mmt.mlir delete mode 100644 tuner/examples/punet/punet_autotune.py diff --git a/tuner/examples/dispatch/.gitignore b/tuner/examples/dispatch/.gitignore deleted file mode 100644 index 9fb2fe16a..000000000 --- a/tuner/examples/dispatch/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Test files/dirs recommended by README.md. -dump/ -benchmark.mlir diff --git a/tuner/examples/dispatch/README.md b/tuner/examples/dispatch/README.md deleted file mode 100644 index 70c46e08a..000000000 --- a/tuner/examples/dispatch/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Dispatch Tuner - -Allows to tune a single dispatch in isolation. - -## Environments -Follow instructions in [`/tuner/README.md`](../README.md) - -## Running the Dispatch Tuner - -### Generate a benchmark file -Use the usual `iree-compile` command for your dispatch and add -`--iree-hal-dump-executable-files-to=dump`. For example: -```shell -iree-compile mmt.mlir --iree-hal-target-backends=rocm --iree-hip-target=gfx942 --iree-hal-dump-executable-files-to=dump -o /dev/null -``` - -Next, copy the `*_benchmark.mlir` file to some temporary directory of choice. -This will be the input to the dispatch tuner. 
- -### Recommended Trial Run -For an initial trial to test the tuning loop, use: -```shell -python -m examples.dispatch benchmark.mlir --num-candidates=20 -``` - -### Dry Run Test -To perform a dry run (no GPU required), use: -```shell -python -m examples.dispatch benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run -``` - -### Basic Usage -```shell -python -m examples.dispatch benchmark.mlir -``` diff --git a/tuner/examples/dispatch/__init__.py b/tuner/examples/dispatch/__init__.py deleted file mode 100644 index a85ba359d..000000000 --- a/tuner/examples/dispatch/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc. -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception diff --git a/tuner/examples/dispatch/__main__.py b/tuner/examples/dispatch/__main__.py deleted file mode 100644 index 9fb86fd9f..000000000 --- a/tuner/examples/dispatch/__main__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc. -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from . import dispatch_tuner - -dispatch_tuner.main() diff --git a/tuner/examples/dispatch/compile_dispatch.sh b/tuner/examples/dispatch/compile_dispatch.sh deleted file mode 100755 index 0b01ac991..000000000 --- a/tuner/examples/dispatch/compile_dispatch.sh +++ /dev/null @@ -1,18 +0,0 @@ -#! /usr/bin/env bash - -set -eou pipefail - -readonly INPUT="$1" -readonly DIR="$(dirname "$INPUT")" -readonly BASENAME="$(basename "$INPUT" .mlir)" -readonly OUT="${DIR}/compiled/${BASENAME}.vmfb" - -iree-compile "$INPUT" -o "$OUT" \ - --compile-from=executable-sources 2>/dev/null || (mv "$INPUT" "$DIR/failed" && exit 1) - -iree-dump-module "$OUT" | grep -q 'rocm-hsaco-fb' || (mv "$INPUT" "$DIR/failed" && rm -f "$OUT" && exit 1) -if [ -f "${DIR}/${BASENAME}_config.mlir" ]; then - cat "${DIR}/../config_prolog.mlir" "${DIR}/${BASENAME}_config.mlir" "${DIR}/../config_epilog.mlir" > "${DIR}/specs/${BASENAME}_spec.mlir" -fi - -echo "Compiling ${INPUT}: success" diff --git a/tuner/examples/dispatch/config_epilog.mlir b/tuner/examples/dispatch/config_epilog.mlir deleted file mode 100644 index c15a30502..000000000 --- a/tuner/examples/dispatch/config_epilog.mlir +++ /dev/null @@ -1,12 +0,0 @@ - -//===----------------------------------------------------------------------===// -// Entry point -//===----------------------------------------------------------------------===// - - transform.named_sequence @__kernel_config(%variant_op: !transform.any_op {transform.consumed}) { - transform.foreach_match in %variant_op - , @match_op -> @apply_op_config - : (!transform.any_op) -> (!transform.any_op) - transform.yield - } -} //// module diff --git a/tuner/examples/dispatch/config_prolog.mlir b/tuner/examples/dispatch/config_prolog.mlir deleted file mode 100644 index 377ac3f8f..000000000 --- a/tuner/examples/dispatch/config_prolog.mlir +++ /dev/null @@ -1,32 +0,0 @@ -// Transform dialect specification for attention on MI300 with MFMA. 
-module attributes { transform.with_named_sequence } { -//===----------------------------------------------------------------------===// -// Matmul tuning -//===----------------------------------------------------------------------===// - - transform.named_sequence @match_mmt_f16_f16_f32(%root: !transform.any_op {transform.readonly}) -> (!transform.any_op) { - transform.match.operation_name %root ["linalg.generic"] : !transform.any_op - // transform.print %root {name = "Generic"} : !transform.any_op - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %root { - ^bb0(%lhs: tensor, %rhs: tensor, %out: tensor): - %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"]} - ins(%lhs, %rhs : tensor, tensor) outs(%out : tensor) { - ^bb0(%in: f16, %in_0: f16, %acc: f32): - %8 = arith.extf %in : f16 to f32 - %9 = arith.extf %in_0 : f16 to f32 - %10 = arith.mulf %8, %9 : f32 - %11 = arith.addf %acc, %10 : f32 - linalg.yield %11 : f32 - } -> tensor - } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - transform.yield %root : !transform.any_op - } - - transform.named_sequence @apply_op_config(%op: !transform.any_op {transform.readonly}, %config: !transform.any_param {transform.readonly}) { - transform.annotate %op "compilation_info" = %config : !transform.any_op, !transform.any_param - // transform.print %op {name = "Applied"} : !transform.any_op - transform.yield - } diff --git a/tuner/examples/dispatch/dispatch_tuner.py b/tuner/examples/dispatch/dispatch_tuner.py deleted file mode 100644 index 0f5b54979..000000000 --- a/tuner/examples/dispatch/dispatch_tuner.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -""" -Sample Usage: - -python -m examples.dispatch benchmark.mlir --lhs-dims=bmk --rhs-dims=bkn --tile-dims=*mnk --devices=hip://0,hip://1 --num-candidates=64 - - -Recommended Trial Run: - -python -m examples.dispatch benchmark.mlir --num-candidates=10 - - -Dry Run Test (no gpu required): - -python -m examples.dispatch benchmark.mlir --num-candidates=64 --dry-run - -""" - -from tuner import libtuner -from pathlib import Path, PurePath -import os - - -class DispatchTuner(libtuner.TuningClient): - def get_dispatch_compile_timeout_s(self) -> int: - return 10 - - def get_dispatch_compile_command( - self, candidate_tracker: libtuner.CandidateTracker - ) -> list[str]: - assert candidate_tracker.dispatch_mlir_path is not None - mlir_path: Path = candidate_tracker.dispatch_mlir_path - script_dir = Path(__file__).resolve().parent - command = [ - (script_dir / "compile_dispatch.sh").as_posix(), - mlir_path.as_posix(), - ] - return command - - def get_dispatch_benchmark_timeout_s(self) -> int: - return 15 - - def get_dispatch_benchmark_command( - self, - candidate_tracker: libtuner.CandidateTracker, - ) -> list[str]: - compiled_vmfb_path = candidate_tracker.compiled_dispatch_path - assert compiled_vmfb_path is not None - - command = [ - "iree-benchmark-module", - f"--device={libtuner.DEVICE_ID_PLACEHOLDER}", - f"--module={compiled_vmfb_path.resolve()}", - "--batch_size=1000", - "--benchmark_repetitions=3", - "--benchmark_format=json", - ] - - return command - - def get_model_compile_timeout_s(self) -> int: - return 0 - - def get_model_compile_command( - self, candidate_tracker: libtuner.CandidateTracker - ) -> list[str]: - return [] - - def get_model_benchmark_timeout_s(self) -> int: - return 0 - - def get_model_benchmark_command( - self, candidate_tracker: libtuner.CandidateTracker - ) -> list[str]: - return [] - - def get_iree_compile_flags(self) -> list[str]: - return [] - - def get_iree_benchmark_module_flags(self) -> list[str]: - return [] - - def get_benchmark_timeout_s(self) -> int: - return 0 - - -def main(): - args = libtuner.parse_arguments() - path_config = libtuner.PathConfig() - # These will not be used, so always default to the empty config in the script dir. 
- script_dir = Path(__file__).resolve().parent - path_config.global_config_prolog_mlir = ( - script_dir / path_config.global_config_prolog_mlir - ) - path_config.global_config_epilog_mlir = ( - script_dir / path_config.global_config_epilog_mlir - ) - path_config.base_dir.mkdir(parents=True, exist_ok=True) - path_config.output_unilog.touch() - candidate_trackers: list[libtuner.CandidateTracker] = [] - dispatch_tuner = DispatchTuner() - stop_after_phase: str = args.stop_after - - print("Setup logging") - libtuner.setup_logging(args, path_config) - print(path_config.run_log, end="\n\n") - - if not args.dry_run: - print("Validating devices") - libtuner.validate_devices(args.devices) - print("Validation successful!\n") - - print("Generating candidates...") - candidates = libtuner.generate_candidates(args, path_config, candidate_trackers) - print(f"Stored candidates in {path_config.candidates_dir}\n") - if stop_after_phase == libtuner.ExecutionPhases.generate_candidates: - return - - print("Compiling candidates...") - compiled_candidates = libtuner.compile_dispatches( - args, path_config, candidates, candidate_trackers, dispatch_tuner - ) - print(f"Compiled files are stored in {path_config.compiled_dir}\n") - if stop_after_phase == libtuner.ExecutionPhases.compile_dispatches: - return - - print("Benchmarking compiled candidates...") - top_candidates = libtuner.benchmark_dispatches( - args, path_config, compiled_candidates, candidate_trackers, dispatch_tuner - ) - print(f"\nStored results in {path_config.output_unilog.resolve()}\n") - if stop_after_phase == libtuner.ExecutionPhases.benchmark_dispatches: - return - - libtuner.save_pickle(path_config.candidate_trackers_pkl, candidate_trackers) - print(f"Candidate trackers are saved in {path_config.candidate_trackers_pkl}\n") - - print("Check the detailed execution logs in:") - print(path_config.run_log.resolve()) - - for candidate in candidate_trackers: - libtuner.logging.debug(candidate) diff --git a/tuner/examples/dispatch/mmt.mlir b/tuner/examples/dispatch/mmt.mlir deleted file mode 100644 index b9d6c5f4c..000000000 --- a/tuner/examples/dispatch/mmt.mlir +++ /dev/null @@ -1,11 +0,0 @@ -!matA_0 = tensor<2048x1280xf16> -!matB_0 = tensor<10240x1280xf16> -!matC_0 = tensor<2048x10240xf32> - -func.func @main_0(%arg0: !matA_0, %arg1: !matB_0) -> !matC_0 { - %cst = arith.constant 0.000000e+00 : f16 - %5 = tensor.empty() : !matC_0 - %6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_0) -> !matC_0 - %8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_0, !matB_0) outs(%6 : !matC_0) -> !matC_0 - return %8 : !matC_0 -} diff --git a/tuner/examples/punet/.gitignore b/tuner/examples/punet/.gitignore deleted file mode 100644 index fae904ffb..000000000 --- a/tuner/examples/punet/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Test files/dirs recommended by README.md. -dump-mmt -test-benchmark.mlir diff --git a/tuner/examples/punet/README.md b/tuner/examples/punet/README.md deleted file mode 100644 index 777d1c194..000000000 --- a/tuner/examples/punet/README.md +++ /dev/null @@ -1,46 +0,0 @@ -# Punet Tuner - -## Environments -Follow instructions in [`/tuner/README.md`](../README.md) - -## Shell Scripts - -The required shell scripts can be downloaded from: -[sdxl-scripts](https://github.com/nod-ai/sdxl-scripts). - -These scripts include: -1. `compile-punet-base.sh` - Used for compiling model candidates. -2. `compile_candidate.sh` - Used for compiling dispatch candidates. -3. `punet.sh` - Invoked by `compile_candidate.sh`. 
- -Add the parent directories of these scripts to your `PATH` environment variable, -so that they can be picked up by `punet_autotune.py`. - -## Running the Tuner - -### [Optional] Generate a tunable mlir -Use -[`punet.sh`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/punet.sh) -to compile the sample matmul `mmt.mlir` (can also find here: -[`mmt_unet.mlir`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/mmt_unet.mlir)): -```shell -punet.sh mmt.mlir -o mmt.vmfb --iree-hal-dump-executable-files-to=dump-mmt -cp ./dump-mmt/module_main_0_dispatch_0_rocm_hsaco_fb_benchmark.mlir test-benchmark.mlir -``` - -### Recommended Trial Run -For an initial trial to test the tuning loop, use: -```shell -python -m examples.punet test-benchmark.mlir --num-candidates=10 -``` - -### Dry Run Test -To perform a dry run (no GPU required), use: -```shell -python -m examples.punet test-benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run -``` - -### Basic Usage -```shell -python -m examples.punet test-benchmark.mlir -``` diff --git a/tuner/examples/punet/__init__.py b/tuner/examples/punet/__init__.py deleted file mode 100644 index a85ba359d..000000000 --- a/tuner/examples/punet/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc. -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception diff --git a/tuner/examples/punet/__main__.py b/tuner/examples/punet/__main__.py deleted file mode 100644 index ca092d502..000000000 --- a/tuner/examples/punet/__main__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc. -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from . import punet_autotune - -punet_autotune.main() diff --git a/tuner/examples/punet/mmt.mlir b/tuner/examples/punet/mmt.mlir deleted file mode 100644 index b9d6c5f4c..000000000 --- a/tuner/examples/punet/mmt.mlir +++ /dev/null @@ -1,11 +0,0 @@ -!matA_0 = tensor<2048x1280xf16> -!matB_0 = tensor<10240x1280xf16> -!matC_0 = tensor<2048x10240xf32> - -func.func @main_0(%arg0: !matA_0, %arg1: !matB_0) -> !matC_0 { - %cst = arith.constant 0.000000e+00 : f16 - %5 = tensor.empty() : !matC_0 - %6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_0) -> !matC_0 - %8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_0, !matB_0) outs(%6 : !matC_0) -> !matC_0 - return %8 : !matC_0 -} diff --git a/tuner/examples/punet/punet_autotune.py b/tuner/examples/punet/punet_autotune.py deleted file mode 100644 index 2bfdb4d24..000000000 --- a/tuner/examples/punet/punet_autotune.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -""" -Sample Usage: - -python -m examples.punet benchmark.mlir --lhs-dims=bmk --rhs-dims=bkn --tile-dims=*mnk --devices=hip://0,hip://1 --num-candidates=64 - - -Recommended Trial Run: - -python -m examples.punet benchmark.mlir --num-candidates=1 - - -Dry Run Test (no gpu requried): - -python -m examples.punet benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run - -""" - -from tuner import libtuner -from pathlib import Path - - -class PunetClient(libtuner.TuningClient): - def get_dispatch_compile_timeout_s(self) -> int: - return 4 - - def get_dispatch_compile_command( - self, candidate_tracker: libtuner.CandidateTracker - ) -> list[str]: - mlir_path = candidate_tracker.dispatch_mlir_path - assert mlir_path is not None - command = [ - "compile_candidate.sh", - mlir_path.as_posix(), - ] - return command - - def get_dispatch_benchmark_timeout_s(self) -> int: - return 15 - - def get_dispatch_benchmark_command( - self, - candidate_tracker: libtuner.CandidateTracker, - ) -> list[str]: - compiled_vmfb_path = candidate_tracker.compiled_dispatch_path - assert compiled_vmfb_path is not None - - command = [ - "iree-benchmark-module", - f"--device={libtuner.DEVICE_ID_PLACEHOLDER}", - f"--module={compiled_vmfb_path.resolve()}", - "--hip_use_streams=true", - "--hip_allow_inline_execution=true", - "--batch_size=1000", - "--benchmark_repetitions=3", - "--benchmark_format=json", - ] - - return command - - def get_model_compile_timeout_s(self) -> int: - return 300 - - def get_model_compile_command( - self, candidate_tracker: libtuner.CandidateTracker - ) -> list[str]: - mlir_spec_path = candidate_tracker.spec_path - assert mlir_spec_path is not None - target_dir = mlir_spec_path.resolve().parent.parent.parent - output_name = f"unet_candidate_{candidate_tracker.candidate_id}.vmfb" - command = [ - "compile-punet-base.sh", - "iree-compile", - "gfx942", - f"{mlir_spec_path.resolve()}", - "./punet.mlir", - "-o", - (target_dir / output_name).as_posix(), - ] - return command - - def get_model_benchmark_timeout_s(self) -> int: - return 180 - - def get_model_benchmark_command( - self, candidate_tracker: libtuner.CandidateTracker - ) -> list[str]: - unet_candidate_path = candidate_tracker.compiled_model_path - assert unet_candidate_path is not None - - command = [ - "iree-benchmark-module", - f"--device={libtuner.DEVICE_ID_PLACEHOLDER}", - "--hip_use_streams=true", - "--hip_allow_inline_execution=true", - "--device_allocator=caching", - f"--module={unet_candidate_path.resolve()}", - "--parameters=model=punet.irpa", - "--function=main", - "--input=1x4x128x128xf16", - "--input=1xsi32", - "--input=2x64x2048xf16", - "--input=2x1280xf16", - "--input=2x6xf16", - "--input=1xf16", - "--benchmark_repetitions=5", - "--benchmark_format=json", - ] - return command - - def get_iree_compile_flags(self) -> list[str]: - return [] - - def get_iree_benchmark_module_flags(self) -> list[str]: - return [] - - def get_benchmark_timeout_s(self) -> int: - return 0 - - -def main(): - args = libtuner.parse_arguments() - path_config = libtuner.PathConfig() - path_config.base_dir.mkdir(parents=True, exist_ok=True) - path_config.output_unilog.touch() - candidate_trackers: list[libtuner.CandidateTracker] = [] - punet_client = PunetClient() - stop_after_phase: str = args.stop_after - - print("Setup logging") - libtuner.setup_logging(args, path_config) - print(path_config.run_log, end="\n\n") - - if not args.dry_run: - print("Validating devices") - 
libtuner.validate_devices(args.devices) - print("Validation successful!\n") - - print("Generating candidates...") - candidates = libtuner.generate_candidates(args, path_config, candidate_trackers) - print(f"Stored candidates in {path_config.candidates_dir}\n") - if stop_after_phase == libtuner.ExecutionPhases.generate_candidates: - return - - print("Compiling candidates...") - compiled_candidates = libtuner.compile_dispatches( - args, path_config, candidates, candidate_trackers, punet_client - ) - print(f"Compiled files are stored in {path_config.compiled_dir}\n") - if stop_after_phase == libtuner.ExecutionPhases.compile_dispatches: - return - - print("Benchmarking compiled candidates...") - top_candidates = libtuner.benchmark_dispatches( - args, path_config, compiled_candidates, candidate_trackers, punet_client - ) - print(f"Stored results in {path_config.output_unilog}\n") - if stop_after_phase == libtuner.ExecutionPhases.benchmark_dispatches: - return - - print(f"Compiling top model candidates...") - punet_candidates = libtuner.compile_models( - args, path_config, top_candidates, candidate_trackers, punet_client - ) - print(f"Model candidates compiled in {path_config.base_dir}\n") - if stop_after_phase == libtuner.ExecutionPhases.compile_models: - return - - print("Benchmarking model candidates...") - libtuner.benchmark_models( - args, path_config, punet_candidates, candidate_trackers, punet_client - ) - print(f"Stored results in {path_config.output_unilog}") - if stop_after_phase == libtuner.ExecutionPhases.benchmark_models: - return - - libtuner.summerize_top_candidates(path_config, candidate_trackers) - print(f"Stored top candidates info in {path_config.result_summary_log}\n") - - libtuner.save_pickle(path_config.candidate_trackers_pkl, candidate_trackers) - print(f"Candidate trackers are saved in {path_config.candidate_trackers_pkl}\n") - - print("Check the detailed execution logs in:") - print(path_config.run_log) - - for candidate in candidate_trackers: - libtuner.logging.debug(candidate) - if args.verbose: - print(candidate) diff --git a/tuner/examples/test/README.md b/tuner/examples/test/README.md index 5dfba0da3..47ae7a8fe 100644 --- a/tuner/examples/test/README.md +++ b/tuner/examples/test/README.md @@ -35,5 +35,6 @@ python -m examples.test double_mmt.mlir mmt_benchmark.mlir \ python -m examples.test \ --test_num_dispatch_candidates= \ --test_num_model_candidates= \ - --test_hip_target= \ --num-candidates= + --test_hip_target= \ + --num-candidates= ``` diff --git a/tuner/examples/test/tuner_test.py b/tuner/examples/test/tuner_test.py index 528f03b80..22a0d2f4d 100644 --- a/tuner/examples/test/tuner_test.py +++ b/tuner/examples/test/tuner_test.py @@ -90,7 +90,6 @@ def main(): path_config = libtuner.PathConfig() path_config.base_dir.mkdir(parents=True, exist_ok=True) - path_config.output_unilog.touch() # TODO(Max191): Make candidate_trackers internal to TuningClient. candidate_trackers: list[libtuner.CandidateTracker] = [] stop_after_phase: str = args.stop_after diff --git a/tuner/tuner/candidate_gen.py b/tuner/tuner/candidate_gen.py index 45cb3512a..b6264792e 100644 --- a/tuner/tuner/candidate_gen.py +++ b/tuner/tuner/candidate_gen.py @@ -11,21 +11,17 @@ Generate candidates by tweaking op configuration for tuning. It can be invoked in two ways: - 1. From another python script, import and call `tune()` + 1. From another python script, import and call `generate_configs_and_td_specs()` 2. 
Run this script directly from the command - -Usage: ./candidate_gen.py 121.mlir -o "tuning/candidates" -l 1024 --lhs-dims=mk --rhs-dims=nk --tile-dims=mnk - +Usage: python -m tuner.candidate_gen mmt_benchmark.mlir -o spec_dir -l 1024 """ import argparse import logging -import pickle -import re from dataclasses import dataclass -from os import path, makedirs +from pathlib import Path +import subprocess from typing import Optional -from textwrap import indent from abc import abstractmethod from iree.compiler import ir # type: ignore @@ -40,61 +36,6 @@ tune_logger = logging.getLogger("tune") -def apply_configuration( - template: list[str], - compilation_info: iree_codegen.CompilationInfoAttr, -) -> str: - lowering_config = compilation_info.lowering_config - intrinsic = lowering_config.mma_kind - ( - subgroup_m_count, - subgroup_n_count, - ) = lowering_config.subgroup_count_mn - workgroup_sizes = lowering_config.workgroup_tile_sizes - reduction_sizes = lowering_config.reduction_tile_sizes - gpu_pipeline_options = compilation_info.translation_info.configuration[ - GPU_PIPELINE_OPTIONS_KEY - ] - waves_per_eu = compilation_info.translation_info.configuration[LLVM_FUNC_ATTRS_KEY][ - WAVES_PER_EU_KEY - ] - tune_logger.info(f"Applying: {compilation_info}") - expr0 = re.compile( - r", subgroup_m_count = ([0-9]+), subgroup_n_count = ([0-9]+)>" - ) - expr1 = re.compile( - r"LLVMGPUVectorDistribute workgroup_size = \[.+\] subgroup_size = ([0-9]+)," - ) - expr2 = re.compile(r"workgroup = \[([0-9]+)(, ([0-9]+))+\]") - expr3 = re.compile(r"reduction = \[([0-9]+)(, ([0-9]+))+\]") - expr4 = re.compile(r"gpu_pipeline_options = #iree_gpu\.pipeline_options<([^>]*)>") - expr5 = re.compile(r"\"amdgpu-waves-per-eu\" = \"([0-9])\"") - repl0 = f"" - repl1 = f'LLVMGPUVectorDistribute workgroup_size = [{", ".join(map(str, compilation_info.translation_info.workgroup_size))}] subgroup_size = {compilation_info.translation_info.subgroup_size},' - repl2 = f"workgroup = {workgroup_sizes}" - repl3 = f"reduction = {reduction_sizes}" - repl4 = f"gpu_pipeline_options = {gpu_pipeline_options}" - repl5 = f'"amdgpu-waves-per-eu" = {waves_per_eu}' - - new_mlir = "" - for line in template: - if "intrinsic =" in line: - line = re.sub(expr0, repl0, line) - if "LLVMGPUVectorDistribute " in line: - line = re.sub(expr1, repl1, line) - if "workgroup" in line: - line = re.sub(expr2, repl2, line) - if "reduction" in line: - line = re.sub(expr3, repl3, line) - if "gpu_pipeline_options =" in line: - line = re.sub(expr4, repl4, line) - if "amdgpu-waves-per-eu" in line: - line = re.sub(expr5, repl5, line) - new_mlir += line - - return new_mlir - - class DispatchTuner(DispatchParser): # TODO(https://github.com/nod-ai/shark-ai/issues/453): Remove this in favor of configuring using transform dialect. 
@abstractmethod @@ -206,321 +147,6 @@ def get_td_spec( return build_td_spec(ir_module.context, conv_op, compilation_info, func_name) -class MmtTuner(DispatchTuner, MmtParser): - def get_transform_function_mmt( - self, - problem_size: ProblemSize, - functionName: str, - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> str: - return f""" - transform.named_sequence @{functionName}(%matmul: !transform.any_op {{transform.readonly}}) -> (!transform.any_op, !transform.any_param) {{ - %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op - %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value - %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value - transform.iree.match.cast_compatible_type %lhs = tensor<{problem_size.lhs_type}> : !transform.any_value - transform.iree.match.cast_compatible_type %rhs = tensor<{problem_size.rhs_type}> : !transform.any_value - %config = transform.param.constant {compilation_info} -> !transform.any_param - transform.yield %matmul, %config : !transform.any_op, !transform.any_param - }} - """ - - def apply_params( - self, - problem_size: ProblemSize, - template: list[str], - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> MLIRTransformation: - M, N, K = problem_size.MNK - modified = indent( - self.get_transform_function_mmt( - problem_size, f"match_mmt_{M}x{N}x{K}", compilation_info - ), - "// ", - ) - modified += apply_configuration( - template, - compilation_info, - ) - embeddable = indent( - self.get_transform_function_mmt( - problem_size, f"match_op", compilation_info - ), - " ", - ) - return MLIRTransformation(template, modified, embeddable) - - def get_td_spec( - self, - ir_module: ir.Module, - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> ir.Module: - raise NotImplementedError - - -class ConvTuner(DispatchTuner, ConvParser): - def get_transform_function_conv( - self, - problem_size: ProblemSize, - functionName: str, - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> str: - dynamic_batch_input_ty = problem_size.lhs_type - dynamic_batch_input_ty.shape = dynamic_batch_input_ty.shape.copy() - dynamic_batch_input_ty.shape[0] = -1 - - dynamic_batch_output_ty = problem_size.res_type - dynamic_batch_output_ty.shape = dynamic_batch_output_ty.shape.copy() - dynamic_batch_output_ty.shape[0] - 1 - - input = f"tensor<{dynamic_batch_input_ty}>" - filter = f"tensor<{problem_size.rhs_type}>" - output = f"tensor<{dynamic_batch_output_ty}>" - - return f""" - transform.named_sequence @{functionName}(%conv: !transform.any_op {{transform.readonly}}) - -> (!transform.any_op, !transform.any_param) {{ - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %conv {{ - ^bb0(%lhs: {input}, %rhs: {filter}, %out: {output}): - %13 = linalg.conv_2d_nhwc_hwcf {{dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}} - ins(%lhs, %rhs : {input}, {filter}) - outs(%out : {output}) -> {output} - }} : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant {compilation_info} -> !transform.any_param - transform.yield %conv, %config : !transform.any_op, !transform.any_param - }} - """ - - def apply_params( - self, - problem_size: ProblemSize, - template: list[str], - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> MLIRTransformation: - conv_dims = ConvDimInfo.from_problem_size(problem_size) - modified = indent( - 
self.get_transform_function_conv( - problem_size, - f"match_conv_2d_nhwc_hwcf_Bx{conv_dims.oh}x{conv_dims.ow}x{conv_dims.oc}x{conv_dims.fh}x{conv_dims.fw}x{conv_dims.ic}", - compilation_info, - ), - "// ", - ) - modified += apply_configuration( - template, - compilation_info, - ) - embeddable = indent( - self.get_transform_function_conv( - problem_size, f"match_op", compilation_info - ), - " ", - ) - return MLIRTransformation(template, modified, embeddable) - - def get_td_spec( - self, - ir_module: ir.Module, - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> ir.Module: - raise NotImplementedError - - -class ContractionTuner(DispatchTuner, ContractionParser): - def get_transform_function_broadcast_rhs_mmt( - self, - problem_size: ProblemSize, - functionName: str, - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> str: - lhs_dynamic_batch = problem_size.lhs_type - lhs_dynamic_batch.shape = lhs_dynamic_batch.shape.copy() - lhs_dynamic_batch.shape[0] = -1 - - return f""" -transform.named_sequence @{functionName}(%generic: !transform.any_op {{transform.readonly}}) -> (!transform.any_op, !transform.any_param) {{ -%mmt = transform.include @match_broadcast_rhs_mmt_i8_i8_i32 failures(propagate) (%generic) : (!transform.any_op) -> !transform.any_op -%lhs = transform.get_operand %generic[0] : (!transform.any_op) -> !transform.any_value -%rhs = transform.get_operand %generic[1] : (!transform.any_op) -> !transform.any_value -transform.iree.match.cast_compatible_type %lhs = tensor<{lhs_dynamic_batch}> : !transform.any_value -transform.iree.match.cast_compatible_type %rhs = tensor<{problem_size.rhs_type}> : !transform.any_value -%config = transform.param.constant {compilation_info} -> !transform.any_param -transform.yield %generic, %config : !transform.any_op, !transform.any_param -}} -""" - - def apply_params_broadcast_rhs_mmt( - self, - problem_size: ProblemSize, - template: list[str], - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> MLIRTransformation: - M, N, K = problem_size.MNK - modified = indent( - self.get_transform_function_broadcast_rhs_mmt( - problem_size, f"match_broadcast_rhs_mmt_Bx{M}x{N}x{K}", compilation_info - ), - "// ", - ) - modified += apply_configuration( - template, - compilation_info, - ) - - embeddable = indent( - self.get_transform_function_broadcast_rhs_mmt( - problem_size, f"match_op", compilation_info - ), - " ", - ) - return MLIRTransformation(template, modified, embeddable) - - def apply_params( - self, - problem_size: ProblemSize, - template: list[str], - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> MLIRTransformation: - if self.is_broadcast_rhs_mmt(template): - return self.apply_params_broadcast_rhs_mmt( - problem_size, template, compilation_info - ) - - # TODO: Generate transform function. 
- return MLIRTransformation( - template, - apply_configuration( - template, - compilation_info, - ), - "", - ) - - def get_td_spec( - self, - ir_module: ir.Module, - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> ir.Module: - raise NotImplementedError - - -class BatchMmtTuner(DispatchTuner, BatchMmtParser): - def get_transform_function_batch_mmt( - self, - problem_size: ProblemSize, - functionName: str, - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> str: - return f""" -transform.named_sequence @{functionName}(%generic: !transform.any_op {{transform.readonly}}) -> (!transform.any_op, !transform.any_param) {{ -%mmt = transform.include @match_batch_mmt_i8_i8_i32 failures(propagate) (%generic) : (!transform.any_op) -> !transform.any_op -%lhs = transform.get_operand %generic[0] : (!transform.any_op) -> !transform.any_value -%rhs = transform.get_operand %generic[1] : (!transform.any_op) -> !transform.any_value -transform.iree.match.cast_compatible_type %lhs = tensor<{problem_size.lhs_type}> : !transform.any_value -transform.iree.match.cast_compatible_type %rhs = tensor<{problem_size.rhs_type}> : !transform.any_value -%config = transform.param.constant {compilation_info} -> !transform.any_param -transform.yield %generic, %config : !transform.any_op, !transform.any_param -}} -""" - - def apply_params( - self, - problem_size: ProblemSize, - template: list[str], - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> MLIRTransformation: - M, N, K = problem_size.MNK - B = problem_size.matmul_size.B - modified = indent( - self.get_transform_function_batch_mmt( - problem_size, f"match_batch_mmt_{B}x{M}x{N}x{K}", compilation_info - ), - "// ", - ) - modified += apply_configuration( - template, - compilation_info, - ) - - embeddable = indent( - self.get_transform_function_batch_mmt( - problem_size, f"match_op", compilation_info - ), - " ", - ) - return MLIRTransformation(template, modified, embeddable) - - def get_td_spec( - self, - ir_module: ir.Module, - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> ir.Module: - raise NotImplementedError - - -class BatchMatmulTuner(DispatchTuner, BatchMatmulParser): - def get_transform_function_batch_matmul( - self, - problem_size: ProblemSize, - tile_dims: str, - functionName: str, - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> str: - input0 = f"tensor<{problem_size.lhs_type}>" - input1 = f"tensor<{problem_size.rhs_type}>" - output = f"tensor<{problem_size.res_type}>" - - return f""" - transform.named_sequence @{functionName}(%batch_matmul: !transform.any_op {{transform.readonly}}) - -> (!transform.any_op, !transform.any_param) {{ - %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %batch_matmul {{ - ^bb0(%lhs: {input0}, %rhs: {input1}, %out: {output}): - %13 = linalg.batch_matmul - ins(%lhs, %rhs : {input0}, {input1}) - outs(%out : {output}) -> {output} - }} : (!transform.any_op) -> (!transform.any_value, !transform.any_value) - %config = transform.param.constant {compilation_info} -> !transform.any_param - transform.yield %batch_matmul, %config : !transform.any_op, !transform.any_param - }} - """ - - def apply_params( - self, - problem_size: ProblemSize, - template: list[str], - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> MLIRTransformation: - M, N, K = problem_size.MNK - modified = indent( - self.get_transform_function_batch_matmul( - problem_size, - self.tile_dims, - f"match_batch_matmul_{problem_size.matmul_size.B}x{M}x{N}x{K}", - compilation_info, - ), - "// ", - ) - modified 
+= apply_configuration( - template, - compilation_info, - ) - - embeddable = indent( - self.get_transform_function_batch_matmul( - problem_size, self.tile_dims, f"match_op", compilation_info - ), - " ", - ) - return MLIRTransformation(template, modified, embeddable) - - def get_td_spec( - self, - ir_module: ir.Module, - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> ir.Module: - raise NotImplementedError - - @dataclass class OpWalkResult: was_interrupted: bool = False @@ -563,82 +189,6 @@ def get_default_output_dir() -> str: return "tuning_" + datetime.now().strftime("%Y_%m_%d_%H_%M") -# TODO(https://github.com/nod-ai/shark-ai/issues/453): Remove in favor of using tune_with_td. -def tune( - input: str, # Path to the mlir file to be tuned - output: str = "", # Path to the output directory, auto creates one if not given - limit: int = 4096, # Max candidates to be generated - num_subgroups: int = 4, # GPU spec, used to determine candidate generation constraints - lhs_dims: str = "mk", # Dimensions for the left-hand side operand in matrix operations - rhs_dims: str = "nk", # Dimensions for the right-hand side operand in matrix operations - tile_dims: str = "mnk", # Dimensions for the tile size -): - input_file = str(input) - - if not output: - output = get_default_output_dir() - - # Create the directory if it does not exist - makedirs(str(output), exist_ok=True) - - tune_logger.debug(f"Output directory {output}") - tune_logger.debug(f"Processing {input_file}") - mlir_template = read_input_mlir(input_file) - mlir_text = "".join(mlir_template) - - with ir.Context() as ctx: - tuner_context = TunerContext(ctx, tune_logger) - mlir_module = parse_mlir(mlir_text, tuner_context) - # Save the input file as the first candidate. - with open(path.join(output, f"0.mlir"), "w") as f: - f.write(mlir_text) - - dispatch_tuner_registry = DispatchTunerRegistry() - dispatch_tuner_registry.register( - [ - MmtTuner(), - ConvTuner(), - ContractionTuner(lhs_dims, rhs_dims, tile_dims), - BatchMmtTuner(), - BatchMatmulTuner(lhs_dims, rhs_dims, tile_dims), - ] - ) - - walk_result: OpWalkResult = walk_mlir_op(mlir_module, dispatch_tuner_registry) - - variant_op_list = iree_codegen.get_executable_variant_ops(mlir_module) - assert len(variant_op_list) == 1, "Expect one executable variant op" - variant_op = variant_op_list[0] - # Get the MMA intrinisic intructions supported by the target. - mma_list = iree_codegen.query_mma_intrinsics(variant_op) - - dispatch_tuner = walk_result.dispatch_tuner - assert dispatch_tuner, "No suitable dispatch tuner found" - problem_size: ProblemSize = dispatch_tuner.get_shapes(mlir_template) - tune_logger.debug(str(problem_size)) - configs = [] - for i, config in enumerate( - generate_solutions(tuner_context, problem_size, num_subgroups, mma_list) - ): - if i >= limit: - break - tune_logger.info(f"Solution #{i+1}: {config}") - configs.append(config) - tf_mlir = dispatch_tuner.apply_params(problem_size, mlir_template, config) - - with open(path.join(output, f"{i+1}.mlir"), "w") as f: - f.write(tf_mlir.modified) - with open(path.join(output, f"{i+1}_config.mlir"), "w") as f: - f.write(tf_mlir.embeddable) - - # TODO: Fix pickling for ir types. 
- # with open(path.join(output, "configs.pkl"), "wb") as file: - # pickle.dump(configs, file) - - tune_logger.info(f"Generated {len(configs)} candidates") - tune_logger.info(f"Configurations .pkl is stored in {output}/configs.pkl") - - def generate_configs_and_td_specs( input_module: ir.Module, # Path to the mlir file to be tuned tuner_context: TunerContext, @@ -684,6 +234,98 @@ def generate_configs_and_td_specs( return config_specs +@dataclass +class RunPack: + command: list[str] + check: bool = True + timeout_seconds: Optional[int] = None + + +@dataclass +class RunResult: + process_res: Optional[subprocess.CompletedProcess] + is_timeout: bool + + +def run_command(run_pack: RunPack) -> RunResult: + command = run_pack.command + check = run_pack.check + timeout_seconds = run_pack.timeout_seconds + + result = None + is_timeout = False + try: + # Convert the command list to a command string for logging + command_str = " ".join(command) + logging.debug(f"Run: {command_str}") + + # Add timeout to subprocess.run call + result = subprocess.run( + command, + check=check, + capture_output=True, + text=True, + timeout=timeout_seconds, + ) + + if result.stdout: + logging.debug(f"stdout: {result.stdout}") + if result.stderr: + logging.debug(f"stderr: {result.stderr}") + except subprocess.TimeoutExpired as e: + logging.warning( + f"Command '{command_str}' timed out after {timeout_seconds} seconds." + ) + is_timeout = True + except subprocess.CalledProcessError as e: + print(e.output) + logging.error( + f"Command '{command_str}' returned non-zero exit status {e.returncode}." + ) + logging.error(f"Command '{command_str}' failed with error: {e.stderr}") + if check: + raise + except KeyboardInterrupt: + print("Ctrl+C detected, terminating child processes...") + + return RunResult(result, is_timeout) + + +# The `strip_root_op_attr` and `strip_compilation_info` functions are used for +# getting consistent inputs to the compilation step in tuning. Inputs may come +# in with lowering configs, translation info, and root_op attrs when the input +# is a benchmark, but not when the input is a source MLIR file. Stripping the +# info makes the inputs to compilation consistent, and allows for overwriting +# the compilation info with generated TD specs during codegen. +def strip_root_op_attr(module: ir.Module): + root_ops: list[ir.Operation] = get_ops_from_module(module, is_root_op) + for root_op in root_ops: + assert ( + ROOT_OP_ATTR_NAME in root_op.opview.attributes + ), f"expected root op to have '{ROOT_OP_ATTR_NAME}' attr" + del root_op.opview.attributes[ROOT_OP_ATTR_NAME] + + +# See the above comment for `strip_root_op_attr`. 
+def strip_compilation_info(input_path: Path) -> str: + # Strip compilation info from the source and save the stripped IR + strip_command = [ + f"iree-opt", + f"{input_path}", + f"--iree-codegen-strip-compilation-info", + ] + result = run_command( + RunPack( + command=strip_command, + check=True, + ) + ) + assert ( + result.process_res is not None + ), "expected result from stripping compilation info" + return result.process_res.stdout + + def main(): parser = argparse.ArgumentParser() parser.add_argument("input", help="Input mlir file", type=str) @@ -703,15 +345,6 @@ def main(): type=int, default=-1, ) - parser.add_argument( - "--lhs-dims", help="Map of LHS matmul dims", type=str, default="mk" - ) - parser.add_argument( - "--rhs-dims", help="Map of RHS matmul dims", type=str, default="nk" - ) - parser.add_argument( - "--tile-dims", help="Map of tile size matmul dims", type=str, default="mnk" - ) parser.add_argument( "--verbose", "-v", action="store_true", help="Enable verbose output to stdout" ) @@ -727,20 +360,22 @@ def main(): console_handler.setFormatter(formatter) tune_logger.addHandler(console_handler) - # # Optionally, add a file handler to log to a file - # file_handler = logging.FileHandler("tune.log") - # file_handler.setFormatter(formatter) - # tune_logger.addHandler(file_handler) - - tune( - args.input, - args.output, - args.limit, - args.num_subgroups, - args.lhs_dims, - args.rhs_dims, - args.tile_dims, - ) + with ir.Context() as ctx: + tuner_ctx = TunerContext(ctx, tune_logger) + mlir_text = strip_compilation_info(args.input) + mlir_module = parse_mlir(mlir_text, tuner_ctx) + specs = generate_configs_and_td_specs( + mlir_module, + tuner_ctx, + args.limit, + args.num_subgroups, + ) + for candidate_num, spec in enumerate(specs): + spec_dir = Path(args.output) + spec_path = spec_dir / f"{candidate_num}_spec.mlir" + spec_dir.mkdir(parents=True, exist_ok=True) + with open(spec_path, "w") as f: + f.write(str(spec)) if __name__ == "__main__": diff --git a/tuner/tuner/candidate_gen_test.py b/tuner/tuner/candidate_gen_test.py index d135a8502..8b0ca58d3 100644 --- a/tuner/tuner/candidate_gen_test.py +++ b/tuner/tuner/candidate_gen_test.py @@ -32,12 +32,6 @@ def tuner_ctx() -> Generator[common.TunerContext, None, None]: yield common.TunerContext(ctx, logger) -def remove_comments(mlir: str) -> str: - return "\n".join( - filter(lambda x: not x.lstrip().startswith("//"), mlir.splitlines()) - ) - - def test_get_td_spec_contraction(tuner_ctx: common.TunerContext) -> None: context = tuner_ctx.mlir_ctx module_str = """ @@ -213,505 +207,3 @@ def test_get_td_spec_convolution(tuner_ctx: common.TunerContext) -> None: "gpu_pipeline_options = #iree_gpu.pipeline_options" in matcher_sequence_str ) - - -def test_apply_params_mmt(tuner_ctx: common.TunerContext) -> None: - mlir_template = [ - ", subgroup_m_count = 16, subgroup_n_count = 16>", - "", - "gpu_pipeline_options = #iree_gpu.pipeline_options", - '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}', - ] - - M, N, K = 2048, 1280, 1280 - - mma_intrinsic = iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16 - mma_attr = iree_gpu.MMAAttr.get(mma_intrinsic) - lowering_config = common.get_lowering_config( - tuner_ctx=tuner_ctx, - mma_kind=mma_attr, - workgroup=[8, 8, 0], - reduction=[0, 0, 8], - subgroup_m_count=16, - subgroup_n_count=16, - ) - pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get( - iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute - ) - pipeline_options = iree_gpu.PipelineOptionsAttr.get(prefetch_shared_memory=True) 
- config_dict = common.get_translation_info_config(pipeline_options, 8) - translation_info = iree_codegen.TranslationInfoAttr.get( - pipeline_attr, None, [16, 16, 1], 16, config_dict - ) - compilation_info = iree_codegen.CompilationInfoAttr.get( - lowering_config, translation_info - ) - - problem_size = common.ProblemSize( - common.MatmulSize(M, N, K), - common.ShapedType([M, K], tuner_ctx.type.f16), - common.ShapedType([N, K], tuner_ctx.type.f16), - common.ShapedType([M, N], tuner_ctx.type.f32), - common.DispatchKind.mmt, - ) - tf_mlir = candidate_gen.MmtTuner().apply_params( - problem_size, mlir_template, compilation_info - ) - - modified = tf_mlir.modified - embeddable = tf_mlir.embeddable - - assert modified - modified = remove_comments(modified) - assert embeddable - assert ( - "intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 16, subgroup_n_count = 16" - in modified - ) - assert ( - "LLVMGPUVectorDistribute workgroup_size = [16, 16, 1] subgroup_size = 16" - in modified - ) - assert "workgroup = [8, 8, 0]" in modified - assert "reduction = [0, 0, 8]" in modified - assert ( - "gpu_pipeline_options = #iree_gpu.pipeline_options" - in modified - ) - assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "8"}' in modified - - -def test_apply_params_conv(tuner_ctx: common.TunerContext) -> None: - mlir_template = [ - ", subgroup_m_count = 16, subgroup_n_count = 16>", - "", - 'gpu_pipeline_options = #iree_gpu.pipeline_options, {llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}', - ] - - n, oh, ow, oc, fh, fw, ic = 2, 64, 64, 640, 3, 3, 16 - - mma_intrinsic = iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16 - mma_attr = iree_gpu.MMAAttr.get(mma_intrinsic) - lowering_config = common.get_lowering_config( - tuner_ctx=tuner_ctx, - mma_kind=mma_attr, - workgroup=[n, oh, ow, oc, fh, fw, 0], - reduction=[0, 0, 0, 0, 0, 0, ic], - subgroup_m_count=1, - subgroup_n_count=4, - ) - pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get( - iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute - ) - pipeline_options = iree_gpu.PipelineOptionsAttr.get( - reorder_workgroups_strategy=iree_gpu.ReorderWorkgroupsStrategyAttr.get( - iree_gpu.ReorderWorkgroupsStrategy.Transpose - ) - ) - config_dict = common.get_translation_info_config(pipeline_options, 2) - translation_info = iree_codegen.TranslationInfoAttr.get( - pipeline_attr, None, [256, 1, 1], 64, config_dict - ) - compilation_info = iree_codegen.CompilationInfoAttr.get( - lowering_config, translation_info - ) - - problem_size = common.ProblemSize( - common.MatmulSize(oh * ow, oc, fh * fw * ic), - common.ShapedType([n, oh + 2, ow + 2, oc], tuner_ctx.type.f16), - common.ShapedType([fh, fw, ic, oc], tuner_ctx.type.f16), - common.ShapedType([n, oh, ow, oc], tuner_ctx.type.f32), - common.DispatchKind.conv, - ) - tf_mlir = candidate_gen.ConvTuner().apply_params( - problem_size, mlir_template, compilation_info - ) - - modified = tf_mlir.modified - embeddable = tf_mlir.embeddable - - assert modified - modified = remove_comments(modified) - assert embeddable - assert ( - "intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 4" - in modified - ) - assert ( - "LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64" - in modified - ) - assert "workgroup = [2, 64, 64, 640, 3, 3, 0]" in modified - assert "reduction = [0, 0, 0, 0, 0, 0, 16]" in modified - assert ( - "gpu_pipeline_options = #iree_gpu.pipeline_options>" - in modified - ) - assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}' in modified - - -def 
test_apply_params_contract(tuner_ctx: common.TunerContext) -> None: - mlir_template = [ - ", subgroup_m_count = 2, subgroup_n_count = 2>}>", - "", - '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}', - ] - - tile_dims = "*mnk" - problem_size = common.ProblemSize( - common.MatmulSize(2048, 3840, 1280), - common.ShapedType([2, 1024, 1280], tuner_ctx.type.f16), - common.ShapedType([3, 20, 64, 1280], tuner_ctx.type.f16), - common.ShapedType([3, 2, 20, 1024, 64], tuner_ctx.type.f32), - common.DispatchKind.contraction, - ) - - mma_intrinsic = iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16 - mma_attr = iree_gpu.MMAAttr.get(mma_intrinsic) - lowering_config = common.get_lowering_config( - tuner_ctx=tuner_ctx, - mma_kind=mma_attr, - workgroup=[1, 480, 384, 0], - reduction=[0, 0, 0, 32], - subgroup_m_count=1, - subgroup_n_count=4, - ) - pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get( - iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute - ) - pipeline_options = iree_gpu.PipelineOptionsAttr.get() - config_dict = common.get_translation_info_config(pipeline_options, 2) - translation_info = iree_codegen.TranslationInfoAttr.get( - pipeline_attr, None, [256, 1, 1], 64, config_dict - ) - compilation_info = iree_codegen.CompilationInfoAttr.get( - lowering_config, translation_info - ) - - tf_mlir = candidate_gen.ContractionTuner("mk", "nk", tile_dims).apply_params( - problem_size, mlir_template, compilation_info - ) - - new_mlir = tf_mlir.modified - - assert new_mlir - assert ( - "intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 4" - in new_mlir - ) - assert ( - "LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64" - in new_mlir - ) - assert "workgroup = [1, 480, 384, 0]" in new_mlir - assert "reduction = [0, 0, 0, 32]" in new_mlir - assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}' in new_mlir - - -def test_apply_params_batch_matmul(tuner_ctx: common.TunerContext) -> None: - mlir_template = [ - ", subgroup_m_count = 4, subgroup_n_count = 1>}>", - "", - '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}', - ] - - tile_dims = "bmnk" - problem_size = common.ProblemSize( - common.MatmulSize(968, 320, 640, 64), - common.ShapedType([64, 968, 640], tuner_ctx.type.f16), - common.ShapedType([64, 640, 320], tuner_ctx.type.f16), - common.ShapedType([64, 968, 320], tuner_ctx.type.f32), - common.DispatchKind.batch_matmul, - ) - - mma_intrinsic = iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16 - mma_attr = iree_gpu.MMAAttr.get(mma_intrinsic) - lowering_config = common.get_lowering_config( - tuner_ctx=tuner_ctx, - mma_kind=mma_attr, - workgroup=[1, 416, 320, 0], - reduction=[0, 0, 0, 128], - subgroup_m_count=2, - subgroup_n_count=2, - ) - pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get( - iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute - ) - pipeline_options = iree_gpu.PipelineOptionsAttr.get() - config_dict = common.get_translation_info_config(pipeline_options, 2) - translation_info = iree_codegen.TranslationInfoAttr.get( - pipeline_attr, None, [128, 2, 1], 64, config_dict - ) - compilation_info = iree_codegen.CompilationInfoAttr.get( - lowering_config, translation_info - ) - - tf_mlir = candidate_gen.BatchMatmulTuner("mk", "nk", tile_dims).apply_params( - problem_size, mlir_template, compilation_info - ) - - modified = tf_mlir.modified - embeddable = tf_mlir.embeddable - - assert modified - modified = remove_comments(modified) - - assert embeddable - assert ( - "intrinsic = #iree_gpu.mma_layout, 
subgroup_m_count = 2, subgroup_n_count = 2" - in modified - ) - assert ( - "LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64" - in modified - ) - assert "workgroup = [1, 416, 320, 0]" in modified - assert "reduction = [0, 0, 0, 128]" in modified - assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}' in modified - - -def test_apply_params_batch_mmt_float(tuner_ctx: common.TunerContext) -> None: - mlir_template = [ - ", subgroup_m_count = 4, subgroup_n_count = 1>}>", - "", - '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}', - ] - - problem_size = common.ProblemSize( - common.MatmulSize(4096, 640, 640, 2), - common.ShapedType([2, 4096, 640], tuner_ctx.type.f16), - common.ShapedType([2, 640, 640], tuner_ctx.type.f16), - common.ShapedType([2, 4096, 640], tuner_ctx.type.f32), - common.DispatchKind.batch_mmt, - ) - - mma_intrinsic = iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16 - mma_attr = iree_gpu.MMAAttr.get(mma_intrinsic) - lowering_config = common.get_lowering_config( - tuner_ctx=tuner_ctx, - mma_kind=mma_attr, - workgroup=[1, 128, 64, 0], - reduction=[0, 0, 0, 128], - subgroup_m_count=2, - subgroup_n_count=2, - ) - pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get( - iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute - ) - pipeline_options = iree_gpu.PipelineOptionsAttr.get() - config_dict = common.get_translation_info_config(pipeline_options, 2) - translation_info = iree_codegen.TranslationInfoAttr.get( - pipeline_attr, None, [128, 2, 1], 64, config_dict - ) - compilation_info = iree_codegen.CompilationInfoAttr.get( - lowering_config, translation_info - ) - - tf_mlir = candidate_gen.BatchMmtTuner().apply_params( - problem_size, mlir_template, compilation_info - ) - - modified = tf_mlir.modified - embeddable = tf_mlir.embeddable - - assert embeddable - assert modified - assert ( - "intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2" - in modified - ) - assert ( - "LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64" - in modified - ) - assert "workgroup = [1, 128, 64, 0]" in modified - assert "reduction = [0, 0, 0, 128]" in modified - assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}' in modified - - -def test_apply_params_batch_mmt_int(tuner_ctx: common.TunerContext) -> None: - mlir_template = [ - ", subgroup_m_count = 4, subgroup_n_count = 1>}>", - "", - '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}', - ] - - problem_size = common.ProblemSize( - common.MatmulSize(4096, 640, 640, 2), - common.ShapedType([2, 4096, 640], tuner_ctx.type.i8), - common.ShapedType([2, 640, 640], tuner_ctx.type.i8), - common.ShapedType([2, 4096, 640], tuner_ctx.type.i32), - common.DispatchKind.batch_mmt, - ) - - mma_intrinsic = iree_gpu.MMAIntrinsic.MFMA_I32_32x32x16_I8 - mma_attr = iree_gpu.MMAAttr.get(mma_intrinsic) - lowering_config = common.get_lowering_config( - tuner_ctx=tuner_ctx, - mma_kind=mma_attr, - workgroup=[1, 128, 64, 0], - reduction=[0, 0, 0, 128], - subgroup_m_count=2, - subgroup_n_count=2, - ) - pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get( - iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute - ) - pipeline_options = iree_gpu.PipelineOptionsAttr.get() - config_dict = common.get_translation_info_config(pipeline_options, 4) - translation_info = iree_codegen.TranslationInfoAttr.get( - pipeline_attr, None, [128, 2, 1], 64, config_dict - ) - compilation_info = iree_codegen.CompilationInfoAttr.get( - lowering_config, translation_info - ) - - tf_mlir = 
candidate_gen.BatchMmtTuner().apply_params( - problem_size, mlir_template, compilation_info - ) - - modified = tf_mlir.modified - embeddable = tf_mlir.embeddable - - assert modified - assert "// transform.named_sequence @match_batch_mmt_2x4096x640x640(" in modified - modified = remove_comments(modified) - - assert ( - "intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2" - in modified - ) - assert ( - "LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64" - in modified - ) - assert "workgroup = [1, 128, 64, 0]" in modified - assert "reduction = [0, 0, 0, 128]" in modified - assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}' in modified - - assert embeddable - assert "transform.named_sequence @match_op(" in embeddable - assert ( - "transform.include @match_batch_mmt_i8_i8_i32 failures(propagate)" in embeddable - ) - assert ( - "transform.iree.match.cast_compatible_type %lhs = tensor<2x4096x640xi8> : !transform.any_value" - in embeddable - ) - assert ( - "transform.iree.match.cast_compatible_type %rhs = tensor<2x640x640xi8> : !transform.any_value" - in embeddable - ) - assert ( - "%config = transform.param.constant #iree_codegen.compilation_info<" - in embeddable - ) - assert "workgroup = [1, 128, 64, 0]" in embeddable - assert "reduction = [0, 0, 0, 128]" in embeddable - assert 'llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}' in embeddable - assert "workgroup_size = [128, 2, 1] subgroup_size = 64" in embeddable - - -def test_apply_params_broadcast_rhs_mmt(tuner_ctx: common.TunerContext) -> None: - mlir_template = [ - ", subgroup_m_count = 4, subgroup_n_count = 1>}>", - "", - '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}', - ] - - problem_size = common.ProblemSize( - common.MatmulSize(4096, 640, 640, 2), - common.ShapedType([2, 4096, 640], tuner_ctx.type.i8), - common.ShapedType([640, 640], tuner_ctx.type.i8), - common.ShapedType([2, 4096, 640], tuner_ctx.type.i32), - common.DispatchKind.broadcast_rhs_mmt, - ) - - mma_intrinsic = iree_gpu.MMAIntrinsic.MFMA_I32_32x32x16_I8 - mma_attr = iree_gpu.MMAAttr.get(mma_intrinsic) - lowering_config = common.get_lowering_config( - tuner_ctx=tuner_ctx, - mma_kind=mma_attr, - workgroup=[1, 128, 64, 0], - reduction=[0, 0, 0, 128], - subgroup_m_count=2, - subgroup_n_count=2, - ) - pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get( - iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute - ) - pipeline_options = iree_gpu.PipelineOptionsAttr.get() - config_dict = common.get_translation_info_config(pipeline_options, 4) - translation_info = iree_codegen.TranslationInfoAttr.get( - pipeline_attr, None, [128, 2, 1], 64, config_dict - ) - compilation_info = iree_codegen.CompilationInfoAttr.get( - lowering_config, translation_info - ) - - tf_mlir = candidate_gen.ContractionTuner( - "mk", "nk", "mnk" - ).apply_params_broadcast_rhs_mmt(problem_size, mlir_template, compilation_info) - - modified = tf_mlir.modified - embeddable = tf_mlir.embeddable - - assert modified - assert ( - "// transform.named_sequence @match_broadcast_rhs_mmt_Bx4096x640x640(" - in modified - ) - modified = remove_comments(modified) - - assert ( - "intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2" - in modified - ) - assert ( - "LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64" - in modified - ) - assert "workgroup = [1, 128, 64, 0]" in modified - assert "reduction = [0, 0, 0, 128]" in modified - assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}' in modified 
- - assert embeddable - assert "transform.named_sequence @match_op(" in embeddable - assert ( - "transform.include @match_broadcast_rhs_mmt_i8_i8_i32 failures(propagate)" - in embeddable - ) - assert ( - "transform.iree.match.cast_compatible_type %lhs = tensor : !transform.any_value" - in embeddable - ) - assert ( - "transform.iree.match.cast_compatible_type %rhs = tensor<640x640xi8> : !transform.any_value" - in embeddable - ) - assert ( - "%config = transform.param.constant #iree_codegen.compilation_info<" - in embeddable - ) - assert "workgroup = [1, 128, 64, 0]" in embeddable - assert "reduction = [0, 0, 0, 128]" in embeddable - assert 'llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}' in embeddable - assert "workgroup_size = [128, 2, 1] subgroup_size = 64" in embeddable - - -def test_detect_broadcast_rhs_mmt(tuner_ctx: common.TunerContext) -> None: - mlir_lines = [ - r"%18 = tensor.empty() : tensor<2x1024x10240xi32>", - r"%19 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%c0_i32 : i32) outs(%18 : tensor<2x1024x10240xi32>) -> tensor<2x1024x10240xi32>", - r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<2x1024x1280xi8>, tensor<10240x1280xi8>) outs(%19 : tensor<2x1024x10240xi32>) attrs = {lowering_config = #iree_codegen.lowering_config} {', - ] - assert candidate_gen.ContractionTuner("mk", "nk", "mnk").is_broadcast_rhs_mmt( - mlir_lines - ) diff --git a/tuner/tuner/common.py b/tuner/tuner/common.py index 78e3a8e9d..54051df47 100644 --- a/tuner/tuner/common.py +++ b/tuner/tuner/common.py @@ -45,12 +45,8 @@ def __init__(self, mlir_ctx: ir.Context, logger: logging.Logger): class DispatchKind(Enum): - conv = 1 - mmt = 2 - contraction = 3 - batch_mmt = 4 - batch_matmul = 5 - broadcast_rhs_mmt = 6 + conv = 0 + contraction = 1 @dataclass @@ -108,11 +104,10 @@ def is_comptible(mma_intrinsic: iree_gpu.MMAIntrinsic) -> bool: a_type, b_type, c_type = mma_attr.abc_element_types if not isinstance(problem_size.res_type.element_type, type(c_type)): return False - if problem_size.dispatch_kind != DispatchKind.batch_matmul: - if not isinstance( - problem_size.lhs_type.element_type, type(a_type) - ) or not isinstance(problem_size.rhs_type.element_type, type(b_type)): - return False + if not isinstance( + problem_size.lhs_type.element_type, type(a_type) + ) or not isinstance(problem_size.rhs_type.element_type, type(b_type)): + return False return True return list(filter(is_comptible, mma_intrinsics)) diff --git a/tuner/tuner/common_test.py b/tuner/tuner/common_test.py index 6157bb355..b23360ccc 100644 --- a/tuner/tuner/common_test.py +++ b/tuner/tuner/common_test.py @@ -123,11 +123,13 @@ def test_get_compatible_mfma_intrinsics(tuner_ctx: common.TunerContext) -> None: common.ShapedType([2048, 1280], tuner_ctx.type.f16), common.ShapedType([1280, 1280], tuner_ctx.type.f16), common.ShapedType([2048, 1280], tuner_ctx.type.f32), - common.DispatchKind.mmt, + common.DispatchKind.contraction, ), [ iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, + iree_gpu.MMAIntrinsic.MFMA_I32_16x16x32_I8, + iree_gpu.MMAIntrinsic.MFMA_I32_32x32x16_I8, ], ) == [ iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, @@ -140,9 +142,11 @@ def test_get_compatible_mfma_intrinsics(tuner_ctx: common.TunerContext) -> None: common.ShapedType([2048, 1280], tuner_ctx.type.i8), 
common.ShapedType([1280, 1280], tuner_ctx.type.i8), common.ShapedType([2048, 1280], tuner_ctx.type.i32), - common.DispatchKind.mmt, + common.DispatchKind.contraction, ), [ + iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, + iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, iree_gpu.MMAIntrinsic.MFMA_I32_16x16x32_I8, iree_gpu.MMAIntrinsic.MFMA_I32_32x32x16_I8, ], @@ -151,38 +155,6 @@ def test_get_compatible_mfma_intrinsics(tuner_ctx: common.TunerContext) -> None: iree_gpu.MMAIntrinsic.MFMA_I32_32x32x16_I8, ] - assert common.get_compatible_mfma_intrinsics( - common.ProblemSize( - common.MatmulSize(968, 320, 640, 64), - common.ShapedType([64, 968, 640], tuner_ctx.type.f32), - common.ShapedType([64, 640, 320], tuner_ctx.type.f32), - common.ShapedType([64, 968, 320], tuner_ctx.type.f32), - common.DispatchKind.batch_matmul, - ), - [ - iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, - iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, - ], - ) == [ - iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, - iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, - ] - - assert common.get_compatible_mfma_intrinsics( - common.ProblemSize( - common.MatmulSize(968, 320, 640, 64), - common.ShapedType([64, 968, 640], tuner_ctx.type.f32), - common.ShapedType([64, 640, 320], tuner_ctx.type.f32), - common.ShapedType([64, 968, 320], tuner_ctx.type.f32), - common.DispatchKind.batch_matmul, - ), - [ - iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, - ], - ) == [ - iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, - ] - assert ( common.get_compatible_mfma_intrinsics( common.ProblemSize( @@ -190,9 +162,11 @@ def test_get_compatible_mfma_intrinsics(tuner_ctx: common.TunerContext) -> None: common.ShapedType([64, 968, 640], tuner_ctx.type.f32), common.ShapedType([64, 640, 320], tuner_ctx.type.f32), common.ShapedType([64, 968, 320], tuner_ctx.type.f32), - common.DispatchKind.batch_matmul, + common.DispatchKind.contraction, ), [ + iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, + iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, iree_gpu.MMAIntrinsic.MFMA_I32_16x16x32_I8, iree_gpu.MMAIntrinsic.MFMA_I32_32x32x16_I8, ], diff --git a/tuner/tuner/dispatch_constraints.py b/tuner/tuner/dispatch_constraints.py index 914c04bbf..f6de5179d 100644 --- a/tuner/tuner/dispatch_constraints.py +++ b/tuner/tuner/dispatch_constraints.py @@ -232,15 +232,16 @@ def generate_solutions( problem_size.lhs_type.element_type, problem_size.rhs_type.element_type, ) + workgroup_tiles = [lookup(m), lookup(n), 0] + reduction_tiles = [0, 0, lookup(k)] + if problem_size.dispatch_kind == DispatchKind.conv: + workgroup_tiles = [1, 1, lookup(m), lookup(n), 0, 0, 0] + reduction_tiles = [0, 0, 0, 0, 1, 1, lookup(k)] lowering_config = get_lowering_config( tuner_ctx=tuner_ctx, mma_kind=mma_attr, - workgroup=[lookup(m), lookup(n), 0], - reduction=[ - 0, - 0, - lookup(k), - ], + workgroup=workgroup_tiles, + reduction=reduction_tiles, subgroup_m_count=lookup(sg_m_cnt), subgroup_n_count=lookup(sg_n_cnt), ) diff --git a/tuner/tuner/dispatch_constraints_test.py b/tuner/tuner/dispatch_constraints_test.py index 842ea8509..5c82f555f 100644 --- a/tuner/tuner/dispatch_constraints_test.py +++ b/tuner/tuner/dispatch_constraints_test.py @@ -36,7 +36,7 @@ def test_generate_solutions(tuner_ctx: common.TunerContext) -> None: rhs_type = common.ShapedType([3840, 1280], tuner_ctx.type.f16) res_type = common.ShapedType([2048, 3840], tuner_ctx.type.f32) problem_size = common.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.mmt + matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.contraction ) 
configs = dispatch_constraints.generate_solutions( tuner_ctx, @@ -59,7 +59,7 @@ def test_calculate_shared_memory_usage_in_bytes(tuner_ctx: common.TunerContext) rhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16) res_type = common.ShapedType([1024, 1024], tuner_ctx.type.f32) problem_size = common.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.mmt + matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.contraction ) assert ( dispatch_constraints.calculate_shared_memory_usage_in_bytes( @@ -70,7 +70,7 @@ def test_calculate_shared_memory_usage_in_bytes(tuner_ctx: common.TunerContext) lhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.i8) problem_size = common.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.mmt + matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.contraction ) assert ( dispatch_constraints.calculate_shared_memory_usage_in_bytes( @@ -81,7 +81,7 @@ def test_calculate_shared_memory_usage_in_bytes(tuner_ctx: common.TunerContext) rhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.i32) problem_size = common.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.mmt + matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.contraction ) assert ( dispatch_constraints.calculate_shared_memory_usage_in_bytes( @@ -97,7 +97,7 @@ def test_generate_constraints_valid_input(tuner_ctx: common.TunerContext) -> Non rhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16) res_type = common.ShapedType([1024, 1024], tuner_ctx.type.f32) problem_size = common.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.mmt + matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.contraction ) # Define input parameters as z3 Ints m, n, k = ( @@ -149,7 +149,7 @@ def test_generate_constraints_invalid_input(tuner_ctx: common.TunerContext) -> N rhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16) res_type = common.ShapedType([1024, 1024], tuner_ctx.type.f32) problem_size = common.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.mmt + matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.contraction ) m, n, k = ( z3.Int("m"), diff --git a/tuner/tuner/dispatch_parser.py b/tuner/tuner/dispatch_parser.py index 735d6145c..502968ea8 100644 --- a/tuner/tuner/dispatch_parser.py +++ b/tuner/tuner/dispatch_parser.py @@ -7,65 +7,12 @@ # Given an input dispatch, this code modifies the hyperparameters # in the code and runs it. 
-import math -import re from abc import ABCMeta, abstractmethod from .op_matchers import * from .common import * -def parse_tensor_type(tensor_type: str) -> ShapedType: - shaped_ty = ir.RankedTensorType(ir.Type.parse(tensor_type)) - assert shaped_ty - return ShapedType(shaped_ty.shape, shaped_ty.element_type) - - -def get_contract_workgroup_sizes( - configuration: iree_codegen.CompilationInfoAttr, tile_dims: str -) -> list[int]: - m, n, _k = configuration.lowering_config.workgroup_tile_sizes - - workgroup_size = [1] * len(tile_dims) - for idx, dim in enumerate(tile_dims): - if dim == "m": - workgroup_size[idx] = m - if dim == "n": - workgroup_size[idx] = n - if dim == "k": - workgroup_size[idx] = 0 - - return workgroup_size - - -def get_contract_reduction_sizes( - configuration: iree_codegen.CompilationInfoAttr, tile_dims: str -) -> list[int]: - _m, _n, k = configuration.lowering_config.reduction_tile_sizes - reduction_size = [0] * len(tile_dims) - for idx, dim in enumerate(tile_dims): - if dim == "k": - reduction_size[idx] = k - - return reduction_size - - -class MlirRegex(Enum): - ssa_value = r"%[a-zA-Z0-9-_]+" - tensor_type = r"tensor<([^>]+)>" - - def __str__(self) -> str: - return self.value - - @staticmethod - def dps_ins_two_args() -> str: - return rf"ins\({MlirRegex.ssa_value}, {MlirRegex.ssa_value} : (?P{MlirRegex.tensor_type}), (?P{MlirRegex.tensor_type})\)" - - @staticmethod - def dps_outs_one_arg() -> str: - return rf"outs\({MlirRegex.ssa_value} : (?P{MlirRegex.tensor_type})\)" - - def parse_mlir(mlir_text: str, ctx: TunerContext) -> ir.Module: mlir_module = None try: @@ -179,359 +126,3 @@ def get_shapes(self, template: list[str]) -> ProblemSize: res_type=ShapedType(res_type.shape, res_type.element_type), dispatch_kind=DispatchKind.conv, ) - - -class MmtParser(DispatchParser): - def supports(self, op_name: str) -> bool: - return "matmul_transpose_b" in op_name - - def get_shapes(self, template: list[str]) -> ProblemSize: - mmt_re = None - dps = None - for line in template: - if "linalg.generic" not in line: - continue - if r'iterator_types = ["parallel", "parallel", "reduction"]' not in line: - continue - # ins(%13, %14 : tensor<2048x1280xf16>, tensor<1280x1280xf16>) outs(%19 : tensor<2048x1280xf32>) - mmt_re = rf"{MlirRegex.dps_ins_two_args()}\s+{MlirRegex.dps_outs_one_arg()}" - dps = re.search(mmt_re, line) - if dps is None: - continue - - lhs_tensor_type = dps.group("LHS") - rhs_tensor_type = dps.group("RHS") - lhs_shaped_type = parse_tensor_type(lhs_tensor_type) - assert lhs_shaped_type.rank() == 2 - lhs_M, lhs_K = lhs_shaped_type.shape - - rhs_shaped_type = parse_tensor_type(rhs_tensor_type) - assert rhs_shaped_type.rank() == 2 - rhs_N, rhs_K = rhs_shaped_type.shape - - assert lhs_shaped_type.element_type == rhs_shaped_type.element_type - assert lhs_K == rhs_K - - res_tensor_type = dps.group("RES") - res_shaped_type = parse_tensor_type(res_tensor_type) - assert res_shaped_type.rank() == 2 - res_M, res_N = res_shaped_type.shape - - assert lhs_M == res_M - assert rhs_N == res_N - - matmul_size = MatmulSize( - lhs_shaped_type.shape[0], - rhs_shaped_type.shape[0], - lhs_shaped_type.shape[1], - ) - return ProblemSize( - matmul_size, - lhs_type=lhs_shaped_type, - rhs_type=rhs_shaped_type, - res_type=res_shaped_type, - dispatch_kind=DispatchKind.mmt, - ) - assert mmt_re - assert False, f"'{mmt_re}' not found in given context" - - -class ConvParser(DispatchParser): - def supports(self, op_name: str) -> bool: - return "conv_2d_nhwc_hwcf" in op_name - - def get_shapes(self, 
template: list[str]) -> ProblemSize: - for line in template: - if "linalg.conv_2d_nhwc_hwcf" not in line: - continue - - # ins(%19, %20 : tensor<2x34x34x1280xf16>, tensor<3x3x1280x1280xf16>) outs (%27 : tensor<2x32x32x1280xf32>) - conv_re = ( - rf"{MlirRegex.dps_ins_two_args()}\s+{MlirRegex.dps_outs_one_arg()}" - ) - dps = re.search(conv_re, line) - if dps is None: - continue - - lhs_tensor_type = dps.group("LHS") - rhs_tensor_type = dps.group("RHS") - lhs_shaped_type = parse_tensor_type(lhs_tensor_type) - assert lhs_shaped_type.rank() == 4 - - rhs_shaped_type = parse_tensor_type(rhs_tensor_type) - assert rhs_shaped_type.rank() == 4 - - res_tensor_type = dps.group("RES") - res_shaped_type = parse_tensor_type(res_tensor_type) - assert res_shaped_type.rank() == 4 - - dim_info = ConvDimInfo.from_rhs_res(rhs_shaped_type, res_shaped_type) - return ProblemSize( - MatmulSize( - M=dim_info.oh * dim_info.ow, - N=dim_info.oc, - K=dim_info.fh * dim_info.fw * dim_info.ic, - B=dim_info.n, - ), - lhs_shaped_type, - rhs_shaped_type, - res_shaped_type, - DispatchKind.conv, - ) - - assert False, "Shape not found" - - -class ContractionParser(DispatchParser): - def __init__(self, lhs_dims: str, rhs_dims: str, tile_dims: str): - self.lhs_dims = lhs_dims - self.rhs_dims = rhs_dims - self.tile_dims = tile_dims - - def supports(self, op_name: str) -> bool: - return "matmul_like" in op_name - - def is_broadcast_rhs_mmt_op(self, line: str) -> bool: - if "linalg.generic" not in line: - return False - if ( - r'iterator_types = ["parallel", "parallel", "parallel", "reduction"]' - not in line - ): - return False - if ( - r"indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>" - not in line - ): - return False - return True - - def is_broadcast_rhs_mmt(self, template: list[str]) -> bool: - return any(self.is_broadcast_rhs_mmt_op(line) for line in template) - - def get_shapes_broadcast_rhs_mmt(self, template: list[str]) -> ProblemSize: - for line in template: - if not self.is_broadcast_rhs_mmt_op(line): - continue - - # ins(%11, %12 : tensor<2x1024x1280xi8>, tensor<10240x1280xi8>) outs(%19 : tensor<2x1024x10240xi32>) - bmmt_re = ( - rf"{MlirRegex.dps_ins_two_args()}\s+{MlirRegex.dps_outs_one_arg()}" - ) - dps = re.search(bmmt_re, line) - if dps is None: - continue - - lhs_tensor_type = dps.group("LHS") - rhs_tensor_type = dps.group("RHS") - lhs_shaped_type = parse_tensor_type(lhs_tensor_type) - assert lhs_shaped_type.rank() == 3 - - rhs_shaped_type = parse_tensor_type(rhs_tensor_type) - assert rhs_shaped_type.rank() == 2 - - res_tensor_type = dps.group("RES") - res_shaped_type = parse_tensor_type(res_tensor_type) - assert res_shaped_type.rank() == 3 - - B0, M0, K0 = lhs_shaped_type.shape - N1, K1 = rhs_shaped_type.shape - B2, M2, N2 = res_shaped_type.shape - assert B0 == B2 - assert M0 == M2 - assert N1 == N2 - assert K0 == K1 - return ProblemSize( - MatmulSize(M0, N1, K0, B0), - lhs_shaped_type, - rhs_shaped_type, - res_shaped_type, - DispatchKind.broadcast_rhs_mmt, - ) - - assert False, "Shape not found" - - def get_shapes(self, template: list[str]) -> ProblemSize: - if self.is_broadcast_rhs_mmt(template): - return self.get_shapes_broadcast_rhs_mmt(template) - - for line in template: - if "linalg.generic" not in line: - continue - if "lowering_config =" not in line: - continue - if '"reduction"' not in line: - continue - - # ins(%7, %8 : tensor<2x1024x1280xf16>, tensor<20x64x1280xf16>) - cont_re = ( - 
rf"{MlirRegex.dps_ins_two_args()}\s+{MlirRegex.dps_outs_one_arg()}" - ) - dps = re.search(cont_re, line) - if dps is None: - continue - - lhs_tensor_type = dps.group("LHS") - rhs_tensor_type = dps.group("RHS") - lhs_shaped_type = parse_tensor_type(lhs_tensor_type) - assert lhs_shaped_type.rank() == len(self.lhs_dims) - - rhs_shaped_type = parse_tensor_type(rhs_tensor_type) - assert rhs_shaped_type.rank() == len(self.rhs_dims) - - res_tensor_type = dps.group("RES") - res_shaped_type = parse_tensor_type(res_tensor_type) - assert res_shaped_type.rank() >= 2 - - M = math.prod( - val if dim == "m" else 1 - for dim, val in zip(self.lhs_dims, lhs_shaped_type.shape) - ) - N = math.prod( - val if dim == "n" else 1 - for dim, val in zip(self.rhs_dims, rhs_shaped_type.shape) - ) - K0 = math.prod( - val if dim == "k" else 1 - for dim, val in zip(self.lhs_dims, lhs_shaped_type.shape) - ) - K1 = math.prod( - val if dim == "k" else 1 - for dim, val in zip(self.rhs_dims, rhs_shaped_type.shape) - ) - assert K0 == K1 - - return ProblemSize( - MatmulSize(M, N, K0), - lhs_type=lhs_shaped_type, - rhs_type=rhs_shaped_type, - res_type=res_shaped_type, - dispatch_kind=DispatchKind.contraction, - ) - - assert False, "Shape not found" - - -class BatchMmtParser(DispatchParser): - def supports(self, op_name: str) -> bool: - return "batch_matmul_transpose_b" in op_name - - def get_shapes(self, template: list[str]) -> ProblemSize: - for line in template: - if "linalg.generic" not in line: - continue - if ( - r'iterator_types = ["parallel", "parallel", "parallel", "reduction"]' - not in line - ): - continue - # ins(%11, %12 : tensor<2x4096x640xi8>, tensor<2x640x640xi8>) outs(%19 : tensor<2x4096x640xi32>) - bmmt_re = ( - rf"{MlirRegex.dps_ins_two_args()}\s+{MlirRegex.dps_outs_one_arg()}" - ) - dps = re.search(bmmt_re, line) - if dps is None: - continue - - lhs_tensor_type = dps.group("LHS") - rhs_tensor_type = dps.group("RHS") - lhs_shaped_type = parse_tensor_type(lhs_tensor_type) - assert lhs_shaped_type.rank() == 3 - - rhs_shaped_type = parse_tensor_type(rhs_tensor_type) - assert rhs_shaped_type.rank() == 3 - - res_tensor_type = dps.group("RES") - res_shaped_type = parse_tensor_type(res_tensor_type) - assert res_shaped_type.rank() == 3 - - B0, M0, K0 = lhs_shaped_type.shape - B1, N1, K1 = rhs_shaped_type.shape - B2, M2, N2 = res_shaped_type.shape - assert B0 == B1 - assert B0 == B2 - assert M0 == M2 - assert N1 == N2 - assert K0 == K1 - return ProblemSize( - MatmulSize(M0, N1, K0, B0), - lhs_shaped_type, - rhs_shaped_type, - res_shaped_type, - DispatchKind.batch_mmt, - ) - - assert False, "Shape not found" - - -class BatchMatmulParser(DispatchParser): - def __init__(self, lhs_dims: str, rhs_dims: str, tile_dims: str): - self.lhs_dims = lhs_dims - self.rhs_dims = rhs_dims - self.tile_dims = tile_dims - - def supports(self, op_name: str) -> bool: - return "batch_matmul" in op_name - - def get_shapes(self, template: list[str]) -> ProblemSize: - for line in template: - if "linalg.batch_matmul" not in line: - continue - # ins(%9, %10 : tensor<64x72x1280xf16>, tensor<64x1280x1280xf16>) - # outs(%12 : tensor<64x72x1280xf32>) - cont_re = ( - rf"{MlirRegex.dps_ins_two_args()}\s+{MlirRegex.dps_outs_one_arg()}" - ) - dps = re.search(cont_re, line) - if dps is None: - continue - - lhs_tensor_type = dps.group("LHS") - rhs_tensor_type = dps.group("RHS") - lhs_shaped_type = parse_tensor_type(lhs_tensor_type) - assert lhs_shaped_type.rank() == len(self.lhs_dims) - - rhs_shaped_type = parse_tensor_type(rhs_tensor_type) - assert 
rhs_shaped_type.rank() == len(self.rhs_dims) - - res_tensor_type = dps.group("RES") - res_shaped_type = parse_tensor_type(res_tensor_type) - assert res_shaped_type.rank() == lhs_shaped_type.rank() - - LHS = lhs_shaped_type.shape - RHS = rhs_shaped_type.shape - RES = res_shaped_type.shape - - B = math.prod( - val if dim == "b" else 1 for dim, val in zip(self.lhs_dims, LHS) - ) - B0 = math.prod( - val if dim == "b" else 1 for dim, val in zip(self.lhs_dims, RHS) - ) - B1 = math.prod( - val if dim == "b" else 1 for dim, val in zip(self.lhs_dims, RES) - ) - M = math.prod( - val if dim == "m" else 1 for dim, val in zip(self.lhs_dims, LHS) - ) - N = math.prod( - val if dim == "n" else 1 for dim, val in zip(self.rhs_dims, RHS) - ) - K0 = math.prod( - val if dim == "k" else 1 for dim, val in zip(self.lhs_dims, LHS) - ) - K1 = math.prod( - val if dim == "k" else 1 for dim, val in zip(self.rhs_dims, RHS) - ) - assert B == B0 and B == B1 - assert K0 == K1 - - return ProblemSize( - MatmulSize(M, N, K0, B), - lhs_type=lhs_shaped_type, - rhs_type=rhs_shaped_type, - res_type=res_shaped_type, - dispatch_kind=DispatchKind.batch_matmul, - ) - - assert False, "Shape not found" diff --git a/tuner/tuner/dispatch_parser_test.py b/tuner/tuner/dispatch_parser_test.py index 0b87be659..c35b17bed 100644 --- a/tuner/tuner/dispatch_parser_test.py +++ b/tuner/tuner/dispatch_parser_test.py @@ -32,15 +32,6 @@ def tuner_ctx() -> Generator[common.TunerContext, None, None]: yield common.TunerContext(ctx, logger) -def test_parse_tensor_type(tuner_ctx: common.TunerContext) -> None: - assert dispatch_parser.parse_tensor_type("tensor<1x2x3xf32>") == common.ShapedType( - [1, 2, 3], tuner_ctx.type.f32 - ) - assert dispatch_parser.parse_tensor_type("tensor<123xi8>") == common.ShapedType( - [123], tuner_ctx.type.i8 - ) - - CONTRACTION_TEMPLATE = r""" builtin.module{{ func.func @test(%arg0: {lhs_type}, %arg1: {rhs_type}) -> {res_type} {{ @@ -207,151 +198,6 @@ def test_get_conv_tile_sizes(tuner_ctx: common.TunerContext) -> None: ] -def test_get_contract_tile_sizes(tuner_ctx: common.TunerContext) -> None: - mma_intrinsic = iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16 - mma_attr = iree_gpu.MMAAttr.get(mma_intrinsic) - lowering_config = common.get_lowering_config( - tuner_ctx=tuner_ctx, - mma_kind=mma_attr, - workgroup=[4, 8, 0], - reduction=[0, 0, 16], - subgroup_m_count=1, - subgroup_n_count=1, - ) - pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get( - iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute - ) - pipeline_options = iree_gpu.PipelineOptionsAttr.get() - config_dict = common.get_translation_info_config(pipeline_options, 2) - translation_info = iree_codegen.TranslationInfoAttr.get( - pipeline_attr, None, [16, 16, 1], 32, config_dict - ) - compilation_info = iree_codegen.CompilationInfoAttr.get( - lowering_config, translation_info - ) - assert dispatch_parser.get_contract_workgroup_sizes(compilation_info, "mnk") == [ - 4, - 8, - 0, - ] - assert dispatch_parser.get_contract_reduction_sizes(compilation_info, "mnk") == [ - 0, - 0, - 16, - ] - assert dispatch_parser.get_contract_workgroup_sizes(compilation_info, "nmk") == [ - 8, - 4, - 0, - ] - assert dispatch_parser.get_contract_reduction_sizes(compilation_info, "nmk") == [ - 0, - 0, - 16, - ] - assert dispatch_parser.get_contract_workgroup_sizes(compilation_info, "knm") == [ - 0, - 8, - 4, - ] - assert dispatch_parser.get_contract_reduction_sizes(compilation_info, "knm") == [ - 16, - 0, - 0, - ] - assert 
dispatch_parser.get_contract_workgroup_sizes(compilation_info, "kkk") == [ - 0, - 0, - 0, - ] - assert dispatch_parser.get_contract_reduction_sizes(compilation_info, "kkk") == [ - 16, - 16, - 16, - ] - - -def test_get_shapes_mmt(tuner_ctx: common.TunerContext) -> None: - template = [ - r"%18 = tensor.empty() : tensor<2048x1280xf32>", - r"%19 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f32) outs(%18 : tensor<2048x1280xf32>) -> tensor<2048x1280xf32>", - r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%13, %14 : tensor<2048x1280xf16>, tensor<1280x1280xf16>) outs(%19 : tensor<2048x1280xf32>) attrs = {lowering_config = #iree_codegen.lowering_config} {', - r"^bb0(%in: f16, %in_0: f16, %out: f32):", - ] - assert dispatch_parser.MmtParser().get_shapes(template) == common.ProblemSize( - common.MatmulSize(2048, 1280, 1280), - common.ShapedType([2048, 1280], tuner_ctx.type.f16), - common.ShapedType([1280, 1280], tuner_ctx.type.f16), - common.ShapedType([2048, 1280], tuner_ctx.type.f32), - dispatch_parser.DispatchKind.mmt, - ) - - -def test_get_shapes_conv(tuner_ctx: common.TunerContext) -> None: - template = [ - r"%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f32) outs(%4 : tensor<1x1x32x256xf32>) -> tensor<1x1x32x256xf32>", - r"%8 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%5, %6 : tensor<1x3x34x1280xf16>, tensor<3x3x1280x256xf16>) outs(%7 : tensor<1x1x32x256xf32>) -> tensor<1x1x32x256xf32>", - r"flow.dispatch.tensor.store %8, %2, offsets = [%workgroup_id_z, %workgroup_id_y, 0, %3], sizes = [1, 1, 32, 256], strides = [1, 1, 1, 1] : tensor<1x1x32x256xf32> -> !flow.dispatch.tensor>", - ] - assert dispatch_parser.ConvParser().get_shapes(template) == common.ProblemSize( - common.MatmulSize(32, 256, 11520), - common.ShapedType([1, 3, 34, 1280], tuner_ctx.type.f16), - common.ShapedType([3, 3, 1280, 256], tuner_ctx.type.f16), - common.ShapedType([1, 1, 32, 256], tuner_ctx.type.f32), - dispatch_parser.DispatchKind.conv, - ) - - -def test_get_shapes_contract(tuner_ctx: common.TunerContext) -> None: - template = [ - r"%18 = tensor.empty() : tensor<2048x1280xf32>", - r"%19 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f32) outs(%18 : tensor<2048x1280xf32>) -> tensor<2048x1280xf32>", - r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%13, %14 : tensor<2048x1280xf16>, tensor<1280x1280xf16>) outs(%19 : tensor<2048x1280xf32>) attrs = {lowering_config = #iree_codegen.lowering_config} {', - r"^bb0(%in: f16, %in_0: f16, %out: f32):", - ] - assert dispatch_parser.ContractionParser("mk", "nk", "mnk").get_shapes( - template - ) == common.ProblemSize( - common.MatmulSize(2048, 1280, 1280), - common.ShapedType([2048, 1280], tuner_ctx.type.f16), - common.ShapedType([1280, 1280], tuner_ctx.type.f16), - common.ShapedType([2048, 1280], tuner_ctx.type.f32), - dispatch_parser.DispatchKind.contraction, - ) - - -def test_get_shapes_batch_matmul(tuner_ctx: common.TunerContext) -> None: - template = [ - "%10 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>", - "%11 = 
linalg.batch_matmul ins(%8, %9 : tensor<1x32x1024xf32>, tensor<1x1024x32xf32>) outs(%10 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>", - "flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 32, 32], strides = [1, 1, 1] : tensor<1x32x32xf32> -> !flow.dispatch.tensor>", - ] - assert dispatch_parser.BatchMatmulParser("bmk", "bkn", "mnk").get_shapes( - template - ) == common.ProblemSize( - common.MatmulSize(32, 32, 1024, 1), - common.ShapedType([1, 32, 1024], tuner_ctx.type.f32), - common.ShapedType([1, 1024, 32], tuner_ctx.type.f32), - common.ShapedType([1, 32, 32], tuner_ctx.type.f32), - dispatch_parser.DispatchKind.batch_matmul, - ) - - -def test_get_shapes_batch_mmt(tuner_ctx: common.TunerContext) -> None: - template = [ - r"%19 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%c0_i32 : i32) outs(%18 : tensor<2x4096x640xi32>) -> tensor<2x4096x640xi32>", - r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<2x4096x640xi8>, tensor<2x640x640xi8>) outs(%19 : tensor<2x4096x640xi32>) attrs = {lowering_config = #iree_codegen.lowering_config} {', - r"flow.dispatch.tensor.store %21, %10, offsets = [0, 0, 0], sizes = [2, 4096, 640], strides = [1, 1, 1] : tensor<2x4096x640xf16> -> !flow.dispatch.tensor>", - ] - assert dispatch_parser.BatchMmtParser().get_shapes(template) == common.ProblemSize( - common.MatmulSize(4096, 640, 640, 2), - common.ShapedType([2, 4096, 640], tuner_ctx.type.i8), - common.ShapedType([2, 640, 640], tuner_ctx.type.i8), - common.ShapedType([2, 4096, 640], tuner_ctx.type.i32), - dispatch_parser.DispatchKind.batch_mmt, - ) - - def test_parse_mlir(tuner_ctx: common.TunerContext) -> None: mlir_str = r""" builtin.module { diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py index ff7b78a11..4e2a97ec8 100644 --- a/tuner/tuner/libtuner.py +++ b/tuner/tuner/libtuner.py @@ -6,11 +6,9 @@ """ Provides fundamental functions for tuning: - - generate_candidates() - - compile_dispatches() - - benchmark_dispatches() - - compile_models() - - benchmark_models() + - generate_candidate_specs() + - compile() + - benchmark() Requires a wrapper Python script to import `libtuner`, use the `TuningClient` API, customize compilation and benchmarking commands, @@ -20,9 +18,9 @@ import math import signal +import subprocess import sys import shutil -import subprocess import logging import argparse from datetime import datetime @@ -32,13 +30,9 @@ import multiprocessing import queue from tqdm import tqdm -import re import hashlib from dataclasses import dataclass, field from typing import Type, Optional, Callable, Iterable, Any -import pickle -import random -import json from abc import ABC, abstractmethod import iree.runtime as ireert # type: ignore import iree.compiler as ireec # type: ignore @@ -66,78 +60,32 @@ DEVICE_ID_PLACEHOLDER = "!DEVICE_ID!" -# TODO(Max191): Remove most of the fields here after refactoring is complete, -# since many of them will be unused. 
@dataclass class CandidateTracker: candidate_id: int mlir_path: Optional[Path] = None - dispatch_mlir_path: Optional[Path] = None - dispatch_config_path: Optional[Path] = None - configuration: Optional[candidate_gen.iree_codegen.CompilationInfoAttr] = None - compilation_successful: Optional[bool] = None compiled_vmfb_path: Optional[Path] = None - compiled_dispatch_path: Optional[Path] = None - compiled_dispatch_hash: Optional[str] = None - first_benchmark_time: Optional[float] = None - first_benchmark_device_id: Optional[str] = None spec_path: Optional[Path] = None - compiled_model_path: Optional[Path] = None - compiled_model_hash: Optional[str] = None - model_benchmark_time: Optional[float] = None - model_benchmark_device_id: Optional[str] = None - baseline_benchmark_time: Optional[float] = None - calibrated_benchmark_diff: Optional[float] = None @dataclass() class PathConfig: - # Preset constants - global_config_prolog_mlir: Path = Path("config_prolog.mlir") - global_config_epilog_mlir: Path = Path("config_epilog.mlir") - model_baseline_vmfb: Path = Path("baseline.vmfb") - # Dynamic paths base_dir: Path = field(init=False) - local_config_prolog_mlir: Path = field(init=False) - local_config_epilog_mlir: Path = field(init=False) template_mlir: Path = field(init=False) candidates_dir: Path = field(init=False) - candidate_configs_pkl: Path = field(init=False) compiled_dir: Path = field(init=False) - compile_failed_dir: Path = field(init=False) specs_dir: Path = field(init=False) - output_unilog: Path = field(init=False) - result_summary_log: Path = field(init=False) - candidate_trackers_pkl: Path = field(init=False) - # To be set outside of class run_log: Optional[Path] = field(init=False, default=None) def __post_init__(self): object.__setattr__(self, "base_dir", self._name_base_dir()) - object.__setattr__( - self, "local_config_prolog_mlir", self.base_dir / "config_prolog.mlir" - ) - object.__setattr__( - self, "local_config_epilog_mlir", self.base_dir / "config_epilog.mlir" - ) object.__setattr__(self, "template_mlir", self.base_dir / "template.mlir") object.__setattr__(self, "candidates_dir", self.base_dir / "candidates") - object.__setattr__( - self, "candidate_configs_pkl", self.candidates_dir / "configs.pkl" - ) object.__setattr__(self, "compiled_dir", self.candidates_dir / "compiled") - object.__setattr__(self, "compile_failed_dir", self.candidates_dir / "failed") object.__setattr__(self, "specs_dir", self.candidates_dir / "specs") - object.__setattr__(self, "output_unilog", self.base_dir / "output.log") - object.__setattr__( - self, "result_summary_log", self.base_dir / "result_summary.log" - ) - object.__setattr__( - self, "candidate_trackers_pkl", self.base_dir / "candidate_trackers.pkl" - ) def _name_base_dir(self) -> Path: timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M") @@ -147,27 +95,12 @@ def _name_base_dir(self) -> Path: def _set_run_log(self, run_log: Path): object.__setattr__(self, "run_log", run_log) - def get_candidate_mlir_path(self, candidate_id: int) -> Path: - return self.candidates_dir / f"{candidate_id}.mlir" - - def get_candidate_spec_mlir_path(self, candidate_id: int) -> Path: - return self.candidates_dir / "specs" / f"{candidate_id}_spec.mlir" - - def get_exe_format(self, path: Path) -> str: - return f"./{path.as_posix()}" - - def get_compiled_dispatch_index(self, file_path: Path) -> int: - return int(file_path.stem) - def get_candidate_spec_filename(self, candidate_id: int) -> str: return f"{candidate_id}_spec.mlir" def get_candidate_vmfb_filename(self, 
candidate_id: int) -> str: return f"{candidate_id}.vmfb" - def get_compiled_model_index(self, file_path: Path) -> int: - return int(file_path.stem.split("_")[-1]) - class TuningClient(ABC): def __init__(self): @@ -183,50 +116,10 @@ def get_iree_compile_flags(self) -> list[str]: def get_iree_benchmark_module_flags(self) -> list[str]: pass - @abstractmethod - def get_dispatch_compile_command( - self, candidate_tracker: CandidateTracker - ) -> list[str]: - pass - - @abstractmethod - def get_dispatch_benchmark_command( - self, candidate_tracker: CandidateTracker - ) -> list[str]: - pass - - @abstractmethod - def get_model_compile_command( - self, candidate_tracker: CandidateTracker - ) -> list[str]: - pass - - @abstractmethod - def get_model_benchmark_command( - self, candidate_tracker: CandidateTracker - ) -> list[str]: - pass - @abstractmethod def get_benchmark_timeout_s(self) -> int: pass - @abstractmethod - def get_dispatch_compile_timeout_s(self) -> int: - pass - - @abstractmethod - def get_dispatch_benchmark_timeout_s(self) -> int: - pass - - @abstractmethod - def get_model_compile_timeout_s(self) -> int: - pass - - @abstractmethod - def get_model_benchmark_timeout_s(self) -> int: - pass - @dataclass class CompilePack: @@ -241,42 +134,6 @@ class BenchmarkPack: candidate_tracker: CandidateTracker -@dataclass -class RunPack: - command: list[str] - check: bool = True - timeout_seconds: Optional[int] = None - - -@dataclass -class RunResult: - process_res: Optional[subprocess.CompletedProcess] - is_timeout: bool - - -@dataclass -class TaskPack: - run_pack: RunPack - candidate_id: int - command_need_device_id: bool = False - cooling_time: int = 0 - - -@dataclass -class TaskResult: - run_result: RunResult - candidate_id: int - device_id: str - - -@dataclass -class ParsedDisptachBenchmarkResult: - candidate_id: int - benchmark_time_in_seconds: float - candidate_mlir: Path - candidate_spec_mlir: Path - - @dataclass class BenchmarkResult: candidate_id: int @@ -284,75 +141,17 @@ class BenchmarkResult: device_id: str -@dataclass -class IREEBenchmarkResult: - # Default format follows output of iree-benchmark-module - candidate_id: int - - # A list of dictionaries, each representing a benchmark result - # Each dictionary contains fields like: aggregate_name: string, real_time: float, cpu_time: float, time_unit: str, repetitions: int, etc. 
- result_json: list[dict[str, Any]] - - def get_mean_time_us(self) -> Optional[float]: - """Compute the mean time (in microseconds) for all of the benchmarks""" - if not self.result_json: - return None +def unit_to_microseconds(real_time: float, time_unit: str) -> float: + unit_conversions = { + "s": 1e6, + "ms": 1e3, + "us": 1, + "ns": 1e-3, + } - mean_benchmark = self.find_mean_benchmark(self.result_json) + assert time_unit in unit_conversions, f"Unsupported time unit: {time_unit}" - if mean_benchmark: - real_time: float | None = mean_benchmark.get("real_time") - time_unit: str | None = mean_benchmark.get("time_unit") - - if real_time is not None: - assert time_unit is not None - return self.unit_to_microseconds(real_time, time_unit) - - return None - - @staticmethod - def find_mean_benchmark(result_json: list[dict[str, Any]]) -> Optional[dict]: - for benchmark in result_json: - if benchmark.get("aggregate_name") == "mean": - return benchmark - - return None - - @staticmethod - def unit_to_microseconds(real_time: float, time_unit: str) -> float: - unit_conversions = { - "s": 1e6, - "ms": 1e3, - "us": 1, - "ns": 1e-3, - } - - assert time_unit in unit_conversions, f"Unsupported time unit: {time_unit}" - - return real_time * unit_conversions[time_unit] - - -def generate_display_DBR(candidate_id: int, mean_time: float) -> str: - """Generate dispatch_benchmark_result string for displaying""" - return f"{candidate_id}\tMean Time: {mean_time:.1f}" - - -def generate_display_MBR( - candidate_vmfb_path_str: str, - device_id: str, - t1: float, - calibrated_diff: Optional[float] = None, -) -> str: - """Generate model_benchmark_result string for displaying""" - if calibrated_diff: - percentage_change = calibrated_diff * 100 - change_str = f"({percentage_change:+.3f}%)" - res_str = f"Benchmarking: {candidate_vmfb_path_str} on device {device_id}: {t1:.3g} {change_str}" - else: - res_str = ( - f"Benchmarking: {candidate_vmfb_path_str} on device {device_id}: {t1:.3g}" - ) - return res_str + return real_time * unit_conversions[time_unit] def extract_driver_names(user_devices: list[str]) -> set[str]: @@ -605,85 +404,6 @@ def create_worker_context_queue(device_ids: list[int]) -> queue.Queue[tuple[int, return worker_contexts_queue -def run_command(run_pack: RunPack) -> RunResult: - command = run_pack.command - check = run_pack.check - timeout_seconds = run_pack.timeout_seconds - - result = None - is_timeout = False - try: - # Convert the command list to a command string for logging - command_str = " ".join(command) - logging.debug(f"Run: {command_str}") - - # Add timeout to subprocess.run call - result = subprocess.run( - command, - check=check, - capture_output=True, - text=True, - timeout=timeout_seconds, - ) - - if result.stdout: - logging.debug(f"stdout: {result.stdout}") - if result.stderr: - logging.debug(f"stderr: {result.stderr}") - except subprocess.TimeoutExpired as e: - logging.warning( - f"Command '{command_str}' timed out after {timeout_seconds} seconds." - ) - is_timeout = True - except subprocess.CalledProcessError as e: - print(e.output) - logging.error( - f"Command '{command_str}' returned non-zero exit status {e.returncode}." 
- ) - logging.error(f"Command '{command_str}' failed with error: {e.stderr}") - if check: - raise - except KeyboardInterrupt: - print("Ctrl+C detected, terminating child processes...") - - return RunResult(result, is_timeout) - - -# The `strip_root_op_attr` and `strip_compilation_info` functions are used for -# getting consistent inputs to the compilation step in tuning. Inputs may come -# in with lowering configs, translation info, and root_op attrs when the input -# is a benchmark, but not when the input is a source MLIR file. Stripping the -# info makes the inputs to compilation consistent, and allows for overwriting -# the compilation info with generated TD specs during codegen. -def strip_root_op_attr(module: ir.Module): - root_ops: list[ir.Operation] = get_ops_from_module(module, is_root_op) - for root_op in root_ops: - assert ( - ROOT_OP_ATTR_NAME in root_op.opview.attributes - ), f"expected root op to have '{ROOT_OP_ATTR_NAME}' attr" - del root_op.opview.attributes[ROOT_OP_ATTR_NAME] - - -# See the above comment for `strip_root_op_attr`. -def strip_compilation_info(input_path: Path) -> str: - # Strip compilation info from the source and save the stripped IR - strip_command = [ - f"iree-opt", - f"{input_path}", - f"--iree-codegen-strip-compilation-info", - ] - result = run_command( - RunPack( - command=strip_command, - check=True, - ) - ) - assert ( - result.process_res is not None - ), "expected result from stripping compilation info" - return result.process_res.stdout - - def run_iree_compile_command(compile_pack: CompilePack) -> Optional[int]: candidate_tracker = compile_pack.candidate_tracker @@ -790,11 +510,21 @@ def run_iree_benchmark_module_command(benchmark_pack: BenchmarkPack): assert ( len(time_and_unit) == 2 ), "expected the benchmark time to be the time and unit separated by a space." - time_us = IREEBenchmarkResult.unit_to_microseconds( + time_us = unit_to_microseconds( real_time=float(time_and_unit[0]), time_unit=time_and_unit[1], ) times.append(time_us) + + # If there are no times, then benchmarking failed at runtime. Record the + # time as math.inf. 
+ if len(times) == 0: + return BenchmarkResult( + candidate_id=candidate_id, + time=math.inf, + device_id=str(device_id), + ) + mean_benchmark_time = sum(times) / float(len(times)) logging.debug(f"Benchmark time of candidate {candidate_id}: {mean_benchmark_time}") return BenchmarkResult( @@ -804,30 +534,6 @@ def run_iree_benchmark_module_command(benchmark_pack: BenchmarkPack): ) -def run_command_wrapper(task_pack: TaskPack) -> TaskResult: - """Help handle extra requirements and record more data for run_command()""" - if task_pack.command_need_device_id: - # Worker searches for the special symbol and substitutes it with the actual device_id - pattern = re.compile(re.escape(DEVICE_ID_PLACEHOLDER)) - task_pack.run_pack.command = [ - pattern.sub(str(device_id), s) for s in task_pack.run_pack.command - ] - - run_result = run_command(task_pack.run_pack) - - task_result = TaskResult( - run_result, task_pack.candidate_id, device_id=str(-1) - ) # Main process - if device_id: - task_result = TaskResult( - run_result, task_pack.candidate_id, device_id - ) # Subprocess - - time.sleep(task_pack.cooling_time) - - return task_result - - def multiprocess_progress_wrapper( num_worker: int, task_list: list, @@ -861,44 +567,6 @@ def multiprocess_progress_wrapper( return results -def extract_benchmark_from_run_result( - run_result: RunResult, -) -> Optional[list[dict[str, Any]]]: - """Extract the benchmark from the result JSON""" - if run_result.process_res and run_result.process_res.stdout: - try: - result_json = json.loads(run_result.process_res.stdout) - - return result_json.get("benchmarks", None) - except json.JSONDecodeError as e: - handle_error( - condition=True, - msg=f"Failed to parse JSON from stdout: {e}", - error_type=ValueError, - exit_program=True, - ) - - return None - - -def numerical_sort_key(path: Path) -> tuple[int | float, str]: - """ - Define a sort key function that splits the filename into a numeric and a string part. - Order: 0 | 0_a | 0_b | 1 | 1_a | 2 - """ - numeric_part: int | float - # Extract the numeric part at the start of the filename - match = re.match(r"(\d+)", path.stem) - if match: - numeric_part = int(match.group(1)) - # The rest of the filename after the numeric part - remaining_part = path.stem[len(match.group(0)) :] - else: - numeric_part = float("inf") - remaining_part = path.stem - return (numeric_part, remaining_part) - - def calculate_md5(file_path: Path) -> str: md5 = hashlib.md5() with open(file_path, "rb") as f: @@ -933,111 +601,6 @@ def find_collisions( return collisions_exist, hash_values -def load_pickle(file_path: Path) -> list[Any]: - handle_error( - condition=(not file_path.exists()), - msg=f"Configuration file not found: {file_path}", - error_type=FileNotFoundError, - ) - with open(file_path, "rb") as file: - loaded_array = pickle.load(file) - return loaded_array - - -def save_pickle(file_path: Path, input_list: list[Any]) -> None: - with open(file_path, "wb") as file: - pickle.dump(input_list, file) - - -def append_to_file(lines: list[str], filepath: Path, title: str = "") -> None: - """Appends new content to the end of the output.log.""" - title_str = "=" * 5 + f" {title} " + "=" * 5 + "\n" if title != "" else "" - with open(filepath, "a") as file: - file.write(title_str) - file.writelines(lines) - file.write("\n") - - -# TODO(Max191): Remove in favor of using generate_candidate_specs. 
-def generate_candidates( - args: argparse.Namespace, - path_config: PathConfig, - candidate_trackers: list[CandidateTracker], -) -> list[int]: - """Generate candidate files for tuning. Returns the list of candidate indexes""" - logging.debug("generate_candidates()") - - try: - shutil.copy( - path_config.global_config_epilog_mlir, path_config.local_config_epilog_mlir - ) - shutil.copy( - path_config.global_config_prolog_mlir, path_config.local_config_prolog_mlir - ) - except FileNotFoundError as e: - handle_error( - condition=True, - msg=f"Configuration file not found: {e}", - error_type=FileNotFoundError, - ) - - shutil.copy(args.input_file, path_config.template_mlir) - - mlirs = [] - try: - logging.debug("Captured messages from candidate_gen.py:") - candidate_gen.tune( - input=str(path_config.template_mlir), - output=str(path_config.candidates_dir), - limit=args.num_candidates, - num_subgroups=args.num_subgroups, - lhs_dims=args.lhs_dims, - rhs_dims=args.rhs_dims, - tile_dims=args.tile_dims, - ) - mlirs = sorted( - path_config.candidates_dir.glob("*.mlir"), key=numerical_sort_key - ) - except Exception as e: - logging.error("An error occurred during candidates generation: %s", str(e)) - # Capture and log debug messages from candidate_gen.py - tune_logger = logging.getLogger("tune") - for handler in logging.getLogger().handlers: - if isinstance(handler, logging.FileHandler): - tune_logger.handlers.append(handler) - tune_logger.exception("Error in candidate_gen.py:") - raise - logging.debug("candidate_gen.py ends") - - candidate_configs = load_pickle(path_config.candidate_configs_pkl) - candidate_configs.insert(0, None) # No Configuration class for 0.mlir - - # Create candidate trackers - assert len(mlirs) // 2 + 1 == len(candidate_configs) - candidates = [] - for mlir in mlirs: - if "_config.mlir" not in mlir.name: - candidates.append(int(mlir.stem)) - new_candidate = CandidateTracker( - candidate_id=int(mlir.stem), - dispatch_mlir_path=mlir, - configuration=candidate_configs[int(mlir.stem)], - ) - candidate_trackers.append(new_candidate) - else: - candidate_trackers[ - int(mlir.stem.split("_config")[0]) - ].dispatch_config_path = mlir - - handle_error( - condition=(len(candidates) == 0), msg="Failed to generate any candidates" - ) - - logging.info(f"Generated [{len(candidates)}] candidates") - - return candidates - - def generate_candidate_specs( args: argparse.Namespace, path_config: PathConfig, @@ -1056,7 +619,7 @@ def generate_candidate_specs( # Strip compilation info before generating td_specs, since the generated # td_specs can end up matching against the compilation info from the # source mlir. - mlir_text = strip_compilation_info(path_config.template_mlir) + mlir_text = candidate_gen.strip_compilation_info(path_config.template_mlir) mlir_module = dispatch_parser.parse_mlir(mlir_text, tuning_client.tuner_context) with tuning_client.tuner_context.mlir_ctx: logging.debug("Captured messages from candidate_gen.py:") @@ -1140,10 +703,10 @@ def compile( # Strip compilation info and root_op attribute from the source and save # the stripped IR, since the TD specs do not expect these attributes. 
- stripped_mlir = strip_compilation_info(path_config.template_mlir) + stripped_mlir = candidate_gen.strip_compilation_info(path_config.template_mlir) context = tuning_client.tuner_context.mlir_ctx stripped_module = ir.Module.parse(stripped_mlir, context=context) - strip_root_op_attr(stripped_module) + candidate_gen.strip_root_op_attr(stripped_module) stripped_mlir = str(stripped_module) with open(path_config.template_mlir, "w") as f: f.write(stripped_mlir) @@ -1200,273 +763,6 @@ def compile( return compiled_candidates -# TODO(Max191): Remove in favor of using `compile` for both model and dispatch -# tuning. -def compile_dispatches( - args: argparse.Namespace, - path_config: PathConfig, - candidates: list[int], - candidate_trackers: list[CandidateTracker], - tuning_client: TuningClient, -) -> list[int]: - logging.debug("compile_dispatches()") - - if not candidates: - logging.warning("No candidates to compile.") - return [] - - path_config.compiled_dir.mkdir(parents=True, exist_ok=True) - path_config.compile_failed_dir.mkdir(parents=True, exist_ok=True) - path_config.specs_dir.mkdir(parents=True, exist_ok=True) - - task_list = [ - TaskPack( - RunPack( - command=tuning_client.get_dispatch_compile_command( - candidate_trackers[i] - ), - check=False, - timeout_seconds=tuning_client.get_dispatch_compile_timeout_s(), - ), - candidate_id=i, - ) - for i in candidates - ] - num_worker = min(args.max_cpu_workers, len(task_list)) - multiprocess_progress_wrapper( - num_worker=num_worker, task_list=task_list, function=run_command_wrapper - ) - - # Note: failed/incomplete candidates can also be detected by checking if subprocess.res is None - compiled_files = sorted( - path_config.compiled_dir.glob("*.vmfb"), key=numerical_sort_key - ) - failed_files = sorted( - path_config.compile_failed_dir.glob("*.mlir"), key=numerical_sort_key - ) - - total, good, bad = len(task_list), len(compiled_files), len(failed_files) - compiling_rate = good / total * 100 - logging.info( - f"Total: {total} | Compiled: {good} | Failed: {bad} | Compiling Rate: {compiling_rate:.1f}%" - ) - - # Update candidate tracker - for failed_file in failed_files: - index = path_config.get_compiled_dispatch_index(failed_file) - candidate_trackers[index].compilation_successful = False - compiled_candidates = [] - compiled_candidates_hash_list = [] - for compiled_file in compiled_files: - index = path_config.get_compiled_dispatch_index(compiled_file) - compiled_candidates.append(index) - candidate_trackers[index].compilation_successful = True - candidate_trackers[index].compiled_dispatch_path = compiled_file - compiled_vmfb_path = candidate_trackers[index].compiled_dispatch_path - assert compiled_vmfb_path is not None - hash_val = calculate_md5(compiled_vmfb_path) - candidate_trackers[index].compiled_dispatch_hash = hash_val - compiled_candidates_hash_list.append((index, hash_val)) - - handle_error( - condition=(good == 0), - msg="All candidate dispatches .mlir files failed to compile", - ) - handle_error( - condition=(compiling_rate < 10), - msg=f"Compiling rate [{compiling_rate:.1f}%] < 10%", - level=logging.WARNING, - ) - - collision_detected, unique_indexes = collision_handler( - compiled_candidates_hash_list - ) - if collision_detected: - logging.info(f"Remains [{len(unique_indexes)}] unique candidate indexes") - - return compiled_candidates if not collision_detected else unique_indexes - - -def parse_dispatch_benchmark_results( - path_config: PathConfig, - benchmark_results: list[TaskResult], - candidate_trackers: list[CandidateTracker], 
-) -> tuple[list[ParsedDisptachBenchmarkResult], list[str]]: - benchmark_result_configs = [] - dump_list = [] - incomplete_list = [] - - for benchmark_result in benchmark_results: - candidate_id = benchmark_result.candidate_id - process_res = benchmark_result.run_result.process_res - - if not process_res: - if benchmark_result.run_result.is_timeout: - incomplete_list.append(candidate_id) - continue - - res_json = extract_benchmark_from_run_result(benchmark_result.run_result) - assert res_json is not None - res = IREEBenchmarkResult(candidate_id, res_json) - benchmark_time = res.get_mean_time_us() - assert benchmark_time is not None - candidate_trackers[candidate_id].first_benchmark_time = benchmark_time - candidate_trackers[ - candidate_id - ].spec_path = path_config.specs_dir / path_config.get_candidate_spec_filename( - candidate_id - ) - mlir_path = candidate_trackers[candidate_id].dispatch_mlir_path - spec_path = candidate_trackers[candidate_id].spec_path - assert mlir_path is not None and spec_path is not None - dump_list.append(generate_display_DBR(candidate_id, benchmark_time) + "\n") - - benchmark_result_configs.append( - ( - ParsedDisptachBenchmarkResult( - candidate_id, - benchmark_time, - mlir_path, - spec_path, - ) - ) - ) - - if incomplete_list: - dump_list += [f"Candidate {i} not completed" for i in incomplete_list] - - return benchmark_result_configs, dump_list - - -def generate_sample_task_result( - stdout: str, candidate_id: int, device_id: str -) -> TaskResult: - res = subprocess.CompletedProcess( - args=[""], - stdout=stdout, - returncode=0, - ) - run_result = RunResult(res, False) - return TaskResult( - run_result=run_result, candidate_id=candidate_id, device_id=device_id - ) - - -def generate_dryrun_dispatch_benchmark_results( - compiled_candidates: list[int], -) -> list[TaskResult]: - logging.debug("generate_dryrun_dispatch_benchmark_results()") - - task_results = [ - generate_sample_task_result( - f"process_time/real_time_mean {random.uniform(100.0, 500.0):.3g} ms", - i, - str(0), - ) - for i in compiled_candidates - ] - - return task_results - - -def generate_dryrun_model_benchmark_results( - model_candidates: list[int], -) -> tuple[list[TaskResult], list[TaskResult]]: - candidate_results = [] - for i, j in enumerate(model_candidates): - stdout = f"process_time/real_time_mean {random.uniform(100.0, 500.0):.3g} ms" - candidate_results.append(generate_sample_task_result(stdout, j, str(i % 3))) - - baseline_results = [ - generate_sample_task_result( - f"process_time/real_time_mean {random.uniform(100.0, 500.0):.3g} ms", - 0, - str(i), - ) - for i in range(3) - ] - - return candidate_results, baseline_results - - -# TODO(Max191): Remove this function in favor of `benchmark`. 
-def benchmark_dispatches( - args: argparse.Namespace, - path_config: PathConfig, - compiled_candidates: list[int], - candidate_trackers: list[CandidateTracker], - tuning_client: TuningClient, -): - logging.debug("benchmark_dispatches()") - - if args.dry_run: - benchmark_results = generate_dryrun_dispatch_benchmark_results( - compiled_candidates - ) - else: - # Benchmarking dispatch candidates - task_list = [ - TaskPack( - RunPack( - command=tuning_client.get_dispatch_benchmark_command( - candidate_trackers[i] - ), - check=False, - timeout_seconds=tuning_client.get_dispatch_benchmark_timeout_s(), - ), - candidate_id=i, - command_need_device_id=True, - ) - for i in compiled_candidates - ] - worker_context_queue = create_worker_context_queue(args.devices) - benchmark_results = multiprocess_progress_wrapper( - num_worker=len(args.devices), - task_list=task_list, - function=run_command_wrapper, - initializer=init_worker_context, - initializer_inputs=(worker_context_queue,), - ) - - ( - parsed_benchmark_results, - dispatch_benchmark_dump_list, - ) = parse_dispatch_benchmark_results( - path_config, benchmark_results, candidate_trackers - ) - append_to_file( - dispatch_benchmark_dump_list, - filepath=path_config.output_unilog, - title="All Dispatch Benchmark Results", - ) - - benchmarking_rate = (len(parsed_benchmark_results) / len(benchmark_results)) * 100 - logging.info( - f"Total: {len(benchmark_results)} | Benchmarked: {len(parsed_benchmark_results)} | Failed: {len(benchmark_results) - len(parsed_benchmark_results)} | Benchmarking Rate: {benchmarking_rate:.1f}%" - ) - handle_error( - condition=(len(benchmark_results) == 0), - msg="Failed to benchmark all candidate .vmfb files", - ) - - # Select top candidates - best_results = sorted( - parsed_benchmark_results, key=lambda x: float(x.benchmark_time_in_seconds) - )[: args.num_model_candidates] - logging.info(f"Selected top[{len(best_results)}]") - - dump_list = [ - f"{result.benchmark_time_in_seconds}\t{result.candidate_mlir.as_posix()}\t{result.candidate_spec_mlir.as_posix()}\n" - for result in best_results - ] - append_to_file( - dump_list, filepath=path_config.output_unilog, title="Top Candidates Results" - ) - - top_candidates = [result.candidate_id for result in best_results] - return top_candidates - - def benchmark( args: argparse.Namespace, path_config: PathConfig, @@ -1533,317 +829,3 @@ def get_speedup(result: BenchmarkResult) -> float: top_candidates = [result.candidate_id for result in best_results] return top_candidates - - -# TODO(Max191): Remove in favor of using `compile` for both model and dispatch -# tuning. 
-def compile_models( - args: argparse.Namespace, - path_config: PathConfig, - candidates: list[int], - candidate_trackers: list[CandidateTracker], - tuning_client: TuningClient, -) -> list[int]: - logging.debug("compile_models()") - - candidate_trackers[0].compiled_model_path = path_config.model_baseline_vmfb - - if args.dry_run: - for i in candidates: - candidate_trackers[i].compiled_model_path = Path(f"model_{i}.vmfb") - return candidates - - if not candidates: - logging.warning("No model candidates to compile.") - return [] - - task_list = [ - TaskPack( - RunPack( - command=tuning_client.get_model_compile_command(candidate_trackers[i]), - check=False, - timeout_seconds=tuning_client.get_model_compile_timeout_s(), - ), - candidate_id=i, - ) - for i in candidates - if i != 0 - ] - num_worker = min(args.max_cpu_workers, len(task_list)) - multiprocess_progress_wrapper( - num_worker=num_worker, task_list=task_list, function=run_command_wrapper - ) - - model_candidates_files = list(path_config.base_dir.glob("*.vmfb")) - - model_candidates_indexes = [] - model_candidates_hash_list = [] - - # Update candidate tracker - for model_candidate in model_candidates_files: - assert model_candidate is not None - index = path_config.get_compiled_model_index(model_candidate) - candidate_trackers[index].compiled_model_path = model_candidate - hash_val = calculate_md5(model_candidate) - candidate_trackers[index].compiled_model_hash = hash_val - model_candidates_hash_list.append((index, hash_val)) - model_candidates_indexes.append(index) - - # Check if model candidate produces tbe same .vmfb - collision_detected, unique_model_candidates_indexes = collision_handler( - model_candidates_hash_list - ) - - if collision_detected: - logging.info( - f"Remains [{len(unique_model_candidates_indexes)}] unique candidate indexes" - ) - - return ( - unique_model_candidates_indexes - if collision_detected - else model_candidates_indexes - ) - - -def group_benchmark_results_by_device_id( - benchmark_results: list[TaskResult], -) -> list[list[TaskResult]]: - """ - Groups benchmark results by device ID. - - e.g. 
- [TaskResult(res1, device_1), TaskResult(res2, device_2), TaskResult(res3, device_1)] - -----> - [ [TaskResult(res1, device_1), TaskResult(res3, device_1)], [TaskResult(res2, device_2)] ] - """ - grouped_results: dict[str, list[TaskResult]] = {} - for result in benchmark_results: - assert result.device_id is not None - if result.device_id not in grouped_results: - grouped_results[result.device_id] = [] - grouped_results[result.device_id].append(result) - - grouped_benchmark_results = [ - grouped_results[device_id] for device_id in sorted(grouped_results) - ] - - return grouped_benchmark_results - - -def parse_model_benchmark_results( - candidate_trackers: list[CandidateTracker], - candidate_results: list[TaskResult], - baseline_results: list[TaskResult], -): - """Update candidate_tracker and format a list of result strings to be saved later.""" - candidate_results = sorted(candidate_results, key=lambda br: br.device_id) - baseline_results = sorted(baseline_results, key=lambda tr: tr.device_id) - - # Assign candidates to the same groups by device_id - grouped_candidate_results = group_benchmark_results_by_device_id(candidate_results) - - # Insert baseline results to the head of each list - grouped_benchmark_results = [ - [x] + y for x, y in zip(baseline_results, grouped_candidate_results) - ] - - dump_list = [] - incomplete_list: list[ - tuple[int, Optional[str]] - ] = [] # format: [(candidate_id, device_id)] - - baseline_time = None - for same_device_results in grouped_benchmark_results: - dump_unsort_list: list[tuple[float, str]] = [] - for task_result in same_device_results: - candidate_id = task_result.candidate_id - device_id = task_result.device_id - process_res = task_result.run_result.process_res - - # Check if benchmarking has completed - if not process_res: - if task_result.run_result.is_timeout: - incomplete_list.append((candidate_id, device_id)) - if candidate_id == 0: - baseline_time = None - continue - - result_json = extract_benchmark_from_run_result(task_result.run_result) - assert result_json is not None - res = IREEBenchmarkResult(candidate_id, result_json) - benchmark_time = res.get_mean_time_us() - assert benchmark_time is not None - - # Record baseline benchmarking result and skip rest processes - if candidate_id == 0: - baseline_time = benchmark_time - baseline_vmfb_path = candidate_trackers[ - candidate_id - ].compiled_model_path - assert baseline_vmfb_path is not None - dump_str = ( - generate_display_MBR( - candidate_vmfb_path_str=baseline_vmfb_path.as_posix(), - device_id=device_id, - t1=benchmark_time, - ) - + "\n\n" - ) - dump_list.append(dump_str) - continue - - # Update candidate_tracker - candidate_trackers[candidate_id].model_benchmark_time = benchmark_time - candidate_trackers[candidate_id].model_benchmark_device_id = device_id - - # Calculate candidate improvement based on baseline. 
- if baseline_time: - candidate_trackers[candidate_id].baseline_benchmark_time = baseline_time - calibrated_benchmark_diff = ( - benchmark_time - baseline_time - ) / baseline_time - candidate_trackers[ - candidate_id - ].calibrated_benchmark_diff = calibrated_benchmark_diff - else: - calibrated_benchmark_diff = None - - # Collect candidate dump str - candidate_vmfb_path = candidate_trackers[candidate_id].compiled_model_path - assert candidate_vmfb_path is not None - dump_str = ( - generate_display_MBR( - candidate_vmfb_path_str=candidate_vmfb_path.as_posix(), - device_id=device_id, - t1=benchmark_time, - calibrated_diff=calibrated_benchmark_diff, - ) - + "\n\n" - ) - - dump_unsort_list.append((benchmark_time, dump_str)) - - # Sort model candidate benchmarking result str in ascending time order. - dump_list = dump_list + [ - dump_str for _, dump_str in sorted(dump_unsort_list, key=lambda x: x[0]) - ] - - # Store incomplete .vmfb file at the end of dump_list. - for index, device in incomplete_list: - file_path = candidate_trackers[index].compiled_model_path - assert file_path is not None - error_msg = f"Benchmarking result of {file_path.as_posix()} on device {device} is incomplete" - handle_error(condition=True, msg=error_msg, level=logging.WARNING) - dump_list.append(error_msg + "\n") - - return dump_list - - -# TODO(Max191): Remove this function in favor of `benchmark`. -def benchmark_models( - args: argparse.Namespace, - path_config: PathConfig, - model_candidates: list[int], - candidate_trackers: list[CandidateTracker], - tuning_client: TuningClient, -): - """Benchmark U-Net candidate files and log the results.""" - logging.debug("benchmark_models()") - - if args.dry_run: - candidate_results, baseline_results = generate_dryrun_model_benchmark_results( - model_candidates - ) - else: - # Benchmarking model candidates - worker_context_queue = create_worker_context_queue(args.devices) - benchmark_task_list = [ - TaskPack( - RunPack( - command=tuning_client.get_model_benchmark_command( - candidate_trackers[i] - ), - check=False, - timeout_seconds=tuning_client.get_dispatch_benchmark_timeout_s(), - ), - candidate_id=i, - command_need_device_id=True, - cooling_time=10, - ) - for i in model_candidates - ] - candidate_results = multiprocess_progress_wrapper( - num_worker=len(args.devices), - task_list=benchmark_task_list, - function=run_command_wrapper, - initializer=init_worker_context, - initializer_inputs=(worker_context_queue,), - ) - - # Benchmarking baselines on each involved device - candidate_trackers[0].compiled_model_path = path_config.model_baseline_vmfb - worker_context_queue = create_worker_context_queue(args.devices) - baseline_task_list = [ - TaskPack( - RunPack( - command=tuning_client.get_model_benchmark_command( - candidate_trackers[0] - ), - check=False, - timeout_seconds=tuning_client.get_model_benchmark_timeout_s(), - ), - candidate_id=0, - command_need_device_id=True, - ) - ] * len(group_benchmark_results_by_device_id(candidate_results)) - baseline_results = multiprocess_progress_wrapper( - num_worker=len(args.devices), - task_list=baseline_task_list, - function=run_command_wrapper, - initializer=init_worker_context, - initializer_inputs=(worker_context_queue,), - ) - - dump_list = parse_model_benchmark_results( - candidate_trackers, candidate_results, baseline_results - ) - - append_to_file( - dump_list, filepath=path_config.output_unilog, title="Model Benchmark Results" - ) - - -def summarize_top_candidates( - path_config: PathConfig, candidate_trackers: 
list[CandidateTracker] -): - dump_list = [] - top_candidates = [] - for candidate in candidate_trackers: - if candidate.candidate_id == 0 or candidate.model_benchmark_time is None: - continue - top_candidates.append( - (candidate.candidate_id, candidate.model_benchmark_time) - ) # collect (id, time) - - top_candidates = sorted( - top_candidates, key=lambda x: x[1] - ) # sort the list in ascending benchmark time order - top_candidate_ids = [item[0] for item in top_candidates] # get list of candidate id - - for candidate_id in top_candidate_ids: - candidate = candidate_trackers[candidate_id] - assert candidate.dispatch_config_path is not None - with open(candidate.dispatch_config_path, "r") as file: - config_file_contents = file.read() - final_str = f"Candidate {candidate.candidate_id}:\nModel benchmark time: {candidate.model_benchmark_time} on device {candidate.model_benchmark_device_id}\nDispatch benchmark time: {candidate.first_benchmark_time} on device {candidate.model_benchmark_device_id}\nSpec file path: {candidate.spec_path}\nSpec contents:{config_file_contents}\n\n" - dump_list.append(final_str) - - with open(path_config.result_summary_log, "w") as file: - file.writelines(dump_list) - - -def sanitize_filename(filename: str) -> str: - # Replace invalid characters by an underscore - sanitized = re.sub(r"[^\w\.-]", "_", filename) - return sanitized diff --git a/tuner/tuner/libtuner_test.py b/tuner/tuner/libtuner_test.py index 1b659268b..767a6aff4 100644 --- a/tuner/tuner/libtuner_test.py +++ b/tuner/tuner/libtuner_test.py @@ -16,31 +16,6 @@ """ -def test_group_benchmark_results_by_device_id() -> None: - # Create mock TaskResult objects with device_id attributes - task_result_1: libtuner.TaskResult = MagicMock(spec=libtuner.TaskResult) - task_result_1.device_id = "device_1" - - task_result_2: libtuner.TaskResult = MagicMock(spec=libtuner.TaskResult) - task_result_2.device_id = "device_2" - - task_result_3: libtuner.TaskResult = MagicMock(spec=libtuner.TaskResult) - task_result_3.device_id = "device_1" - - benchmark_results = [task_result_1, task_result_2, task_result_3] - - expected_grouped_results = [ - [task_result_1, task_result_3], # Grouped by device_1 - [task_result_2], # Grouped by device_2 - ] - - grouped_results = libtuner.group_benchmark_results_by_device_id(benchmark_results) - - assert grouped_results == expected_grouped_results - assert grouped_results[0][0].device_id == "device_1" - assert grouped_results[1][0].device_id == "device_2" - - def test_find_collisions() -> None: input = [(1, "abc"), (2, "def"), (3, "abc")] assert libtuner.find_collisions(input) == (True, [("abc", [1, 3]), ("def", [2])]) @@ -58,307 +33,6 @@ def test_collision_handler() -> None: assert libtuner.collision_handler(input) == (False, []) -def test_IREEBenchmarkResult_get() -> None: - # Time is int in us - int_json = [{"aggregate_name": "mean", "real_time": 1, "time_unit": "us"}] - - res = libtuner.IREEBenchmarkResult(candidate_id=1, result_json=int_json) - assert res.get_mean_time_us() == float(1) - - # Time is float in us - float_json = [{"aggregate_name": "mean", "real_time": 123.45, "time_unit": "us"}] - - res = libtuner.IREEBenchmarkResult(candidate_id=2, result_json=float_json) - assert res.get_mean_time_us() == 123.45 - - # Time is in seconds - seconds_json = [{"aggregate_name": "mean", "real_time": 1.0, "time_unit": "s"}] - - res = libtuner.IREEBenchmarkResult(candidate_id=3, result_json=seconds_json) - assert res.get_mean_time_us() == 1.0 * 1e6 - - # Time is in miliseconds - miliseconds_json = 
[{"aggregate_name": "mean", "real_time": 1.0, "time_unit": "ms"}] - - res = libtuner.IREEBenchmarkResult(candidate_id=4, result_json=miliseconds_json) - assert res.get_mean_time_us() == 1.0 * 1e3 - - # Time is in nanoseconds - nanoseconds_json = [{"aggregate_name": "mean", "real_time": 1.0, "time_unit": "ns"}] - - res = libtuner.IREEBenchmarkResult(candidate_id=5, result_json=nanoseconds_json) - assert res.get_mean_time_us() == 1.0 * 1e-3 - - small_number_json = [ - { - "aggregate_name": "mean", - "real_time": 3.4591828516259519e-02, - "time_unit": "ms", - } - ] - - res = libtuner.IREEBenchmarkResult(candidate_id=6, result_json=small_number_json) - assert res.get_mean_time_us() == 34.591828516259519 - - # Invalid json: missing real_time - invalid_real_time_json = [{"aggregate_name": "mean", "real_time": None}] - - res = libtuner.IREEBenchmarkResult( - candidate_id=7, result_json=invalid_real_time_json - ) - assert res.get_mean_time_us() == None - - # Invalid json: empty dictionary - res = libtuner.IREEBenchmarkResult(candidate_id=8, result_json=[]) - assert res.get_mean_time_us() is None - - # Invalid json: invalid time unit - invalid_time_unit_json = [ - {"aggregate_name": "mean", "real_time": 1.0, "time_unit": "invalid_unit"} - ] - - with pytest.raises(AssertionError, match="Unsupported time unit: invalid_unit"): - res = libtuner.IREEBenchmarkResult( - candidate_id=9, result_json=invalid_time_unit_json - ) - res.get_mean_time_us() - - # Invalid json: missing aggregate_name - invalid_aggregate_name_json = [{"real_time": 1.0, "time_unit": "us"}] - - res = libtuner.IREEBenchmarkResult( - candidate_id=10, result_json=invalid_aggregate_name_json - ) - assert res.get_mean_time_us() is None - - -def test_generate_display_BR() -> None: - output = libtuner.generate_display_DBR(1, 3.14) - expected = f"1\tMean Time: 3.1" - assert output == expected, "DispatchBenchmarkResult generates invalid sample string" - - output = libtuner.generate_display_MBR("baseline.vmfb", str(1), 567.89) - expected = "Benchmarking: baseline.vmfb on device 1: 568" - assert output == expected, "ModelBenchmarkResult generates invalid sample string" - output = libtuner.generate_display_MBR("baseline.vmfb", str(1), 567.89, 0.0314) - expected = "Benchmarking: baseline.vmfb on device 1: 568 (+3.140%)" - assert output == expected, "ModelBenchmarkResult generates invalid sample string" - output = libtuner.generate_display_MBR("baseline.vmfb", str(1), 567.89, -3.14) - expected = "Benchmarking: baseline.vmfb on device 1: 568 (-314.000%)" - assert output == expected, "ModelBenchmarkResult generates invalid sample string" - - -def make_mock_task_result() -> libtuner.TaskResult: - process: CompletedProcess = MagicMock(spec=CompletedProcess) - run_result = libtuner.RunResult(process, False) - task_result = libtuner.TaskResult(run_result, 0, "") - return task_result - - -def test_parse_dispatch_benchmark_results() -> None: - base_path = libtuner.Path("/mock/base/dir") - spec_dir = base_path / "specs" - path_config = libtuner.PathConfig() - object.__setattr__(path_config, "specs_dir", spec_dir) - - mock_result_1 = make_mock_task_result() - mock_json_1 = { - "benchmarks": [ - {"aggregate_name": "mean", "real_time": 100.0, "time_unit": "us"} - ] - } - assert mock_result_1.run_result.process_res is not None - mock_result_1.run_result.process_res.stdout = json.dumps(mock_json_1) - mock_result_1.candidate_id = 1 - mock_result_2 = make_mock_task_result() - mock_json_2 = { - "benchmarks": [ - {"aggregate_name": "mean", "real_time": 200.0, 
"time_unit": "us"} - ] - } - assert mock_result_2.run_result.process_res is not None - mock_result_2.run_result.process_res.stdout = json.dumps(mock_json_2) - mock_result_2.candidate_id = 2 - mock_result_3 = make_mock_task_result() - mock_json_3 = { - "benchmarks": [ - { - "aggregate_name": "mean", - "real_time": 3.4591828516259519e-02, - "time_unit": "ms", - } - ] - } - assert mock_result_3.run_result.process_res is not None - mock_result_3.run_result.process_res.stdout = json.dumps(mock_json_3) - mock_result_3.candidate_id = 3 - # Incomplete result. - mock_result_4 = libtuner.TaskResult(libtuner.RunResult(None, True), 4, "4") - benchmark_results = [mock_result_1, mock_result_2, mock_result_3, mock_result_4] - - candidate_trackers = [] - for i in range(4): - tracker = libtuner.CandidateTracker(candidate_id=i) - tracker.dispatch_mlir_path = libtuner.Path(f"/mock/mlir/path/{i}.mlir") - candidate_trackers.append(tracker) - - expected_parsed_results = [ - libtuner.ParsedDisptachBenchmarkResult( - candidate_id=1, - benchmark_time_in_seconds=100.0, - candidate_mlir=libtuner.Path("/mock/mlir/path/1.mlir"), - candidate_spec_mlir=libtuner.Path("/mock/base/dir/specs/1_spec.mlir"), - ), - libtuner.ParsedDisptachBenchmarkResult( - candidate_id=2, - benchmark_time_in_seconds=200.0, - candidate_mlir=libtuner.Path("/mock/mlir/path/2.mlir"), - candidate_spec_mlir=libtuner.Path("/mock/base/dir/specs/2_spec.mlir"), - ), - libtuner.ParsedDisptachBenchmarkResult( - candidate_id=3, - benchmark_time_in_seconds=34.591828516259519, - candidate_mlir=libtuner.Path("/mock/mlir/path/3.mlir"), - candidate_spec_mlir=libtuner.Path("/mock/base/dir/specs/3_spec.mlir"), - ), - ] - expected_dump_list = [ - "1\tMean Time: 100.0\n", - "2\tMean Time: 200.0\n", - "3\tMean Time: 34.6\n", - "Candidate 4 not completed", - ] - - parsed_results, dump_list = libtuner.parse_dispatch_benchmark_results( - path_config, benchmark_results, candidate_trackers - ) - - assert parsed_results == expected_parsed_results - assert dump_list == expected_dump_list - assert candidate_trackers[1].first_benchmark_time == 100.0 - assert candidate_trackers[1].spec_path == libtuner.Path( - "/mock/base/dir/specs/1_spec.mlir" - ) - assert candidate_trackers[2].first_benchmark_time == 200.0 - assert candidate_trackers[2].spec_path == libtuner.Path( - "/mock/base/dir/specs/2_spec.mlir" - ) - assert candidate_trackers[3].first_benchmark_time == 34.591828516259519 - assert candidate_trackers[3].spec_path == libtuner.Path( - "/mock/base/dir/specs/3_spec.mlir" - ) - - -def test_parse_model_benchmark_results() -> None: - # Setup mock data for candidate_trackers - tracker0 = libtuner.CandidateTracker(0) - tracker0.compiled_model_path = libtuner.Path("/path/to/baseline.vmfb") - - tracker1 = libtuner.CandidateTracker(1) - tracker1.compiled_model_path = libtuner.Path("/path/to/model_1.vmfb") - - tracker2 = libtuner.CandidateTracker(2) - tracker2.compiled_model_path = libtuner.Path("/path/to/model_2.vmfb") - - tracker3 = libtuner.CandidateTracker(3) - tracker3.compiled_model_path = libtuner.Path("/path/to/model_3.vmfb") - - candidate_trackers = [tracker0, tracker1, tracker2, tracker3] - - # Setup mock data for task results - result1 = make_mock_task_result() - result_json_1 = {"benchmarks": [{"real_time": 1.23}]} - assert result1.run_result.process_res is not None - result1.run_result.process_res.stdout = json.dumps(result_json_1) - result1.candidate_id = 1 - result1.device_id = "device1" - - result2 = make_mock_task_result() - result_json_2 = {"benchmarks": 
[{"real_time": 4.56}]} - assert result2.run_result.process_res is not None - result2.run_result.process_res.stdout = json.dumps(result_json_2) - result2.candidate_id = 2 - result2.device_id = "device2" - - result3 = make_mock_task_result() - result_json_3 = {"benchmarks": [{"real_time": 0.98}]} - assert result3.run_result.process_res is not None - result3.run_result.process_res.stdout = json.dumps(result_json_3) - result3.candidate_id = 0 - result3.device_id = "device1" - - result4 = make_mock_task_result() - result_json_4 = {"benchmarks": [{"real_time": 4.13}]} - assert result4.run_result.process_res is not None - result4.run_result.process_res.stdout = json.dumps(result_json_4) - result4.candidate_id = 0 - result4.device_id = "device2" - - # Incomplete baseline on device3 - result5 = libtuner.TaskResult(libtuner.RunResult(None, True), 0, "device3") - - result6 = make_mock_task_result() - result_json_6 = {"benchmarks": [{"real_time": 3.38}]} - assert result6.run_result.process_res is not None - result6.run_result.process_res.stdout = json.dumps(result_json_6) - result6.candidate_id = 3 - result6.device_id = "device3" - - candidate_results = [result1, result2, result6] - baseline_results = [result3, result4, result5] - - # Skip real benchmark extraction, directly use given values from above - def mock_get_mean_time_us(self): - return float(self.result_json[0]["real_time"]) if self.result_json else None - - # Mock IREEBenchmarkResult to return wanted benchmark times - with patch( - f"{libtuner.__name__}.IREEBenchmarkResult.get_mean_time_us", - new=mock_get_mean_time_us, - ): - # Mock handle_error to avoid actual logging during tests - with patch(f"{libtuner.__name__}.handle_error") as mock_handle_error: - dump_list = libtuner.parse_model_benchmark_results( - candidate_trackers, candidate_results, baseline_results - ) - - # Verify interactions with candidate_trackers - assert tracker1.model_benchmark_time == 1.23 - assert tracker1.model_benchmark_device_id == "device1" - assert tracker1.baseline_benchmark_time == 0.98 - assert tracker1.calibrated_benchmark_diff == pytest.approx( - (1.23 - 0.98) / 0.98, rel=1e-6 - ) - - assert tracker2.model_benchmark_time == 4.56 - assert tracker2.model_benchmark_device_id == "device2" - assert tracker2.baseline_benchmark_time == 4.13 - assert tracker2.calibrated_benchmark_diff == pytest.approx( - (4.56 - 4.13) / 4.13, rel=1e-6 - ) - - assert tracker3.model_benchmark_time == 3.38 - assert tracker3.model_benchmark_device_id == "device3" - - assert dump_list == [ - "Benchmarking: /path/to/baseline.vmfb on device device1: 0.98\n" "\n", - "Benchmarking: /path/to/model_1.vmfb on device device1: 1.23 (+25.510%)\n" - "\n", - "Benchmarking: /path/to/baseline.vmfb on device device2: 4.13\n" "\n", - "Benchmarking: /path/to/model_2.vmfb on device device2: 4.56 (+10.412%)\n" - "\n", - "Benchmarking: /path/to/model_3.vmfb on device device3: 3.38\n" "\n", - "Benchmarking result of /path/to/baseline.vmfb on device device3 is incomplete\n", - ] - - # Verify handle_error was called correctly - mock_handle_error.assert_called_once_with( - condition=True, - msg="Benchmarking result of /path/to/baseline.vmfb on device device3 is incomplete", - level=libtuner.logging.WARNING, - ) - - def test_extract_driver_names() -> None: user_devices = ["hip://0", "local-sync://default", "cuda://default"] expected_output = {"hip", "local-sync", "cuda"} From 4f6f2b3e177980334af013cb11ef4f36a3f9d342 Mon Sep 17 00:00:00 2001 From: Max191 <44243577+Max191@users.noreply.github.com> Date: Tue, 7 
Jan 2025 15:44:08 -0500 Subject: [PATCH 04/35] [tuner] Add support for TileAndFuse and multi-dim contractions (#771) This PR adds support for tuning contractions with multiple M, N, K, and Batch dimensions, and adds support for tuning with the TileAndFuse pipeline. A new flag is added called `--codegen-pipeline` that specifies which codegen pipeline to target (`llvmgpu_vector_distribute` or `llvmgpu_tile_and_fuse`). --------- Signed-off-by: Max Dawkins Signed-off-by: Max Dawkins Co-authored-by: Max Dawkins --- tuner/examples/test/README.md | 3 +- tuner/tuner/candidate_gen.py | 5 +- tuner/tuner/common.py | 53 +++- tuner/tuner/common_test.py | 9 +- tuner/tuner/dispatch_constraints.py | 303 ++++++++++++++++++----- tuner/tuner/dispatch_constraints_test.py | 241 ++++++++++++++++-- tuner/tuner/dispatch_parser.py | 46 ++-- tuner/tuner/dispatch_parser_test.py | 38 ++- tuner/tuner/libtuner.py | 24 ++ tuner/tuner/op_matchers.py | 2 +- 10 files changed, 600 insertions(+), 124 deletions(-) diff --git a/tuner/examples/test/README.md b/tuner/examples/test/README.md index 47ae7a8fe..850a161da 100644 --- a/tuner/examples/test/README.md +++ b/tuner/examples/test/README.md @@ -36,5 +36,6 @@ python -m examples.test \ --test_num_dispatch_candidates= \ --test_num_model_candidates= \ --test_hip_target= \ - --num-candidates= + --num-candidates= \ + --codegen-pipeline= ``` diff --git a/tuner/tuner/candidate_gen.py b/tuner/tuner/candidate_gen.py index b6264792e..ff7019ee0 100644 --- a/tuner/tuner/candidate_gen.py +++ b/tuner/tuner/candidate_gen.py @@ -194,6 +194,7 @@ def generate_configs_and_td_specs( tuner_context: TunerContext, limit: int = 4096, # Max candidates to be generated num_subgroups: int = 4, # GPU spec, used to determine candidate generation constraints + codegen_pipeline: iree_codegen.DispatchLoweringPassPipeline = iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute, ) -> list[ir.Module]: dispatch_tuner_registry = DispatchTunerRegistry(check_translation_info=False) dispatch_tuner_registry.register( @@ -221,7 +222,9 @@ def generate_configs_and_td_specs( variant_op = variant_op_list[0] mma_list = iree_codegen.query_mma_intrinsics(variant_op) for i, config in enumerate( - generate_solutions(tuner_context, problem_size, num_subgroups, mma_list) + generate_solutions( + tuner_context, problem_size, num_subgroups, mma_list, codegen_pipeline + ) ): if i >= limit: break diff --git a/tuner/tuner/common.py b/tuner/tuner/common.py index 54051df47..45bcb0d75 100644 --- a/tuner/tuner/common.py +++ b/tuner/tuner/common.py @@ -6,7 +6,7 @@ import re import logging -from dataclasses import astuple, dataclass +from dataclasses import astuple, dataclass, field from enum import Enum from typing import Optional from typing import Any @@ -67,31 +67,64 @@ def __str__(self) -> str: @dataclass -class MatmulSize: - M: int - N: int - K: int - B: int = 1 +class ContractionSizes: + """ + Represents the size of the iteration space along each contraction dimension. + For example, the following is a simple batch mmt: + linalg.generic ... indexing_maps = [ + affine_map<(b, m, n, k) -> (b, m, k)>, + affine_map<(b, m, n, k) -> (b, n, k)>, + affine_map<(b, m, n, k) -> (b, m, n)>, + ] ... 
+ ins(%lhs: tensor<4x8x32xf16>, %rhs: tensor<4x16x32xf16>) + outs(%acc: tensor<4x8x16xf16>) + The ContractionSizes would be: + M = [8] + N = [16] + K = [32] + B = [4] + """ + + M: list[int] + N: list[int] + K: list[int] + B: list[int] = field(default_factory=list) @dataclass class ContractionDimensions: - batch: list[int] + """ + Stores which dimensions of the iteration space belong to M, N, K, or Batch. + For example, the following is a simple batch mmt: + linalg.generic ... indexing_maps = [ + affine_map<(b, m, n, k) -> (b, m, k)>, + affine_map<(b, m, n, k) -> (b, n, k)>, + affine_map<(b, m, n, k) -> (b, m, n)>, + ] + The ContractionDimensions would be: + M = [1] + N = [2] + K = [3] + B = [0] + """ + m: list[int] n: list[int] k: list[int] + batch: list[int] = field(default_factory=list) @dataclass class ProblemSize: - matmul_size: MatmulSize + matmul_size: ContractionSizes lhs_type: ShapedType rhs_type: ShapedType res_type: ShapedType dispatch_kind: DispatchKind + contraction_dims: ContractionDimensions @property - def MNK(self) -> tuple[int, int, int]: + def MNK(self) -> tuple[list[int], list[int], list[int]]: return (self.matmul_size.M, self.matmul_size.N, self.matmul_size.K) @@ -130,7 +163,7 @@ def get_lowering_config( # A local variable to hold the transformed value. promoted_value = value match key: - case "workgroup" | "reduction": + case "workgroup" | "reduction" | "subgroup": if isinstance(value, list): promoted_value = ir.ArrayAttr.get( [tuner_ctx.type.getI64(x) for x in value] diff --git a/tuner/tuner/common_test.py b/tuner/tuner/common_test.py index b23360ccc..eba5b35e1 100644 --- a/tuner/tuner/common_test.py +++ b/tuner/tuner/common_test.py @@ -119,11 +119,12 @@ def test_get_pipeline_config(tuner_ctx: common.TunerContext) -> None: def test_get_compatible_mfma_intrinsics(tuner_ctx: common.TunerContext) -> None: assert common.get_compatible_mfma_intrinsics( common.ProblemSize( - common.MatmulSize(2048, 1280, 1280), + common.ContractionSizes([2048], [1280], [1280]), common.ShapedType([2048, 1280], tuner_ctx.type.f16), common.ShapedType([1280, 1280], tuner_ctx.type.f16), common.ShapedType([2048, 1280], tuner_ctx.type.f32), common.DispatchKind.contraction, + common.ContractionDimensions([0], [1], [2]), ), [ iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, @@ -138,11 +139,12 @@ def test_get_compatible_mfma_intrinsics(tuner_ctx: common.TunerContext) -> None: assert common.get_compatible_mfma_intrinsics( common.ProblemSize( - common.MatmulSize(2048, 1280, 1280), + common.ContractionSizes([2048], [1280], [1280]), common.ShapedType([2048, 1280], tuner_ctx.type.i8), common.ShapedType([1280, 1280], tuner_ctx.type.i8), common.ShapedType([2048, 1280], tuner_ctx.type.i32), common.DispatchKind.contraction, + common.ContractionDimensions([0], [1], [2]), ), [ iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, @@ -158,11 +160,12 @@ def test_get_compatible_mfma_intrinsics(tuner_ctx: common.TunerContext) -> None: assert ( common.get_compatible_mfma_intrinsics( common.ProblemSize( - common.MatmulSize(968, 320, 640, 64), + common.ContractionSizes([968], [320], [640], [64]), common.ShapedType([64, 968, 640], tuner_ctx.type.f32), common.ShapedType([64, 640, 320], tuner_ctx.type.f32), common.ShapedType([64, 968, 320], tuner_ctx.type.f32), common.DispatchKind.contraction, + common.ContractionDimensions([1], [2], [3], [0]), ), [ iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, diff --git a/tuner/tuner/dispatch_constraints.py b/tuner/tuner/dispatch_constraints.py index f6de5179d..50a36d02f 100644 --- 
a/tuner/tuner/dispatch_constraints.py +++ b/tuner/tuner/dispatch_constraints.py @@ -7,6 +7,7 @@ # Given an input dispatch, this code modifies the hyperparameters # in the code and runs it. +import math import z3 # type: ignore from typing import Iterator @@ -63,33 +64,37 @@ def get_dispatch_constraints( def calculate_shared_memory_usage_in_bytes( problem_size: ProblemSize, - m: int | z3.ArithRef, - n: int | z3.ArithRef, - k: int | z3.ArithRef, + m: list[int] | list[z3.ArithRef], + n: list[int] | list[z3.ArithRef], + k: list[int] | list[z3.ArithRef], ) -> int | z3.ArithRef: - lhs_memory = m * k * (problem_size.lhs_type.bitwidth // 8) - rhs_memory = k * n * (problem_size.rhs_type.bitwidth // 8) + lhs_memory = problem_size.lhs_type.bitwidth // 8 + for size in m + k: + lhs_memory *= size + rhs_memory = problem_size.rhs_type.bitwidth // 8 + for size in n + k: + rhs_memory *= size return lhs_memory + rhs_memory -def generate_constraints( +def generate_vector_distribute_constraints( problem_size: ProblemSize, - tile_sizes, - num_subgroups, - subgroup_size, - intrinsic_size, - workgroup_size, - subgroup_m_count, - subgroup_n_count, - waves_per_eu, + tile_sizes: list[list[z3.ArithRef]], + num_subgroups: int, + subgroup_size: z3.ArithRef, + intrinsic_size: list[z3.ArithRef], + workgroup_size: list[z3.ArithRef], + subgroup_m_count: z3.ArithRef, + subgroup_n_count: z3.ArithRef, + waves_per_eu: z3.ArithRef, mma_intrinsics: list[iree_gpu.MMAIntrinsic], ): M, N, K = ( - problem_size.matmul_size.M, - problem_size.matmul_size.N, - problem_size.matmul_size.K, + problem_size.matmul_size.M[-1], + problem_size.matmul_size.N[-1], + problem_size.matmul_size.K[-1], ) - m, n, k = tile_sizes + m_vars, n_vars, k_vars = tile_sizes intrinsic_mn, intrinsic_k = intrinsic_size wg_x, wg_y, wg_z = workgroup_size wg_threads = z3.Int("wg_threads") @@ -101,6 +106,10 @@ def generate_constraints( ) ] subgroup_k_count = 1 + m = m_vars[-1] + n = n_vars[-1] + k = k_vars[-1] + constraints += [v == 1 for v in m_vars[:-1] + n_vars[:-1] + k_vars[:-1]] constraints += [ m >= intrinsic_mn, m <= 512, @@ -136,7 +145,7 @@ def generate_constraints( constraints += [waves_per_eu == 2] # constraints += [z3.Or(waves_per_eu == 2, waves_per_eu == 3, waves_per_eu == 4)] - shared_memory = calculate_shared_memory_usage_in_bytes(problem_size, m, n, k) + shared_memory = calculate_shared_memory_usage_in_bytes(problem_size, [m], [n], [k]) constraints += [shared_memory <= 65536] constraints += get_dispatch_constraints(problem_size, m, n, k) @@ -144,6 +153,96 @@ def generate_constraints( return constraints +def generate_tile_and_fuse_constraints( + problem_size: ProblemSize, + tile_sizes: list[list[z3.ArithRef]], + num_subgroups: int, + subgroup_size: z3.ArithRef, + intrinsic_size: list[z3.ArithRef], + workgroup_size: list[z3.ArithRef], + subgroup_m_count: z3.ArithRef, + subgroup_n_count: z3.ArithRef, + waves_per_eu: z3.ArithRef, + mma_intrinsics: list[iree_gpu.MMAIntrinsic], +): + M, N, K = problem_size.MNK + m_tiles, n_tiles, k_tiles, subgroup_m_tiles, subgroup_n_tiles = tile_sizes + intrinsic_mn, intrinsic_k = intrinsic_size + wg_x, wg_y, wg_z = workgroup_size + wg_threads = z3.Int("wg_threads") + constraints = [wg_x == wg_threads, wg_y == 1, wg_z == 1] + constraints += [subgroup_size == 64, wg_threads <= 1024] + constraints += [ + get_mfma_intrinsic_constraints( + problem_size, intrinsic_mn, intrinsic_mn, intrinsic_k, mma_intrinsics + ) + ] + subgroup_k_count = 1 + + constraints += [ + m_tiles[-1] >= intrinsic_mn, + m_tiles[-1] % intrinsic_mn == 0, + 
n_tiles[-1] >= intrinsic_mn, + n_tiles[-1] % intrinsic_mn == 0, + k_tiles[-1] * intrinsic_k <= K[-1], + math.prod(m_tiles) <= 512, + math.prod(n_tiles) <= 512, + math.prod(k_tiles) <= 512 / intrinsic_k, + ] + constraints += [m_shape % m == 0 for m, m_shape in zip(m_tiles, M)] + constraints += [n_shape % n == 0 for n, n_shape in zip(n_tiles, N)] + constraints += [k_shape % k == 0 for k, k_shape in zip(k_tiles[:-1], K[:-1])] + constraints += [m >= 0 for m in m_tiles] + constraints += [n >= 0 for n in n_tiles] + constraints += [k >= 0 for k in k_tiles] + constraints += [K[-1] % (k_tiles[-1] * intrinsic_k) == 0] + constraints += [m <= m_shape for m, m_shape in zip(m_tiles, M)] + constraints += [n <= n_shape for n, n_shape in zip(n_tiles, N)] + constraints += [k <= k_shape for k, k_shape in zip(k_tiles[:-1], K[:-1])] + constraints += [(k_tiles[-1] * intrinsic_k) <= K[-1]] + for x in (subgroup_m_count, subgroup_n_count): + constraints += [x >= 1, x <= 32] + + subgroup_m_tile_count = z3.Int("sg_m_tcnt") + subgroup_n_tile_count = z3.Int("sg_n_tcnt") + subgroup_k_tile_count = z3.Int("sg_k_tcnt") + for x in (subgroup_m_tile_count, subgroup_n_tile_count, subgroup_k_tile_count): + constraints += [x >= 1, x <= 32] + constraints += [math.prod(subgroup_m_tiles) == subgroup_m_tile_count] + constraints += [math.prod(subgroup_n_tiles) == subgroup_n_tile_count] + constraints += [ + m % m_subgroup == 0 for m, m_subgroup in zip(m_tiles, subgroup_m_tiles) + ] + constraints += [ + n % n_subgroup == 0 for n, n_subgroup in zip(n_tiles, subgroup_n_tiles) + ] + constraints += [m_subgroup > 0 for m_subgroup in subgroup_m_tiles] + constraints += [n_subgroup > 0 for n_subgroup in subgroup_n_tiles] + + constraints += [ + math.prod(m_tiles) == subgroup_m_count * subgroup_m_tile_count * intrinsic_mn + ] + constraints += [ + math.prod(n_tiles) == subgroup_n_count * subgroup_n_tile_count * intrinsic_mn + ] + constraints += [math.prod(k_tiles) == subgroup_k_count * subgroup_k_tile_count] + subgroups = subgroup_m_count * subgroup_n_count + if num_subgroups > 0: + constraints += [subgroups == num_subgroups] + else: + constraints += [subgroups >= 1, subgroups <= 10] + constraints += [wg_threads == subgroups * subgroup_size] + + constraints += [waves_per_eu == 2] + + shared_memory = calculate_shared_memory_usage_in_bytes( + problem_size, m_tiles, n_tiles, k_tiles + ) + constraints += [shared_memory * intrinsic_k <= 65536] + + return constraints + + def getMMAAttr( output_type: ir.IntegerType | ir.FloatType, m: int, @@ -178,10 +277,16 @@ def generate_solutions( problem_size: ProblemSize, num_subgrups: int, mma_intrinsics: list[iree_gpu.MMAIntrinsic], + codegen_pipeline: iree_codegen.DispatchLoweringPassPipeline = iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute, ) -> Iterator[iree_codegen.CompilationInfoAttr]: M, N, K = problem_size.MNK tuner_ctx.logger.info(f"{M},{N},{K}") - m, n, k = z3.Int("m"), z3.Int("n"), z3.Int("k") + m_vars = [z3.Int(f"m{i}") for i in range(len(M))] + n_vars = [z3.Int(f"n{i}") for i in range(len(N))] + k_vars = [z3.Int(f"k{i}") for i in range(len(K))] + subgroup_m_vars = [z3.Int(f"subgroup_m{i}") for i in range(len(M))] + subgroup_n_vars = [z3.Int(f"subgroup_n{i}") for i in range(len(N))] + # m, n, k = z3.Int("m"), z3.Int("n"), z3.Int("k") subgroup_size = z3.Int("subgroup_size") intrinsic_mn = z3.Int("intrinsic_mn") intrinsic_k = z3.Int("intrinsic_k") @@ -189,34 +294,52 @@ def generate_solutions( sg_m_cnt = z3.Int("sg_m_cnt") sg_n_cnt = z3.Int("sg_n_cnt") waves_per_eu = 
z3.Int("waves_per_eu") - all_vars = [ - m, - n, - k, - subgroup_size, - intrinsic_mn, - intrinsic_k, - wg_x, - wg_y, - wg_z, - sg_m_cnt, - sg_n_cnt, - waves_per_eu, - ] + all_vars = ( + m_vars + + n_vars + + k_vars + + [ + subgroup_size, + intrinsic_mn, + intrinsic_k, + wg_x, + wg_y, + wg_z, + sg_m_cnt, + sg_n_cnt, + waves_per_eu, + ] + ) solver = z3.Solver() - constraints = generate_constraints( - problem_size, - [m, n, k], - num_subgrups, - subgroup_size, - [intrinsic_mn, intrinsic_k], - [wg_x, wg_y, wg_z], - sg_m_cnt, - sg_n_cnt, - waves_per_eu, - mma_intrinsics, - ) + match codegen_pipeline: + case iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute: + constraints = generate_vector_distribute_constraints( + problem_size, + [m_vars, n_vars, k_vars], + num_subgrups, + subgroup_size, + [intrinsic_mn, intrinsic_k], + [wg_x, wg_y, wg_z], + sg_m_cnt, + sg_n_cnt, + waves_per_eu, + mma_intrinsics, + ) + constraints += [v == 0 for v in subgroup_m_vars + subgroup_n_vars] + case iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse: + constraints = generate_tile_and_fuse_constraints( + problem_size, + [m_vars, n_vars, k_vars, subgroup_m_vars, subgroup_n_vars], + num_subgrups, + subgroup_size, + [intrinsic_mn, intrinsic_k], + [wg_x, wg_y, wg_z], + sg_m_cnt, + sg_n_cnt, + waves_per_eu, + mma_intrinsics, + ) solver.add(z3.simplify(z3.And(constraints))) tuner_ctx.logger.debug(f"Initial constraints: {solver}") @@ -232,21 +355,80 @@ def generate_solutions( problem_size.lhs_type.element_type, problem_size.rhs_type.element_type, ) - workgroup_tiles = [lookup(m), lookup(n), 0] - reduction_tiles = [0, 0, lookup(k)] - if problem_size.dispatch_kind == DispatchKind.conv: - workgroup_tiles = [1, 1, lookup(m), lookup(n), 0, 0, 0] - reduction_tiles = [0, 0, 0, 0, 1, 1, lookup(k)] - lowering_config = get_lowering_config( - tuner_ctx=tuner_ctx, - mma_kind=mma_attr, - workgroup=workgroup_tiles, - reduction=reduction_tiles, - subgroup_m_count=lookup(sg_m_cnt), - subgroup_n_count=lookup(sg_n_cnt), + + def set_cdim_tile_sizes(tile_sizes, contraction_dims, csizes): + for dim, size in zip(contraction_dims, csizes): + tile_sizes[dim] = size + + # Get workgroup tile sizes. + workgroup_tile_sizes = [0] * ( + len(M) + len(N) + len(K) + len(problem_size.contraction_dims.batch) ) + set_cdim_tile_sizes( + workgroup_tile_sizes, + problem_size.contraction_dims.m, + [lookup(v) for v in m_vars], + ) + set_cdim_tile_sizes( + workgroup_tile_sizes, + problem_size.contraction_dims.n, + [lookup(v) for v in n_vars], + ) + set_cdim_tile_sizes( + workgroup_tile_sizes, + problem_size.contraction_dims.batch, + [1] * len(problem_size.contraction_dims.batch), + ) + + # Get subgroup tile sizes. + subgroup_tile_sizes = [0] * ( + len(M) + len(N) + len(K) + len(problem_size.contraction_dims.batch) + ) + set_cdim_tile_sizes( + subgroup_tile_sizes, + problem_size.contraction_dims.m, + [lookup(v) for v in subgroup_m_vars], + ) + set_cdim_tile_sizes( + subgroup_tile_sizes, + problem_size.contraction_dims.n, + [lookup(v) for v in subgroup_n_vars], + ) + set_cdim_tile_sizes( + subgroup_tile_sizes, + problem_size.contraction_dims.batch, + [1] * len(problem_size.contraction_dims.batch), + ) + + # Get reduction tile sizes. + reduction_tile_sizes = [0] * ( + len(M) + len(N) + len(K) + len(problem_size.contraction_dims.batch) + ) + set_cdim_tile_sizes( + reduction_tile_sizes, + problem_size.contraction_dims.k, + [lookup(v) for v in k_vars], + ) + + # Create the LoweringConfigAttr. 
+ lowering_config_args = { + "tuner_ctx": tuner_ctx, + "mma_kind": mma_attr, + "workgroup": workgroup_tile_sizes, + "reduction": reduction_tile_sizes, + "subgroup_m_count": lookup(sg_m_cnt), + "subgroup_n_count": lookup(sg_n_cnt), + } + if ( + codegen_pipeline + == iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse + ): + lowering_config_args["subgroup"] = subgroup_tile_sizes + lowering_config = get_lowering_config(**lowering_config_args) + + # Create the TranslationInfoAttr pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get( - iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute + codegen_pipeline ) pipeline_options = iree_gpu.PipelineOptionsAttr.get() config_dict = get_translation_info_config( @@ -259,9 +441,12 @@ def generate_solutions( lookup(subgroup_size), config_dict, ) + + # Create the CompilationInfoAttr. compilation_info = iree_codegen.CompilationInfoAttr.get( lowering_config, translation_info ) + solver.add(z3.simplify(z3.Not(z3.And(list(x == model[x] for x in all_vars))))) i += 1 yield compilation_info diff --git a/tuner/tuner/dispatch_constraints_test.py b/tuner/tuner/dispatch_constraints_test.py index 5c82f555f..d31a76e90 100644 --- a/tuner/tuner/dispatch_constraints_test.py +++ b/tuner/tuner/dispatch_constraints_test.py @@ -31,12 +31,18 @@ def tuner_ctx() -> Generator[common.TunerContext, None, None]: def test_generate_solutions(tuner_ctx: common.TunerContext) -> None: - matmul_size = common.MatmulSize(2048, 3840, 1280) + matmul_size = common.ContractionSizes([2048], [3840], [1280]) + contraction_dims = common.ContractionDimensions([0], [1], [2]) lhs_type = common.ShapedType([2048, 1280], tuner_ctx.type.f16) rhs_type = common.ShapedType([3840, 1280], tuner_ctx.type.f16) res_type = common.ShapedType([2048, 3840], tuner_ctx.type.f32) problem_size = common.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.contraction + matmul_size, + lhs_type, + rhs_type, + res_type, + common.DispatchKind.contraction, + contraction_dims, ) configs = dispatch_constraints.generate_solutions( tuner_ctx, @@ -54,56 +60,235 @@ def test_generate_solutions(tuner_ctx: common.TunerContext) -> None: def test_calculate_shared_memory_usage_in_bytes(tuner_ctx: common.TunerContext) -> None: - matmul_size = common.MatmulSize(1024, 1024, 1024) + matmul_size = common.ContractionSizes([1024], [1024], [1024]) + contraction_dims = common.ContractionDimensions([0], [1], [2]) lhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16) rhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16) res_type = common.ShapedType([1024, 1024], tuner_ctx.type.f32) problem_size = common.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.contraction + matmul_size, + lhs_type, + rhs_type, + res_type, + common.DispatchKind.contraction, + contraction_dims, ) assert ( dispatch_constraints.calculate_shared_memory_usage_in_bytes( - problem_size, 512, 64, 128 + problem_size, [512], [64], [128] ) == 147456 ) lhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.i8) problem_size = common.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.contraction + matmul_size, + lhs_type, + rhs_type, + res_type, + common.DispatchKind.contraction, + contraction_dims, ) assert ( dispatch_constraints.calculate_shared_memory_usage_in_bytes( - problem_size, 512, 64, 128 + problem_size, [512], [64], [128] ) == 81920 ) rhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.i32) problem_size = common.ProblemSize( - 
matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.contraction + matmul_size, + lhs_type, + rhs_type, + res_type, + common.DispatchKind.contraction, + contraction_dims, ) assert ( dispatch_constraints.calculate_shared_memory_usage_in_bytes( - problem_size, 128, 64, 32 + problem_size, [128], [64], [32] ) == 12288 ) + assert ( + dispatch_constraints.calculate_shared_memory_usage_in_bytes( + problem_size, [2, 64], [4, 16], [8, 4] + ) + == 12288 + ) + + +def test_generate_tile_and_fuse_constraints_valid_input( + tuner_ctx: common.TunerContext, +) -> None: + matmul_size = common.ContractionSizes( + M=[4, 32], + N=[6, 64], + K=[8, 128], + B=[2, 16], + ) + contraction_dims = common.ContractionDimensions( + m=[1, 5], + n=[2, 6], + k=[3, 7], + batch=[0, 4], + ) + lhs_type = common.ShapedType([2, 4, 8, 16, 32, 128], tuner_ctx.type.f16) + rhs_type = common.ShapedType([2, 6, 8, 16, 64, 128], tuner_ctx.type.f16) + res_type = common.ShapedType([2, 4, 6, 16, 32, 64], tuner_ctx.type.f32) + problem_size = common.ProblemSize( + matmul_size, + lhs_type, + rhs_type, + res_type, + common.DispatchKind.contraction, + contraction_dims, + ) + # Define input parameters as z3 Ints + m, n, k = ( + [z3.Int("m0"), z3.Int("m1")], + [z3.Int("n0"), z3.Int("n1")], + [z3.Int("k0"), z3.Int("k1")], + ) + subgroup_m, subgroup_n = ( + [z3.Int("subgroup_m0"), z3.Int("subgroup_m1")], + [z3.Int("subgroup_n0"), z3.Int("subgroup_n1")], + ) + subgroup_size = z3.Int("subgroup_size") + intrinsic_mn = z3.Int("intrinsic_mn") + intrinsic_k = z3.Int("intrinsic_k") + wg_x, wg_y, wg_z = ( + z3.Int("wg_x"), + z3.Int("wg_y"), + z3.Int("wg_z"), + ) + sg_m_cnt = z3.Int("sg_m_cnt") + sg_n_cnt = z3.Int("sg_n_cnt") + waves_per_eu = z3.Int("waves_per_eu") + + constraints = dispatch_constraints.generate_tile_and_fuse_constraints( + problem_size, + [m, n, k, subgroup_m, subgroup_n], + 4, + subgroup_size, + [intrinsic_mn, intrinsic_k], + [wg_x, wg_y, wg_z], + sg_m_cnt, + sg_n_cnt, + waves_per_eu, + [ + iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, + iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, + iree_gpu.MMAIntrinsic.MFMA_I32_16x16x32_I8, + iree_gpu.MMAIntrinsic.MFMA_I32_32x32x16_I8, + ], + ) + + solver = z3.Solver() + solver.add(constraints) + + # Check if the constraints are satisfiable + assert solver.check() == z3.sat + + +def test_generate_tile_and_fuse_constraints_invalid_input( + tuner_ctx: common.TunerContext, +) -> None: + # Define input parameters that should lead to unsatisfiable constraints + matmul_size = common.ContractionSizes( + M=[4, 32], + N=[6, 64], + K=[8, 128], + B=[2, 16], + ) + contraction_dims = common.ContractionDimensions( + m=[1, 5], + n=[2, 6], + k=[3, 7], + batch=[0, 4], + ) + lhs_type = common.ShapedType([2, 4, 8, 16, 32, 128], tuner_ctx.type.f16) + rhs_type = common.ShapedType([2, 6, 8, 16, 64, 128], tuner_ctx.type.f16) + res_type = common.ShapedType([2, 4, 6, 16, 32, 64], tuner_ctx.type.f32) + problem_size = common.ProblemSize( + matmul_size, + lhs_type, + rhs_type, + res_type, + common.DispatchKind.contraction, + contraction_dims, + ) + # Define input parameters as z3 Ints + m, n, k = ( + [z3.Int("m0"), z3.Int("m1")], + [z3.Int("n0"), z3.Int("n1")], + [z3.Int("k0"), z3.Int("k1")], + ) + subgroup_m, subgroup_n = ( + [z3.Int("subgroup_m0"), z3.Int("subgroup_m1")], + [z3.Int("subgroup_n0"), z3.Int("subgroup_n1")], + ) + subgroup_size = z3.Int("subgroup_size") + intrinsic_mn = z3.Int("intrinsic_mn") + intrinsic_k = z3.Int("intrinsic_k") + wg_x, wg_y, wg_z = ( + z3.Int("wg_x"), + z3.Int("wg_y"), + z3.Int("wg_z"), + 
) + sg_m_cnt = z3.Int("sg_m_cnt") + sg_n_cnt = z3.Int("sg_n_cnt") + waves_per_eu = z3.Int("waves_per_eu") + + constraints = dispatch_constraints.generate_tile_and_fuse_constraints( + problem_size, + [m, n, k, subgroup_m, subgroup_n], + 4, + subgroup_size, + [intrinsic_mn, intrinsic_k], + [wg_x, wg_y, wg_z], + sg_m_cnt, + sg_n_cnt, + waves_per_eu, + [ + iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, + iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, + iree_gpu.MMAIntrinsic.MFMA_I32_16x16x32_I8, + iree_gpu.MMAIntrinsic.MFMA_I32_32x32x16_I8, + ], + ) + constraints.append(m[0] > 1000) # Adding an additional unsatisfiable constraint + + solver = z3.Solver() + solver.add(constraints) + + # Check if the constraints are unsatisfiable + assert solver.check() == z3.unsat + -def test_generate_constraints_valid_input(tuner_ctx: common.TunerContext) -> None: - matmul_size = common.MatmulSize(1024, 1024, 1024) +def test_generate_vector_distribute_constraints_valid_input( + tuner_ctx: common.TunerContext, +) -> None: + matmul_size = common.ContractionSizes([1024], [1024], [1024]) + contraction_dims = common.ContractionDimensions([0], [1], [2]) lhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16) rhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16) res_type = common.ShapedType([1024, 1024], tuner_ctx.type.f32) problem_size = common.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.contraction + matmul_size, + lhs_type, + rhs_type, + res_type, + common.DispatchKind.contraction, + contraction_dims, ) # Define input parameters as z3 Ints m, n, k = ( - dispatch_constraints.z3.Int("m"), - z3.Int("n"), - z3.Int("k"), + [z3.Int("m")], + [z3.Int("n")], + [z3.Int("k")], ) subgroup_size = z3.Int("subgroup_size") intrinsic_mn = z3.Int("intrinsic_mn") @@ -117,7 +302,7 @@ def test_generate_constraints_valid_input(tuner_ctx: common.TunerContext) -> Non sg_n_cnt = z3.Int("sg_n_cnt") waves_per_eu = z3.Int("waves_per_eu") - constraints = dispatch_constraints.generate_constraints( + constraints = dispatch_constraints.generate_vector_distribute_constraints( problem_size, [m, n, k], 4, @@ -142,19 +327,27 @@ def test_generate_constraints_valid_input(tuner_ctx: common.TunerContext) -> Non assert solver.check() == z3.sat -def test_generate_constraints_invalid_input(tuner_ctx: common.TunerContext) -> None: +def test_generate_vector_distribute_constraints_invalid_input( + tuner_ctx: common.TunerContext, +) -> None: # Define input parameters that should lead to unsatisfiable constraints - matmul_size = common.MatmulSize(1024, 1024, 1024) + matmul_size = common.ContractionSizes([1024], [1024], [1024]) + contraction_dims = common.ContractionDimensions([0], [1], [2]) lhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16) rhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16) res_type = common.ShapedType([1024, 1024], tuner_ctx.type.f32) problem_size = common.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, common.DispatchKind.contraction + matmul_size, + lhs_type, + rhs_type, + res_type, + common.DispatchKind.contraction, + contraction_dims, ) m, n, k = ( - z3.Int("m"), - z3.Int("n"), - z3.Int("k"), + [z3.Int("m")], + [z3.Int("n")], + [z3.Int("k")], ) subgroup_size = z3.Int("subgroup_size") intrinsic_mn = z3.Int("intrinsic_mn") @@ -168,7 +361,7 @@ def test_generate_constraints_invalid_input(tuner_ctx: common.TunerContext) -> N sg_n_cnt = z3.Int("sg_n_cnt") waves_per_eu = z3.Int("waves_per_eu") - constraints = dispatch_constraints.generate_constraints( + constraints = 
dispatch_constraints.generate_vector_distribute_constraints( problem_size, [m, n, k], 4, @@ -185,7 +378,7 @@ def test_generate_constraints_invalid_input(tuner_ctx: common.TunerContext) -> N iree_gpu.MMAIntrinsic.MFMA_I32_32x32x16_I8, ], ) - constraints.append(m > 1000) # Adding an additional unsatisfiable constraint + constraints.append(m[0] > 1000) # Adding an additional unsatisfiable constraint solver = z3.Solver() solver.add(constraints) diff --git a/tuner/tuner/dispatch_parser.py b/tuner/tuner/dispatch_parser.py index 502968ea8..ad7ba3a79 100644 --- a/tuner/tuner/dispatch_parser.py +++ b/tuner/tuner/dispatch_parser.py @@ -60,31 +60,39 @@ def get_shapes(self, template: list[str]) -> ProblemSize: ir_module = ir.Module.parse("\n".join(template)) contraction_op = match_root_op(ir_module, matcher) assert contraction_op is not None, f"contraction op not found" - cdims = matcher.contraction_dimensions - assert cdims, "no contraction dimensions" + contraction_dims = matcher.contraction_dimensions + assert contraction_dims, "no contraction dimensions" assert matcher.lhs_dims, "no lhs dimensions" assert matcher.rhs_dims, "no rhs dimensions" assert matcher.res_dims, "no result dimensions" - assert len(cdims.batch) <= 1, f"must have at most 1 batch dimension" - assert len(cdims.m) == 1, f"must have a single m dimension" - assert len(cdims.n) == 1, f"must have a single n dimension" - assert len(cdims.k) == 1, f"must have a single k dimension" lhs_type = ir.RankedTensorType(contraction_op.operands[0].type) rhs_type = ir.RankedTensorType(contraction_op.operands[1].type) res_type = ir.RankedTensorType(contraction_op.operands[2].type) - matmul_size = MatmulSize( - lhs_type.shape[matcher.lhs_dims.index(cdims.m[0])], - rhs_type.shape[matcher.rhs_dims.index(cdims.n[0])], - lhs_type.shape[matcher.lhs_dims.index(cdims.k[0])], + matmul_size = ContractionSizes( + M=[ + lhs_type.shape[matcher.lhs_dims.index(dim)] + for dim in contraction_dims.m + ], + N=[ + rhs_type.shape[matcher.rhs_dims.index(dim)] + for dim in contraction_dims.n + ], + K=[ + lhs_type.shape[matcher.lhs_dims.index(dim)] + for dim in contraction_dims.k + ], + B=[ + lhs_type.shape[matcher.lhs_dims.index(dim)] + for dim in contraction_dims.batch + ], ) - if len(cdims.batch) == 1: - matmul_size.B = lhs_type.shape[matcher.lhs_dims.index(cdims.batch[0])] return ProblemSize( matmul_size, lhs_type=ShapedType(lhs_type.shape, lhs_type.element_type), rhs_type=ShapedType(rhs_type.shape, rhs_type.element_type), res_type=ShapedType(res_type.shape, res_type.element_type), dispatch_kind=DispatchKind.contraction, + contraction_dims=contraction_dims, ) @@ -115,14 +123,18 @@ def get_shapes(self, template: list[str]) -> ProblemSize: res_type = ir.RankedTensorType(conv_op.operands[2].type) dim_info = ConvDimInfo.from_rhs_res(rhs_type, res_type) return ProblemSize( - MatmulSize( - M=dim_info.oh * dim_info.ow, - N=dim_info.oc, - K=dim_info.fh * dim_info.fw * dim_info.ic, - B=dim_info.n, + matmul_size=ContractionSizes( + M=[dim_info.n, dim_info.oh, dim_info.ow], + N=[dim_info.oc], + K=[dim_info.fh, dim_info.fw, dim_info.ic], ), lhs_type=ShapedType(lhs_type.shape, lhs_type.element_type), rhs_type=ShapedType(rhs_type.shape, rhs_type.element_type), res_type=ShapedType(res_type.shape, res_type.element_type), dispatch_kind=DispatchKind.conv, + contraction_dims=ContractionDimensions( + m=[0, 1, 2], + n=[3], + k=[4, 5, 6], + ), ) diff --git a/tuner/tuner/dispatch_parser_test.py b/tuner/tuner/dispatch_parser_test.py index c35b17bed..7ddb0bb84 100644 --- 
a/tuner/tuner/dispatch_parser_test.py +++ b/tuner/tuner/dispatch_parser_test.py @@ -78,10 +78,10 @@ def test_get_contraction_operation(tuner_ctx: common.TunerContext) -> None: assert mmt_op is not None assert isinstance(mmt_op.opview, linalg.GenericOp) shapes: common.ProblemSize = parser.get_shapes(transpose_b_str.splitlines()) - assert shapes.matmul_size.B == 1 - assert shapes.matmul_size.M == 16 - assert shapes.matmul_size.N == 32 - assert shapes.matmul_size.K == 64 + assert shapes.matmul_size.B == [] + assert shapes.matmul_size.M == [16] + assert shapes.matmul_size.N == [32] + assert shapes.matmul_size.K == [64] assert shapes.lhs_type.shape == [16, 64] assert isinstance(shapes.lhs_type.element_type, ir.F16Type) assert shapes.rhs_type.shape == [32, 64] @@ -102,10 +102,32 @@ def test_get_contraction_operation(tuner_ctx: common.TunerContext) -> None: module = ir.Module.parse(bmm_transposed_inputs_str, context) mmt_op = parser.get_contraction_operation(module) shapes = parser.get_shapes(bmm_transposed_inputs_str.splitlines()) - assert shapes.matmul_size.B == 5 - assert shapes.matmul_size.M == 8 - assert shapes.matmul_size.N == 40 - assert shapes.matmul_size.K == 128 + assert shapes.matmul_size.B == [5] + assert shapes.matmul_size.M == [8] + assert shapes.matmul_size.N == [40] + assert shapes.matmul_size.K == [128] + + with ir.Location.unknown(): + bmm_transposed_inputs_str = CONTRACTION_TEMPLATE.format( + lhs_type=ir.RankedTensorType.get( + [16, 8, 15, 16, 64, 256], ir.F16Type.get() + ), + rhs_type=ir.RankedTensorType.get( + [16, 9, 15, 16, 128, 256], ir.F16Type.get() + ), + res_type=ir.RankedTensorType.get([16, 8, 9, 16, 64, 128], ir.F32Type.get()), + lhs_map="affine_map<(b0, m0, n0, k0, b1, m1, n1, k1) -> (b0, m0, k0, b1, m1, k1)>", + rhs_map="affine_map<(b0, m0, n0, k0, b1, m1, n1, k1) -> (b0, n0, k0, b1, n1, k1)>", + res_map="affine_map<(b0, m0, n0, k0, b1, m1, n1, k1) -> (b0, m0, n0, b1, m1, n1)>", + iterator_types='["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "parallel", "reduction"]', + ) + module = ir.Module.parse(bmm_transposed_inputs_str, context) + mmt_op = parser.get_contraction_operation(module) + shapes = parser.get_shapes(bmm_transposed_inputs_str.splitlines()) + assert shapes.matmul_size.B == [16, 16] + assert shapes.matmul_size.M == [8, 64] + assert shapes.matmul_size.N == [9, 128] + assert shapes.matmul_size.K == [15, 256] def test_get_conv_operation(tuner_ctx: common.TunerContext) -> None: diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py index 4e2a97ec8..fab86c369 100644 --- a/tuner/tuner/libtuner.py +++ b/tuner/tuner/libtuner.py @@ -227,6 +227,11 @@ class ExecutionPhases(str, Enum): benchmark_models = "benchmark-models" +class CodegenPipelines(str, Enum): + llvmgpu_vector_distribute = "llvmgpu_vector_distribute" + llvmgpu_tile_and_fuse = "llvmgpu_tile_and_fuse" + + def parse_arguments( initial_parser: Optional[argparse.ArgumentParser] = None, ) -> argparse.Namespace: @@ -298,6 +303,12 @@ def parse_arguments( candidate_gen_args.add_argument( "--tile-dims", help="Map of tile size matmul dims", type=str, default="mnk" ) + general_args.add_argument( + "--codegen-pipeline", + choices=[x.value for x in CodegenPipelines], + default=CodegenPipelines.llvmgpu_vector_distribute, + help="Codegen pipeline to tune for", + ) return parser.parse_args() @@ -499,7 +510,9 @@ def run_iree_benchmark_module_command(benchmark_pack: BenchmarkPack): ) times = [] + logging.debug(f"candidate {candidate_id} benchmark_results: {benchmark_results}") for 
benchmark_result in benchmark_results: + logging.debug(f"candidate {candidate_id} benchmark_result: {benchmark_result}") benchmark_name = benchmark_result.benchmark_name # With multiple benchmark results, there will be `real_time_mean`, but # not with single iteration benchmark results, so ignore the mean time @@ -601,6 +614,16 @@ def find_collisions( return collisions_exist, hash_values +def get_iree_codegen_pipeline(pipeline: CodegenPipelines): + match pipeline: + case CodegenPipelines.llvmgpu_vector_distribute: + return iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute + case CodegenPipelines.llvmgpu_tile_and_fuse: + return iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse + case _: + assert False, "unexpected codegen pipeline" + + def generate_candidate_specs( args: argparse.Namespace, path_config: PathConfig, @@ -628,6 +651,7 @@ def generate_candidate_specs( tuner_context=tuning_client.tuner_context, limit=args.num_candidates, num_subgroups=args.num_subgroups, + codegen_pipeline=get_iree_codegen_pipeline(args.codegen_pipeline), ) logging.debug("candidate_gen.py ends") handle_error( diff --git a/tuner/tuner/op_matchers.py b/tuner/tuner/op_matchers.py index db953fbb3..f3966b97d 100644 --- a/tuner/tuner/op_matchers.py +++ b/tuner/tuner/op_matchers.py @@ -170,10 +170,10 @@ def match_indexing_maps(self, maps: list[ir.AffineMap]) -> bool: return False self.contraction_dimensions = ContractionDimensions( - batch=batch_dims, m=m_dims, n=n_dims, k=k_dims, + batch=batch_dims, ) self.lhs_dims = lhs_dims self.rhs_dims = rhs_dims From ab29d88540d8d25839b8d07450f04db0fae19f6f Mon Sep 17 00:00:00 2001 From: Avinash Sharma Date: Tue, 7 Jan 2025 13:16:26 -0800 Subject: [PATCH 05/35] Update nightly llama benchmarking tests (#754) - Updates nightly llama benchmarking tests to benchmark input token lengths of 128 and 2048 for llama 8b, 70b, and 405b. 
- Switch IREE compile flag from `--iree-hal-target-backends` to `--iree-hal-target-device` TODO: Add 405b decode benchmark calls to 405b fp16 tests when decode is fixed --------- Signed-off-by: aviator19941 Co-authored-by: Archana Ramalingam <98564406+archana-ramalingam@users.noreply.github.com> Co-authored-by: archana-ramalingam --- .github/workflows/ci_eval.yaml | 2 +- .github/workflows/ci_eval_short.yaml | 2 +- sharktank/conftest.py | 8 +- .../sharktank/evaluate/perplexity_iree.py | 18 +- sharktank/sharktank/utils/export_artifacts.py | 14 +- .../tests/evaluate/perplexity_iree_test.py | 16 +- .../models/llama/benchmark_amdgpu_test.py | 466 ++++++++++++------ 7 files changed, 338 insertions(+), 188 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index f2db697d7..fe29f54d5 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -70,7 +70,7 @@ jobs: - name: Run perplexity test with IREE run: | source ${VENV_DIR}/bin/activate - pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --run-nightly-llama-tests --bs=100 --iree-device=hip://0 --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/perplexity/iree_perplexity/index.html + pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --run-nightly-llama-tests --bs=100 --iree-device=hip://0 --iree-hip-target=gfx942 --iree-hal-target-device=hip --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/perplexity/iree_perplexity/index.html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 diff --git a/.github/workflows/ci_eval_short.yaml b/.github/workflows/ci_eval_short.yaml index 385a54261..05c7fa415 100644 --- a/.github/workflows/ci_eval_short.yaml +++ b/.github/workflows/ci_eval_short.yaml @@ -69,4 +69,4 @@ jobs: - name: Run perplexity test with vmfb run: | source ${VENV_DIR}/bin/activate - pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --bs=5 --iree-device=hip://0 --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json + pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --bs=5 --iree-device=hip://0 --iree-hip-target=gfx942 --iree-hal-target-device=hip --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json diff --git a/sharktank/conftest.py b/sharktank/conftest.py index 5d16b5ff2..8ae3fe629 100644 --- a/sharktank/conftest.py +++ b/sharktank/conftest.py @@ -226,9 +226,9 @@ def pytest_addoption(parser): ) parser.addoption( - "--iree-hal-target-backends", + "--iree-hal-target-device", action="store", - help="Specify the iree-hal target backend (e.g., rocm)", + help="Specify the iree-hal target device (e.g., hip)", ) parser.addoption( @@ -354,8 +354,8 @@ def get_iree_flags(request: FixtureRequest): model_path["iree_hip_target"] = set_fixture_from_cli_option( request, "--iree-hip-target", "iree_hip_target" ) - model_path["iree_hal_target_backends"] = set_fixture_from_cli_option( - request, "--iree-hal-target-backends", "iree_hal_target_backends" + model_path["iree_hal_target_device"] = set_fixture_from_cli_option( 
+ request, "--iree-hal-target-device", "iree_hal_target_device" ) diff --git a/sharktank/sharktank/evaluate/perplexity_iree.py b/sharktank/sharktank/evaluate/perplexity_iree.py index c47726f0e..f42a4cf4a 100644 --- a/sharktank/sharktank/evaluate/perplexity_iree.py +++ b/sharktank/sharktank/evaluate/perplexity_iree.py @@ -64,7 +64,7 @@ def __init__( torch_device, iree_device, iree_hip_target, - iree_hal_target_backends, + iree_hal_target_device, kv_cache_type, tensor_parallelism_size, attention_kernel, @@ -73,7 +73,7 @@ def __init__( self.torch_device = torch_device self.iree_device = iree_device self.iree_hip_target = iree_hip_target - self.iree_hal_target_backends = iree_hal_target_backends + self.iree_hal_target_device = iree_hal_target_device self.kv_cache_type = kv_cache_type self.block_seq_stride = block_seq_stride self.activation_dtype = torch.float16 @@ -135,7 +135,7 @@ def compile_model(self, weight_path_str): irpa_path=self.weight_path_str, batch_size=self.bs, iree_hip_target=self.iree_hip_target, - iree_hal_target_backends=self.iree_hal_target_backends, + iree_hal_target_device=self.iree_hal_target_device, attention_kernel=self.attention_kernel, tensor_parallelism_size=self.tensor_parallelism_size, block_seq_stride=self.block_seq_stride, @@ -392,7 +392,7 @@ def run_perplexity( torch_device, iree_device, iree_hip_target, - iree_hal_target_backends, + iree_hal_target_device, kv_cache_type, tensor_parallelism_size, attention_kernel, @@ -404,7 +404,7 @@ def run_perplexity( torch_device=torch_device, iree_device=iree_device, iree_hip_target=iree_hip_target, - iree_hal_target_backends=iree_hal_target_backends, + iree_hal_target_device=iree_hal_target_device, kv_cache_type=kv_cache_type, tensor_parallelism_size=tensor_parallelism_size, attention_kernel=attention_kernel, @@ -450,10 +450,10 @@ def main(argv): help="Specify the iree-hip target version (e.g., gfx942)", ) parser.add_argument( - "--iree-hal-target-backends", + "--iree-hal-target-device", action="store", - default="rocm", - help="Specify the iree-hal target backends (e.g., rocm)", + default="hip", + help="Specify the iree-hal target device (e.g., hip, cpu)", ) parser.add_argument("--kv-cache-type", default="paged", help="KV cache type") parser.add_argument( @@ -485,7 +485,7 @@ def main(argv): torch_device=torch_device, iree_device=args.iree_device, iree_hip_target=args.iree_hip_target, - iree_hal_target_backends=args.iree_hal_target_backends, + iree_hal_target_device=args.iree_hal_target_device, kv_cache_type=args.kv_cache_type, tensor_parallelism_size=args.tensor_parallelism_size, attention_kernel=args.attention_kernel, diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 75cdbab7a..4045e90a5 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -89,10 +89,10 @@ def __init__( irpa_path: str, batch_size: int, iree_hip_target: str, - iree_hal_target_backends: str, attention_kernel: str, tensor_parallelism_size: int, block_seq_stride: int, + iree_hal_target_device: str, ): self.sharktank_dir = str( Path(os.path.dirname(os.path.abspath(__file__))).parent.parent.parent @@ -100,7 +100,7 @@ def __init__( self.irpa_path = irpa_path self.batch_size = batch_size self.iree_hip_target = iree_hip_target - self.iree_hal_target_backends = iree_hal_target_backends + self.iree_hal_target_device = iree_hal_target_device self.attention_kernel = attention_kernel self.tensor_parallelism_size = tensor_parallelism_size 
self.block_seq_stride = block_seq_stride @@ -216,15 +216,18 @@ def compile_to_vmfb( f"iree-compile", f"{mlir_path}", f"--iree-hip-target={self.iree_hip_target}", - f"--iree-hal-target-backends={self.iree_hal_target_backends}", f"-o={vmfb_path}", ] if self.tensor_parallelism_size > 1: iree_hal_target_devices = [ - f"--iree-hal-target-device=hip[{i}]" + f"--iree-hal-target-device={self.iree_hal_target_device}[{i}]" for i in range(self.tensor_parallelism_size) ] - compile_args += iree_hal_target_devices + else: + iree_hal_target_devices = [ + f"--iree-hal-target-device={self.iree_hal_target_device}" + ] + compile_args += iree_hal_target_devices if hal_dump_path: compile_args += [ f"--iree-hal-dump-executable-files-to={hal_dump_path}/files" @@ -283,7 +286,6 @@ def iree_benchmark_vmfb( benchmark_args += [ "iree-benchmark-module", "--hip_use_streams=true", - "--device_allocator=caching", f"--module={vmfb_name}", ] benchmark_args += params diff --git a/sharktank/tests/evaluate/perplexity_iree_test.py b/sharktank/tests/evaluate/perplexity_iree_test.py index d10d9f5db..1e42bde9c 100644 --- a/sharktank/tests/evaluate/perplexity_iree_test.py +++ b/sharktank/tests/evaluate/perplexity_iree_test.py @@ -46,7 +46,7 @@ def test_llama3_8B_f16_decomposed(self): f"--irpa-file={self.llama3_8b_f16_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", f"--iree-device={self.iree_device}", - f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hal-target-device={self.iree_hal_target_device}", f"--iree-hip-target={self.iree_hip_target}", f"--tensor-parallelism-size=1", f"--attention-kernel=decomposed", @@ -82,7 +82,7 @@ def test_llama3_8B_f16(self): f"--irpa-file={self.llama3_8b_f16_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", f"--iree-device={self.iree_device}", - f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hal-target-device={self.iree_hal_target_device}", f"--iree-hip-target={self.iree_hip_target}", f"--tensor-parallelism-size=1", f"--attention-kernel=torch_sdpa", @@ -118,7 +118,7 @@ def test_llama3_8B_fp8_decomposed(self): f"--irpa-file={self.llama3_8b_fp8_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", f"--iree-device={self.iree_device}", - f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hal-target-device={self.iree_hal_target_device}", f"--iree-hip-target={self.iree_hip_target}", f"--tensor-parallelism-size=1", f"--attention-kernel=decomposed", @@ -154,7 +154,7 @@ def test_llama3_8B_fp8(self): f"--irpa-file={self.llama3_8b_fp8_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", f"--iree-device={self.iree_device}", - f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hal-target-device={self.iree_hal_target_device}", f"--iree-hip-target={self.iree_hip_target}", f"--tensor-parallelism-size=1", f"--attention-kernel=torch_sdpa", @@ -192,7 +192,7 @@ def test_llama3_405B_f16_decomposed(self): f"--irpa-file={self.llama3_405b_f16_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", f"--iree-device={self.iree_device}", - f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hal-target-device={self.iree_hal_target_device}", f"--iree-hip-target={self.iree_hip_target}", f"--tensor-parallelism-size={self.tensor_parallelism_size}", f"--attention-kernel=decomposed", @@ -228,7 +228,7 @@ def test_llama3_405B_f16(self): f"--irpa-file={self.llama3_405b_f16_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", 
f"--iree-device={self.iree_device}", - f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hal-target-device={self.iree_hal_target_device}", f"--iree-hip-target={self.iree_hip_target}", f"--tensor-parallelism-size={self.tensor_parallelism_size}", f"--attention-kernel=torch_sdpa", @@ -264,7 +264,7 @@ def test_llama3_405B_fp8_decomposed(self): f"--irpa-file={self.llama3_405b_fp8_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", f"--iree-device={self.iree_device}", - f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hal-target-device={self.iree_hal_target_device}", f"--iree-hip-target={self.iree_hip_target}", f"--tensor-parallelism-size={self.tensor_parallelism_size}", f"--attention-kernel=decomposed", @@ -300,7 +300,7 @@ def test_llama3_405B_fp8(self): f"--irpa-file={self.llama3_405b_fp8_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", f"--iree-device={self.iree_device}", - f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hal-target-device={self.iree_hal_target_device}", f"--iree-hip-target={self.iree_hip_target}", f"--tensor-parallelism-size={self.tensor_parallelism_size}", f"--attention-kernel=torch_sdpa", diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index 0c45bdffa..a079f75ef 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -68,6 +68,7 @@ def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it self.artifacts_dir = Path("/data/llama3.1/weights/8b") + self.artifacts_dir_2048 = Path("/shark-dev/8b") self.irpa_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.irpa" self.irpa_path_fp8 = self.artifacts_dir / "f8/llama3.1_8b_fp8.irpa" self.tensor_parallelism_size = 1 @@ -78,7 +79,7 @@ def setUp(self): irpa_path=str(self.irpa_path), batch_size=4, iree_hip_target="gfx942", - iree_hal_target_backends="rocm", + iree_hal_target_device="hip", attention_kernel="torch", tensor_parallelism_size=self.tensor_parallelism_size, block_seq_stride=32, @@ -87,7 +88,7 @@ def setUp(self): irpa_path=str(self.irpa_path_fp8), batch_size=4, iree_hip_target="gfx942", - iree_hal_target_backends="rocm", + iree_hal_target_device="hip", attention_kernel="decomposed", tensor_parallelism_size=self.tensor_parallelism_size, block_seq_stride=32, @@ -96,45 +97,57 @@ def setUp(self): irpa_path=str(self.irpa_path_fp8), batch_size=4, iree_hip_target="gfx942", - iree_hal_target_backends="rocm", + iree_hal_target_device="hip", attention_kernel="torch", tensor_parallelism_size=self.tensor_parallelism_size, block_seq_stride=32, ) - self.prefill_args_bs4_128_in_tokens_stride_32_f16 = ( + self.prefill_args_bs4_128_stride_32_f16 = ( self.artifacts_dir / "prefill_args_bs4_128_stride_32" ) - self.prefill_args_bs4_2048_in_tokens_f16 = ( - self.artifacts_dir / "prefill_args_bs4_2048" - ) - self.decode_args_bs4_128_in_tokens_stride_32_f16 = ( + self.decode_args_bs4_128_stride_32_f16 = ( self.artifacts_dir / "decode_args_bs4_128_stride_32" ) + self.prefill_args_bs4_2048_stride_32_f16 = ( + self.artifacts_dir_2048 / "prefill_args_bs4_2048_stride_32" + ) + self.decode_args_bs4_2048_stride_32_f16 = ( + self.artifacts_dir_2048 / "decode_args_bs4_2048_stride_32" + ) self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" self.iree_run_prefill_nondecomposed_args_fp16 = [ 
"--function=prefill_bs4", - f"--input=@{self.prefill_args_bs4_128_in_tokens_stride_32_f16}/tokens.npy", - f"--input=@{self.prefill_args_bs4_128_in_tokens_stride_32_f16}/seq_lens.npy", - f"--input=@{self.prefill_args_bs4_128_in_tokens_stride_32_f16}/seq_block_ids.npy", - f"--input=@{self.prefill_args_bs4_128_in_tokens_stride_32_f16}/cs_f16.npy", + f"--input=@{self.prefill_args_bs4_128_stride_32_f16}/tokens.npy", + f"--input=@{self.prefill_args_bs4_128_stride_32_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_128_stride_32_f16}/seq_block_ids.npy", + f"--input=@{self.prefill_args_bs4_128_stride_32_f16}/cs_f16.npy", + "--benchmark_repetitions=3", + ] + self.iree_run_decode_nondecomposed_args_f16 = [ + "--function=decode_bs4", + f"--input=@{self.decode_args_bs4_128_stride_32_f16}/next_tokens.npy", + f"--input=@{self.decode_args_bs4_128_stride_32_f16}/seq_lens.npy", + f"--input=@{self.decode_args_bs4_128_stride_32_f16}/start_positions.npy", + f"--input=@{self.decode_args_bs4_128_stride_32_f16}/seq_block_ids.npy", + f"--input=@{self.decode_args_bs4_128_stride_32_f16}/cs_f16.npy", "--benchmark_repetitions=3", ] self.iree_run_prefill_nondecomposed_args_fp16_2048 = [ "--function=prefill_bs4", - f"--input=@{self.prefill_args_bs4_2048_in_tokens_f16}/tokens.npy", - f"--input=@{self.prefill_args_bs4_2048_in_tokens_f16}/seq_lens.npy", - f"--input=@{self.prefill_args_bs4_2048_in_tokens_f16}/seq_block_ids.npy", - f"--input=@{self.prefill_args_bs4_2048_in_tokens_f16}/cs_f16.npy", + f"--input=@{self.prefill_args_bs4_2048_stride_32_f16}/tokens.npy", + f"--input=@{self.prefill_args_bs4_2048_stride_32_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_2048_stride_32_f16}/seq_block_ids.npy", + f"--input=@{self.prefill_args_bs4_2048_stride_32_f16}/cs_f16.npy", "--benchmark_repetitions=3", ] - self.iree_run_decode_nondecomposed_args_f16 = [ + self.iree_run_decode_nondecomposed_args_fp16_2048 = [ "--function=decode_bs4", - f"--input=@{self.decode_args_bs4_128_in_tokens_stride_32_f16}/next_tokens.npy", - f"--input=@{self.decode_args_bs4_128_in_tokens_stride_32_f16}/seq_lens.npy", - f"--input=@{self.decode_args_bs4_128_in_tokens_stride_32_f16}/start_positions.npy", - f"--input=@{self.decode_args_bs4_128_in_tokens_stride_32_f16}/seq_block_ids.npy", - f"--input=@{self.decode_args_bs4_128_in_tokens_stride_32_f16}/cs_f16.npy", + f"--input=@{self.decode_args_bs4_2048_stride_32_f16}/next_tokens.npy", + f"--input=@{self.decode_args_bs4_2048_stride_32_f16}/seq_lens.npy", + f"--input=@{self.decode_args_bs4_2048_stride_32_f16}/start_positions.npy", + f"--input=@{self.decode_args_bs4_2048_stride_32_f16}/seq_block_ids.npy", + f"--input=@{self.decode_args_bs4_2048_stride_32_f16}/cs_f16.npy", "--benchmark_repetitions=3", ] self.iree_run_prefill_args_fp8 = [ @@ -188,8 +201,8 @@ def testBenchmark8B_f16_Non_Decomposed_Prefill_Input_Len_128(self): ) @skipif_run_quick_llama_test - def testBenchmark8B_f16_Non_Decomposed_Prefill_Input_Len_2048(self): - output_file_name = self.dir_path_8b / "f16_torch_prefill_2048" + def testBenchmark8B_f16_Non_Decomposed_Input_Len_128(self): + output_file_name = self.dir_path_8b / "f16_torch_128" output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file( suffix=".mlir", prefix=output_file_name ) @@ -199,11 +212,9 @@ def testBenchmark8B_f16_Non_Decomposed_Prefill_Input_Len_2048(self): output_vmfb = self.llama8b_f16_torch_sdpa_artifacts.create_file( suffix=".vmfb", prefix=output_file_name ) - self.llama8b_f16_torch_sdpa_artifacts.block_seq_stride = 16 export_return_code = 
self.llama8b_f16_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, json_path=output_json, - skip_decode=True, ) self.llama8b_f16_torch_sdpa_artifacts.compile_to_vmfb( mlir_path=str(output_mlir), @@ -217,13 +228,21 @@ def testBenchmark8B_f16_Non_Decomposed_Prefill_Input_Len_2048(self): hip_device_id=self.iree_device, vmfb_name=output_vmfb, irpa_path=self.irpa_path, - args=self.iree_run_prefill_nondecomposed_args_fp16_2048, + args=self.iree_run_prefill_nondecomposed_args_fp16, + cwd=self.repo_root, + ) + # benchmark decode + self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( + hip_device_id=self.iree_device, + vmfb_name=output_vmfb, + irpa_path=self.irpa_path, + args=self.iree_run_decode_nondecomposed_args_f16, cwd=self.repo_root, ) @skipif_run_quick_llama_test - def testBenchmark8B_f16_Non_Decomposed(self): - output_file_name = self.dir_path_8b / "f16_torch" + def testBenchmark8B_f16_Non_Decomposed_Input_Len_2048(self): + output_file_name = self.dir_path_8b / "f16_torch_2048" output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file( suffix=".mlir", prefix=output_file_name ) @@ -249,7 +268,7 @@ def testBenchmark8B_f16_Non_Decomposed(self): hip_device_id=self.iree_device, vmfb_name=output_vmfb, irpa_path=self.irpa_path, - args=self.iree_run_prefill_nondecomposed_args_fp16, + args=self.iree_run_prefill_nondecomposed_args_fp16_2048, cwd=self.repo_root, ) # benchmark decode @@ -257,47 +276,7 @@ def testBenchmark8B_f16_Non_Decomposed(self): hip_device_id=self.iree_device, vmfb_name=output_vmfb, irpa_path=self.irpa_path, - args=self.iree_run_decode_nondecomposed_args_f16, - cwd=self.repo_root, - ) - - @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) - def testBenchmark8B_fp8_Decomposed(self): - output_file_name = self.dir_path_8b / "fp8_decomposed" - output_mlir = self.llama8b_fp8_decomposed_artifacts.create_file( - suffix=".mlir", prefix=output_file_name - ) - output_json = self.llama8b_fp8_decomposed_artifacts.create_file( - suffix=".json", prefix=output_file_name - ) - output_vmfb = self.llama8b_fp8_decomposed_artifacts.create_file( - suffix=".vmfb", prefix=output_file_name - ) - export_return_code = self.llama8b_fp8_decomposed_artifacts.export_to_mlir( - mlir_path=output_mlir, - json_path=output_json, - ) - self.llama8b_fp8_decomposed_artifacts.compile_to_vmfb( - mlir_path=str(output_mlir), - vmfb_path=output_vmfb, - hal_dump_path=output_file_name, - cwd=self.repo_root, - args=self.compile_args, - ) - # benchmark prefill - self.llama8b_fp8_decomposed_artifacts.iree_benchmark_vmfb( - hip_device_id=self.iree_device, - vmfb_name=output_vmfb, - irpa_path=self.irpa_path_fp8, - args=self.iree_run_prefill_args, - cwd=self.repo_root, - ) - # benchmark decode - self.llama8b_fp8_decomposed_artifacts.iree_benchmark_vmfb( - hip_device_id=self.iree_device, - vmfb_name=output_vmfb, - irpa_path=self.irpa_path_fp8, - args=self.iree_run_decode_args, + args=self.iree_run_decode_nondecomposed_args_fp16_2048, cwd=self.repo_root, ) @@ -349,6 +328,7 @@ def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it self.artifacts_dir = Path("/data/llama3.1/weights/70b") + self.artifacts_dir_2048 = Path("/shark-dev/70b") self.irpa_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.irpa" self.irpa_path_fp8 = self.artifacts_dir / "f8/llama70b_fp8.irpa" self.tensor_parallelism_size = 8 @@ -359,61 +339,105 @@ def setUp(self): irpa_path=str(self.irpa_path), batch_size=4, iree_hip_target="gfx942", - iree_hal_target_backends="rocm", + 
iree_hal_target_device="hip", attention_kernel="torch", tensor_parallelism_size=self.tensor_parallelism_size, - block_seq_stride=16, + block_seq_stride=32, ) self.llama70b_fp8_decomposed_artifacts = ExportArtifacts( irpa_path=str(self.irpa_path_fp8), batch_size=4, iree_hip_target="gfx942", - iree_hal_target_backends="rocm", + iree_hal_target_device="hip", attention_kernel="decomposed", tensor_parallelism_size=self.tensor_parallelism_size, - block_seq_stride=16, + block_seq_stride=32, ) self.llama70b_fp8_torch_sdpa_artifacts = ExportArtifacts( irpa_path=str(self.irpa_path_fp8), batch_size=4, iree_hip_target="gfx942", - iree_hal_target_backends="rocm", + iree_hal_target_device="hip", attention_kernel="torch", tensor_parallelism_size=self.tensor_parallelism_size, - block_seq_stride=16, + block_seq_stride=32, + ) + self.prefill_args_bs4_128_stride_32_tp8_f16 = ( + self.artifacts_dir / "prefill_args_bs4_128_stride_32_tp8" + ) + self.decode_args_bs4_128_stride_32_tp8_f16 = ( + self.artifacts_dir / "decode_args_bs4_128_stride_32_tp8" ) - self.prefill_args_f16 = self.artifacts_dir / "prefill_args" - self.prefill_args_bs4_128_in_tokens_f16 = ( - self.artifacts_dir / "prefill_args_bs4_128" + self.prefill_args_bs4_2048_stride_32_tp8_f16 = ( + self.artifacts_dir_2048 / "prefill_args_bs4_2048_stride_32_tp8" + ) + self.decode_args_bs4_2048_stride_32_tp8_f16 = ( + self.artifacts_dir_2048 / "decode_args_bs4_2048_stride_32_tp8" ) - self.decode_args_f16 = self.artifacts_dir / "decode_args" self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" - self.iree_run_prefill_args = [ - "--function=prefill_bs4", - f"--input=@{self.prefill_args_f16}/tokens.npy", - f"--input=@{self.prefill_args_f16}/seq_lens.npy", - f"--input=@{self.prefill_args_f16}/seq_block_ids.npy", - f"--input=@{self.prefill_args_f16}/cache_state_f16.npy", - "--benchmark_repetitions=3", - ] - self.iree_run_prefill_nondecomposed_args_fp16 = [ - "--function=prefill_bs4", - f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy", - f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy", - f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy", - f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy", - "--benchmark_repetitions=3", - ] - self.iree_run_decode_args = [ - "--function=decode_bs4", - f"--input=@{self.decode_args_f16}/tokens.npy", - f"--input=@{self.decode_args_f16}/seq_lens.npy", - f"--input=@{self.decode_args_f16}/start_positions.npy", - f"--input=@{self.decode_args_f16}/seq_block_ids.npy", - f"--input=@{self.decode_args_f16}/cache_state_f16.npy", - "--benchmark_repetitions=3", - ] + self.iree_run_prefill_nondecomposed_args_128_tp8_fp16 = ( + [ + "--function=prefill_bs4", + f"--input=@{self.prefill_args_bs4_128_stride_32_tp8_f16}/tokens.npy", + f"--input=@{self.prefill_args_bs4_128_stride_32_tp8_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_128_stride_32_tp8_f16}/seq_block_ids.npy", + ] + + [ + f"--input=@{self.prefill_args_bs4_128_stride_32_tp8_f16}/cs_f16_shard_{i}.npy" + for i in range(self.tensor_parallelism_size) + ] + + [ + "--benchmark_repetitions=3", + ] + ) + self.iree_run_decode_nondecomposed_args_128_tp8_fp16 = ( + [ + "--function=decode_bs4", + f"--input=@{self.decode_args_bs4_128_stride_32_tp8_f16}/next_tokens.npy", + f"--input=@{self.decode_args_bs4_128_stride_32_tp8_f16}/seq_lens.npy", + f"--input=@{self.decode_args_bs4_128_stride_32_tp8_f16}/start_positions.npy", + 
f"--input=@{self.decode_args_bs4_128_stride_32_tp8_f16}/seq_block_ids.npy", + ] + + [ + f"--input=@{self.decode_args_bs4_128_stride_32_tp8_f16}/cs_f16_shard_{i}.npy" + for i in range(self.tensor_parallelism_size) + ] + + [ + "--benchmark_repetitions=3", + ] + ) + self.iree_run_prefill_nondecomposed_args_2048_tp8_fp16 = ( + [ + "--function=prefill_bs4", + f"--input=@{self.prefill_args_bs4_2048_stride_32_tp8_f16}/tokens.npy", + f"--input=@{self.prefill_args_bs4_2048_stride_32_tp8_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_2048_stride_32_tp8_f16}/seq_block_ids.npy", + ] + + [ + f"--input=@{self.prefill_args_bs4_2048_stride_32_tp8_f16}/cs_f16_shard_{i}.npy" + for i in range(self.tensor_parallelism_size) + ] + + [ + "--benchmark_repetitions=3", + ] + ) + self.iree_run_decode_nondecomposed_args_2048_tp8_fp16 = ( + [ + "--function=decode_bs4", + f"--input=@{self.decode_args_bs4_2048_stride_32_tp8_f16}/next_tokens.npy", + f"--input=@{self.decode_args_bs4_2048_stride_32_tp8_f16}/seq_lens.npy", + f"--input=@{self.decode_args_bs4_2048_stride_32_tp8_f16}/start_positions.npy", + f"--input=@{self.decode_args_bs4_2048_stride_32_tp8_f16}/seq_block_ids.npy", + ] + + [ + f"--input=@{self.decode_args_bs4_2048_stride_32_tp8_f16}/cs_f16_shard_{i}.npy" + for i in range(self.tensor_parallelism_size) + ] + + [ + "--benchmark_repetitions=3", + ] + ) self.iree_run_prefill_args_fp8 = [ "--function=prefill_bs4", f"--input=@{self.prefill_args_fp8}/tokens.npy", @@ -435,8 +459,8 @@ def setUp(self): @pytest.mark.xfail( reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException ) - def testBenchmark70B_f16_TP8_Non_Decomposed(self): - output_file_name = self.dir_path_70b / "f16_torch" + def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_128(self): + output_file_name = self.dir_path_70b / "f16_torch_128" output_mlir = self.llama70b_f16_torch_sdpa_artifacts.create_file( suffix=".mlir", prefix=output_file_name ) @@ -446,13 +470,12 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self): output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file( suffix=".vmfb", prefix=output_file_name ) - self.llama70b_f16_torch_sdpa_artifacts.attention_kernel = "torch" output_shard_file_name = ( self.artifacts_dir / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" ) if output_shard_file_name.exists(): - self.irpa_path = output_shard_file_name + self.llama70b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name export_return_code = self.llama70b_f16_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, json_path=output_json, @@ -469,7 +492,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self): hip_device_id=self.iree_device, vmfb_name=output_vmfb, irpa_path=self.irpa_path, - args=self.iree_run_prefill_args, + args=self.iree_run_prefill_nondecomposed_args_128_tp8_fp16, cwd=self.repo_root, ) # benchmark decode @@ -477,7 +500,55 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self): hip_device_id=self.iree_device, vmfb_name=output_vmfb, irpa_path=self.irpa_path, - args=self.iree_run_decode_args, + args=self.iree_run_decode_nondecomposed_args_128_tp8_fp16, + cwd=self.repo_root, + ) + + @pytest.mark.xfail( + reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException + ) + def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_2048(self): + output_file_name = self.dir_path_70b / "f16_torch_2048" + output_mlir = self.llama70b_f16_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name + ) + output_json = 
self.llama70b_f16_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.llama70b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name + export_return_code = self.llama70b_f16_torch_sdpa_artifacts.export_to_mlir( + mlir_path=output_mlir, + json_path=output_json, + ) + self.llama70b_f16_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, + cwd=self.repo_root, + args=self.compile_args, + ) + # benchmark prefill + self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( + hip_device_id=self.iree_device, + vmfb_name=output_vmfb, + irpa_path=self.irpa_path, + args=self.iree_run_prefill_nondecomposed_args_2048_tp8_fp16, + cwd=self.repo_root, + ) + # benchmark decode + self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( + hip_device_id=self.iree_device, + vmfb_name=output_vmfb, + irpa_path=self.irpa_path, + args=self.iree_run_decode_nondecomposed_args_2048_tp8_fp16, cwd=self.repo_root, ) @@ -500,7 +571,7 @@ def testBenchmark70B_fp8_TP8_Decomposed(self): / f"f8/tp8/llama3.1_70b_fp8_tp{self.tensor_parallelism_size}_parameters.irpa" ) if output_shard_file_name.exists(): - self.irpa_path = output_shard_file_name + self.llama70b_fp8_decomposed_artifacts.irpa_path = output_shard_file_name export_return_code = self.llama70b_fp8_decomposed_artifacts.export_to_mlir( mlir_path=output_mlir, json_path=output_json, @@ -548,7 +619,7 @@ def testBenchmark70B_fp8_TP8_Non_Decomposed(self): / f"f8/tp8/llama3.1_70b_fp8_tp{self.tensor_parallelism_size}_parameters.irpa" ) if output_shard_file_name.exists(): - self.irpa_path = output_shard_file_name + self.llama70b_fp8_torch_sdpa_artifacts.irpa_path = output_shard_file_name export_return_code = self.llama70b_fp8_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, json_path=output_json, @@ -585,6 +656,7 @@ def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it self.artifacts_dir = Path("/data/llama3.1/weights/405b") + self.artifacts_dir_2048 = Path("/shark-dev/405b") self.irpa_path = self.artifacts_dir / "fp16/llama3.1_405b_fp16.irpa" self.irpa_path_fp8 = self.artifacts_dir / "f8/llama3.1_405b_fp8.irpa" self.tensor_parallelism_size = 8 @@ -595,61 +667,105 @@ def setUp(self): irpa_path=str(self.irpa_path), batch_size=4, iree_hip_target="gfx942", - iree_hal_target_backends="rocm", + iree_hal_target_device="hip", attention_kernel="torch", tensor_parallelism_size=self.tensor_parallelism_size, - block_seq_stride=16, + block_seq_stride=32, ) self.llama405b_fp8_decomposed_artifacts = ExportArtifacts( irpa_path=str(self.irpa_path_fp8), batch_size=4, iree_hip_target="gfx942", - iree_hal_target_backends="rocm", + iree_hal_target_device="hip", attention_kernel="decomposed", tensor_parallelism_size=self.tensor_parallelism_size, - block_seq_stride=16, + block_seq_stride=32, ) self.llama405b_fp8_torch_sdpa_artifacts = ExportArtifacts( irpa_path=str(self.irpa_path_fp8), batch_size=4, iree_hip_target="gfx942", - iree_hal_target_backends="rocm", + iree_hal_target_device="hip", attention_kernel="torch", tensor_parallelism_size=self.tensor_parallelism_size, - block_seq_stride=16, + block_seq_stride=32, + ) + 
self.prefill_args_bs4_128_stride_32_tp8_f16 = ( + self.artifacts_dir / "prefill_args_bs4_128_stride_32_tp8" + ) + self.decode_args_bs4_128_stride_32_tp8_f16 = ( + self.artifacts_dir / "decode_args_bs4_128_stride_32_tp8" ) - self.prefill_args_f16 = self.artifacts_dir / "prefill_args" - self.prefill_args_bs4_128_in_tokens_f16 = ( - self.artifacts_dir / "prefill_args_bs4_128" + self.prefill_args_bs4_2048_stride_32_tp8_f16 = ( + self.artifacts_dir_2048 / "prefill_args_bs4_2048_stride_32_tp8" + ) + self.decode_args_bs4_2048_stride_32_tp8_f16 = ( + self.artifacts_dir_2048 / "decode_args_bs4_2048_stride_32_tp8" ) - self.decode_args_f16 = self.artifacts_dir / "decode_args" self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" - self.iree_run_prefill_args = [ - "--function=prefill_bs4", - f"--input=@{self.prefill_args_f16}/tokens.npy", - f"--input=@{self.prefill_args_f16}/seq_lens.npy", - f"--input=@{self.prefill_args_f16}/seq_block_ids.npy", - f"--input=@{self.prefill_args_f16}/cache_state_f16.npy", - "--benchmark_repetitions=3", - ] - self.iree_run_prefill_nondecomposed_args_fp16 = [ - "--function=prefill_bs4", - f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy", - f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy", - f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy", - f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy", - "--benchmark_repetitions=3", - ] - self.iree_run_decode_args = [ - "--function=decode_bs4", - f"--input=@{self.decode_args_f16}/tokens.npy", - f"--input=@{self.decode_args_f16}/seq_lens.npy", - f"--input=@{self.decode_args_f16}/start_positions.npy", - f"--input=@{self.decode_args_f16}/seq_block_ids.npy", - f"--input=@{self.decode_args_f16}/cache_state_f16.npy", - "--benchmark_repetitions=3", - ] + self.iree_run_prefill_nondecomposed_args_128_tp8_fp16 = ( + [ + "--function=prefill_bs4", + f"--input=@{self.prefill_args_bs4_128_stride_32_tp8_f16}/tokens.npy", + f"--input=@{self.prefill_args_bs4_128_stride_32_tp8_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_128_stride_32_tp8_f16}/seq_block_ids.npy", + ] + + [ + f"--input=@{self.prefill_args_bs4_128_stride_32_tp8_f16}/cs_f16_shard_{i}.npy" + for i in range(self.tensor_parallelism_size) + ] + + [ + "--benchmark_repetitions=3", + ] + ) + self.iree_run_decode_nondecomposed_args_128_tp8_fp16 = ( + [ + "--function=decode_bs4", + f"--input=@{self.decode_args_bs4_128_stride_32_tp8_f16}/next_tokens.npy", + f"--input=@{self.decode_args_bs4_128_stride_32_tp8_f16}/seq_lens.npy", + f"--input=@{self.decode_args_bs4_128_stride_32_tp8_f16}/start_positions.npy", + f"--input=@{self.decode_args_bs4_128_stride_32_tp8_f16}/seq_block_ids.npy", + ] + + [ + f"--input=@{self.decode_args_bs4_128_stride_32_tp8_f16}/cs_f16_shard_{i}.npy" + for i in range(self.tensor_parallelism_size) + ] + + [ + "--benchmark_repetitions=3", + ] + ) + self.iree_run_prefill_nondecomposed_args_2048_tp8_fp16 = ( + [ + "--function=prefill_bs4", + f"--input=@{self.prefill_args_bs4_2048_stride_32_tp8_f16}/tokens.npy", + f"--input=@{self.prefill_args_bs4_2048_stride_32_tp8_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_2048_stride_32_tp8_f16}/seq_block_ids.npy", + ] + + [ + f"--input=@{self.prefill_args_bs4_2048_stride_32_tp8_f16}/cs_f16_shard_{i}.npy" + for i in range(self.tensor_parallelism_size) + ] + + [ + "--benchmark_repetitions=3", + ] + ) + self.iree_run_decode_nondecomposed_args_2048_tp8_fp16 = ( + [ + 
"--function=decode_bs4", + f"--input=@{self.decode_args_bs4_2048_stride_32_tp8_f16}/next_tokens.npy", + f"--input=@{self.decode_args_bs4_2048_stride_32_tp8_f16}/seq_lens.npy", + f"--input=@{self.decode_args_bs4_2048_stride_32_tp8_f16}/start_positions.npy", + f"--input=@{self.decode_args_bs4_2048_stride_32_tp8_f16}/seq_block_ids.npy", + ] + + [ + f"--input=@{self.decode_args_bs4_2048_stride_32_tp8_f16}/cs_f16_shard_{i}.npy" + for i in range(self.tensor_parallelism_size) + ] + + [ + "--benchmark_repetitions=3", + ] + ) self.iree_run_prefill_args_fp8 = [ "--function=prefill_bs4", f"--input=@{self.prefill_args_fp8}/tokens.npy", @@ -671,8 +787,8 @@ def setUp(self): @pytest.mark.xfail( reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException ) - def testBenchmark405B_f16_TP8_Non_Decomposed(self): - output_file_name = self.dir_path_405b / "f16_torch" + def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128(self): + output_file_name = self.dir_path_405b / "f16_torch_128" output_mlir = self.llama405b_f16_torch_sdpa_artifacts.create_file( suffix=".mlir", prefix=output_file_name ) @@ -682,17 +798,15 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self): output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file( suffix=".vmfb", prefix=output_file_name ) - self.llama405b_f16_torch_sdpa_artifacts.attention_kernel = "torch" output_shard_file_name = ( self.artifacts_dir / f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" ) if output_shard_file_name.exists(): - self.irpa_path = output_shard_file_name + self.llama405b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, json_path=output_json, - skip_decode=True, ) self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb( mlir_path=str(output_mlir), @@ -706,17 +820,51 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self): hip_device_id=self.iree_device, vmfb_name=output_vmfb, irpa_path=self.irpa_path, - args=self.iree_run_prefill_args, + args=self.iree_run_prefill_nondecomposed_args_128_tp8_fp16, cwd=self.repo_root, ) - # benchmark decode + # TODO: benchmark decode + + @pytest.mark.xfail( + reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException + ) + def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_2048(self): + output_file_name = self.dir_path_405b / "f16_torch_2048" + output_mlir = self.llama405b_f16_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name + ) + output_json = self.llama405b_f16_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.llama405b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name + export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir( + mlir_path=output_mlir, + json_path=output_json, + ) + self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, + cwd=self.repo_root, + args=self.compile_args, + ) + # benchmark prefill self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.iree_device, vmfb_name=output_vmfb, irpa_path=self.irpa_path, - args=self.iree_run_decode_args, 
+ args=self.iree_run_prefill_nondecomposed_args_2048_tp8_fp16, cwd=self.repo_root, ) + # TODO: benchmark decode @pytest.mark.xfail( reason="KeyError in theta.py", strict=True, raises=ExportMlirException @@ -737,7 +885,7 @@ def testBenchmark405B_fp8_TP8_Decomposed(self): / f"f8/tp8/llama3.1_405b_fp8_tp{self.tensor_parallelism_size}_parameters.irpa" ) if output_shard_file_name.exists(): - self.irpa_path = output_shard_file_name + self.llama405b_fp8_decomposed_artifacts.irpa_path = output_shard_file_name export_return_code = self.llama405b_fp8_decomposed_artifacts.export_to_mlir( mlir_path=output_mlir, json_path=output_json, @@ -785,7 +933,7 @@ def testBenchmark405B_fp8_TP8_Non_Decomposed(self): / f"f8/tp8/llama3.1_405b_fp8_tp{self.tensor_parallelism_size}_parameters.irpa" ) if output_shard_file_name.exists(): - self.irpa_path = output_shard_file_name + self.llama405b_fp8_torch_sdpa_artifacts.irpa_path = output_shard_file_name export_return_code = self.llama405b_fp8_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, json_path=output_json, From ad236fd0007d675440c9907a1c97f9dd300d3c2d Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Tue, 7 Jan 2025 13:56:22 -0800 Subject: [PATCH 06/35] Add a timeout for shortfin unit tests. (#777) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I'm seeing stalls in `test_invoke_mobilenet_multi_fiber_per_fiber` from https://github.com/nod-ai/shark-ai/blob/main/shortfin/tests/invocation/mobilenet_program_test.py when the test program fails numerics checks. The other test cases fail and terminate as expected, without needing to use a timeout mechanism. Tested locally on Windows and the timeout worked (though it isn't pretty): ``` (.venv) λ pytest tests/ -rA -k test_invoke_mobilenet_multi_fiber_per_fiber --timeout 10 ======================================= test session starts ======================================= platform win32 -- Python 3.11.2, pytest-8.3.4, pluggy-1.5.0 rootdir: D:\dev\projects\shark-ai\shortfin configfile: pyproject.toml plugins: anyio-4.8.0, timeout-2.3.1 timeout: 10.0s timeout method: thread timeout func_only: False collected 264 items / 263 deselected / 1 selected tests\invocation\mobilenet_program_test.p +++++++++++++++++++++++++++++++++++++++++++++ Timeout +++++++++++++++++++++++++++++++++++++++++++++ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Captured stdout ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fibers: [Fiber(worker='__init__', devices=[cpu0]), Fiber(worker='__init__', devices=[cpu0]), Fiber(worker='__init__', devices=[cpu0]), Fiber(worker='__init__', devices=[cpu0]), Fiber(worker='__init__', devices=[cpu0])] Waiting for processes: [Process(pid=1, worker='__init__'), Process(pid=2, worker='__init__'), Process(pid=3, worker='__init__'), Process(pid=4, worker='__init__'), Process(pid=5, worker='__init__')] Process(pid=1, worker='__init__'): Start Process(pid=2, worker='__init__'): Start Process(pid=3, worker='__init__'): Start Process(pid=4, worker='__init__'): Start Process(pid=5, worker='__init__'): Start Process(pid=1, worker='__init__'): Program complete (+116ms) Process(pid=2, worker='__init__'): Program complete (+111ms) Process(pid=3, worker='__init__'): Program complete (+107ms) Process(pid=4, worker='__init__'): Program complete (+101ms) Process(pid=5, worker='__init__'): Program complete (+97ms) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Captured stderr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ D:\dev\projects\shark-ai\shortfin\src\shortfin/support/iree_helpers.h:316: UNKNOWN; 
Unhandled exception: Traceback (most recent call last): File "D:\dev\projects\shark-ai\shortfin\tests\invocation\mobilenet_program_test.py", line 77, in assert_mobilenet_ref_output RuntimeError: Async exception on ): assert 0.8119692911421882 == 5.01964943873882 ± 5.0e-06 comparison failed Obtained: 0.8119692911421882 Expected: 5.01964943873882 ± 5.0e-06 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Stack of Thread-4 () (9816) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ File "C:\Program Files\Python311\Lib\threading.py", line 995, in _bootstrap self._bootstrap_inner() File "C:\Program Files\Python311\Lib\threading.py", line 1038, in _bootstrap_inner self.run() File "C:\Program Files\Python311\Lib\threading.py", line 975, in run self._target(*self._args, **self._kwargs) ... ``` --- .github/workflows/ci-libshortfin.yml | 2 +- shortfin/requirements-tests.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-libshortfin.yml b/.github/workflows/ci-libshortfin.yml index 1c2f8aa89..299212657 100644 --- a/.github/workflows/ci-libshortfin.yml +++ b/.github/workflows/ci-libshortfin.yml @@ -131,7 +131,7 @@ jobs: working-directory: ${{ env.LIBSHORTFIN_DIR }} run: | ctest --timeout 30 --output-on-failure --test-dir build - pytest -s --durations=10 + pytest -s --durations=10 --timeout=30 # Depends on all other jobs to provide an aggregate job status. ci_libshortfin_summary: diff --git a/shortfin/requirements-tests.txt b/shortfin/requirements-tests.txt index c04c97af2..dc194db12 100644 --- a/shortfin/requirements-tests.txt +++ b/shortfin/requirements-tests.txt @@ -1,4 +1,5 @@ pytest +pytest-timeout requests fastapi onnx From 297d83dc5b9c5af1a10a7761f77aca9c3a1fd56e Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Tue, 7 Jan 2025 14:30:19 -0800 Subject: [PATCH 07/35] Use --opset-version provided by iree-import-onnx. (#776) This option was added in https://github.com/iree-org/iree/commit/d4975713a0aa3ba872807fc16585fb4b5a04e41d. 
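For reference, a minimal sketch of the equivalent command-line invocation (illustrative only, not part of this change; the file names are placeholders, and it assumes the `iree-import-onnx` console script from `iree-base-compiler` is on PATH):

```python
# Hypothetical example: let the importer upgrade the model opset during import
# via --opset-version, replacing the manual onnx.version_converter step that
# this patch removes from the test fixture.
import subprocess

subprocess.run(
    [
        "iree-import-onnx",
        "mobilenetv2-12.onnx",    # placeholder input model
        "--opset-version", "17",  # target opset handled by the importer
        "-o", "mobilenet.mlir",   # placeholder output path
    ],
    check=True,
)
```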
--- shortfin/tests/invocation/conftest.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/shortfin/tests/invocation/conftest.py b/shortfin/tests/invocation/conftest.py index e62373eb5..97bcadaa8 100644 --- a/shortfin/tests/invocation/conftest.py +++ b/shortfin/tests/invocation/conftest.py @@ -8,14 +8,6 @@ import urllib.request -def upgrade_onnx(original_path, converted_path): - import onnx - - original_model = onnx.load_model(original_path) - converted_model = onnx.version_converter.convert_version(original_model, 17) - onnx.save(converted_model, converted_path) - - @pytest.fixture(scope="session") def mobilenet_onnx_path(tmp_path_factory): try: @@ -23,16 +15,14 @@ def mobilenet_onnx_path(tmp_path_factory): except ModuleNotFoundError: raise pytest.skip("onnx python package not available") parent_dir = tmp_path_factory.mktemp("mobilenet_onnx") - orig_onnx_path = parent_dir / "mobilenet_orig.onnx" - upgraded_onnx_path = parent_dir / "mobilenet.onnx" - if not upgraded_onnx_path.exists(): + onnx_path = parent_dir / "mobilenet.onnx" + if not onnx_path.exists(): print("Downloading mobilenet.onnx") urllib.request.urlretrieve( "https://github.com/onnx/models/raw/main/validated/vision/classification/mobilenet/model/mobilenetv2-12.onnx", - orig_onnx_path, + onnx_path, ) - upgrade_onnx(orig_onnx_path, upgraded_onnx_path) - return upgraded_onnx_path + return onnx_path @pytest.fixture(scope="session") @@ -47,7 +37,7 @@ def mobilenet_compiled_path(mobilenet_onnx_path, compile_flags): if not vmfb_path.exists(): print("Compiling mobilenet") args = import_onnx.parse_arguments( - ["-o", str(mlir_path), str(mobilenet_onnx_path)] + ["-o", str(mlir_path), str(mobilenet_onnx_path), "--opset-version", "17"] ) import_onnx.main(args) tools.compile_file( From 126330d1bddfbde015bc98dc2e550d67c8de46de Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Tue, 7 Jan 2025 15:05:24 -0800 Subject: [PATCH 08/35] Bump pinned IREE version to 20241206. (#778) Note that the mobilenet program being tested was miscompiling and producing different output compared to onnxruntime prior to https://github.com/iree-org/iree/commit/d48071df356eec533554699ac1e658f6809aacbe. 
--- .github/workflows/ci_linux_x64_asan-libshortfin.yml | 4 ++-- requirements-iree-pinned.txt | 6 +++--- shortfin/tests/invocation/mobilenet_program_test.py | 5 ++++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci_linux_x64_asan-libshortfin.yml b/.github/workflows/ci_linux_x64_asan-libshortfin.yml index a0009b2ac..3f2cc0d41 100644 --- a/.github/workflows/ci_linux_x64_asan-libshortfin.yml +++ b/.github/workflows/ci_linux_x64_asan-libshortfin.yml @@ -104,7 +104,7 @@ jobs: uses: actions/cache/restore@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2 with: path: ${{ env.PYENV_ROOT }} - key: ${{ runner.os }}-python-deps-${{ hashFiles('shortfin/requirements-tests.txt') }}-v${{ env.CACHE_DEPS_VER }} + key: ${{ runner.os }}-python-deps-${{ hashFiles('shortfin/requirements-tests.txt', 'requirements-iree-pinned.txt') }}-v${{ env.CACHE_DEPS_VER }} - name: Restore Python ASan cache id: cache-python-asan @@ -152,4 +152,4 @@ jobs: working-directory: ${{ env.LIBSHORTFIN_DIR }} run: | eval "$(pyenv init -)" - pytest -s + pytest -s --durations=10 --timeout=30 diff --git a/requirements-iree-pinned.txt b/requirements-iree-pinned.txt index 4ca1246ee..11d7e4258 100644 --- a/requirements-iree-pinned.txt +++ b/requirements-iree-pinned.txt @@ -3,8 +3,8 @@ # Keep these versions synced with SHORTFIN_IREE_GIT_TAG in shortfin/CMakeLists.txt --pre --find-links https://iree.dev/pip-release-links.html -iree-base-compiler==3.1.0rc20241204 -iree-base-runtime==3.1.0rc20241204 +iree-base-compiler==3.1.0rc20241206 +iree-base-runtime==3.1.0rc20241206 # TODO(#760): include iree-turbine in this requirements file too? -# iree-turbine==3.1.0rc20241205 +# iree-turbine==3.1.0rc20241206 diff --git a/shortfin/tests/invocation/mobilenet_program_test.py b/shortfin/tests/invocation/mobilenet_program_test.py index ff7b9bbf2..502843bc3 100644 --- a/shortfin/tests/invocation/mobilenet_program_test.py +++ b/shortfin/tests/invocation/mobilenet_program_test.py @@ -74,7 +74,10 @@ async def assert_mobilenet_ref_output(device, device_output): absmean = functools.reduce( lambda x, y: x + abs(y) / len(flat_output), flat_output, 0.0 ) - assert absmean == pytest.approx(5.01964943873882) + # Note: this value was just copied from a sample run of the test. + # Comparison against a reference backend for this model is tested upstream + # in https://github.com/iree-org/iree-test-suites/tree/main/onnx_models. + assert absmean == pytest.approx(0.81196929) # Tests that a single invocation on a single fiber works. From 2c699fd316072f4f9f4597f730357fd873d18c6b Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Tue, 7 Jan 2025 15:30:20 -0800 Subject: [PATCH 09/35] Switch presubmit CI workflows to use pinned IREE versions. (#774) Progress on https://github.com/nod-ai/shark-ai/issues/760. The idea here is that we will test with only pinned versions in all workflows that run on `pull_request` and `push` triggers, then we will create pull requests (ideally via automation like dependabot) that attempt to update the pinned versions. This will give us confidence that test regressions are _only_ due to the code changes in the pull request and not due to a dependency changing. Workflows will also be more reproducible as the versions they fetch will come from source code and not an external, time-dependent source. 
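As a rough illustration of what the pins buy us, a hypothetical helper (not part of this change) could assert that the installed IREE packages actually match the pinned versions, assuming the `package==version` lines used in requirements-iree-pinned.txt:

```python
# Illustrative sketch only: verify installed IREE packages against the pins.
from importlib import metadata
from pathlib import Path

def check_pins(requirements_path: str = "requirements-iree-pinned.txt") -> None:
    for line in Path(requirements_path).read_text().splitlines():
        line = line.strip()
        # Skip blanks, comments, and pip options such as --pre / --find-links.
        if not line or line.startswith(("#", "--")):
            continue
        name, _, pinned = line.partition("==")
        installed = metadata.version(name)
        assert installed == pinned, f"{name}: installed {installed}, pinned {pinned}"

check_pins()
```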
--- .github/workflows/ci-llama-quick-tests.yaml | 15 ++---- .github/workflows/ci-shark-ai.yml | 16 ++----- .github/workflows/ci-sharktank.yml | 52 +++++++++------------ .github/workflows/ci_eval_short.yaml | 15 ++---- requirements-iree-pinned.txt | 8 ++-- requirements-iree-unpinned.txt | 4 +- sharktank/requirements-tests.txt | 3 +- 7 files changed, 42 insertions(+), 71 deletions(-) diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 7ad153924..d41f3bc23 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -55,18 +55,13 @@ jobs: python -m pip install --no-compile --upgrade pip # Note: We install in three steps in order to satisfy requirements - # from non default locations first. Installing the PyTorch CPU - # wheels saves multiple minutes and a lot of bandwidth on runner setup. + # from non default locations first. pip install --no-compile -r pytorch-cpu-requirements.txt + pip install -r requirements-iree-pinned.txt + pip install --no-compile \ + -r sharktank/requirements-tests.txt \ + -e sharktank/ - # Install nightly IREE packages. - # We could also pin to a known working or stable version. - pip install -f https://iree.dev/pip-release-links.html --pre --upgrade \ - iree-base-compiler \ - iree-base-runtime \ - iree-turbine - - pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ pip freeze - name: Run llama 8b f16 decomposed test diff --git a/.github/workflows/ci-shark-ai.yml b/.github/workflows/ci-shark-ai.yml index 211fea4ba..3957b6d11 100644 --- a/.github/workflows/ci-shark-ai.yml +++ b/.github/workflows/ci-shark-ai.yml @@ -51,18 +51,12 @@ jobs: python -m pip install --no-compile --upgrade pip # Note: We install in three steps in order to satisfy requirements - # from non default locations first. Installing the PyTorch CPU - # wheels saves multiple minutes and a lot of bandwidth on runner setup. + # from non default locations first. pip install --no-compile -r pytorch-cpu-requirements.txt - - # Install nightly IREE packages. - # We could also pin to a known working or stable version. - pip install -f https://iree.dev/pip-release-links.html --pre --upgrade \ - iree-base-compiler \ - iree-base-runtime \ - iree-turbine - - pip install --no-compile -r requirements.txt -e sharktank/ shortfin/ + pip install -r requirements-iree-pinned.txt + pip install --no-compile \ + -r requirements.txt \ + -e sharktank/ shortfin/ pip freeze diff --git a/.github/workflows/ci-sharktank.yml b/.github/workflows/ci-sharktank.yml index 4cdd2b274..690f08d3e 100644 --- a/.github/workflows/ci-sharktank.yml +++ b/.github/workflows/ci-sharktank.yml @@ -72,18 +72,15 @@ jobs: python -m pip install --no-compile --upgrade pip # Note: We install in three steps in order to satisfy requirements - # from non default locations first. Installing the PyTorch CPU - # wheels saves multiple minutes and a lot of bandwidth on runner setup. - pip install --no-compile --index-url https://download.pytorch.org/whl/cpu torch==${{matrix.torch-version}}+cpu + # from non default locations first. + pip install --no-compile \ + --index-url https://download.pytorch.org/whl/cpu torch==${{matrix.torch-version}}+cpu + pip install -r requirements-iree-pinned.txt + pip install --no-compile \ + -r sharktank/requirements-tests.txt \ + -e sharktank/ - # Install nightly IREE packages. - # We could also pin to a known working or stable version. 
- pip install -f https://iree.dev/pip-release-links.html --pre \ - iree-base-compiler \ - iree-base-runtime \ - iree-turbine - - pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ + pip freeze - name: Run sharktank tests if: ${{ !cancelled() }} @@ -121,19 +118,16 @@ jobs: run: | source ${VENV_DIR}/bin/activate python -m pip install --no-compile --upgrade pip + # Note: We install in three steps in order to satisfy requirements - # from non default locations first. Installing the PyTorch CPU - # wheels saves multiple minutes and a lot of bandwidth on runner setup. + # from non default locations first. pip install --no-compile -r pytorch-cpu-requirements.txt + pip install -r requirements-iree-pinned.txt + pip install --no-compile \ + -r sharktank/requirements-tests.txt \ + -e sharktank/ - # Install nightly IREE packages. - # We could also pin to a known working or stable version. - pip install -f https://iree.dev/pip-release-links.html --pre --upgrade \ - iree-base-compiler \ - iree-base-runtime \ - iree-turbine - - pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ + pip freeze - name: Run tests # TODO: unify with-*-data flags into a single flag and make it possible to run @@ -180,18 +174,14 @@ jobs: python -m pip install --no-compile --upgrade pip # Note: We install in three steps in order to satisfy requirements - # from non default locations first. Installing the PyTorch CPU - # wheels saves multiple minutes and a lot of bandwidth on runner setup. + # from non default locations first. pip install --no-compile -r pytorch-cpu-requirements.txt + pip install -r requirements-iree-pinned.txt + pip install --no-compile \ + -r sharktank/requirements-tests.txt \ + -e sharktank/ - # Install nightly IREE packages. - # We could also pin to a known working or stable version. - pip install -f https://iree.dev/pip-release-links.html --pre \ - iree-base-compiler \ - iree-base-runtime \ - iree-turbine - - pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ + pip freeze - name: Run punet tests run: | diff --git a/.github/workflows/ci_eval_short.yaml b/.github/workflows/ci_eval_short.yaml index 05c7fa415..a02d7bf14 100644 --- a/.github/workflows/ci_eval_short.yaml +++ b/.github/workflows/ci_eval_short.yaml @@ -52,18 +52,13 @@ jobs: python -m pip install --no-compile --upgrade pip # Note: We install in three steps in order to satisfy requirements - # from non default locations first. Installing the PyTorch CPU - # wheels saves multiple minutes and a lot of bandwidth on runner setup. + # from non default locations first. pip install --no-compile -r pytorch-cpu-requirements.txt + pip install -r requirements-iree-pinned.txt + pip install --no-compile \ + -r sharktank/requirements-tests.txt \ + -e sharktank/ - # Install nightly IREE packages. - # We could also pin to a known working or stable version. 
- pip install -f https://iree.dev/pip-release-links.html --pre --upgrade \ - iree-base-compiler \ - iree-base-runtime \ - iree-turbine - - pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ pip freeze - name: Run perplexity test with vmfb diff --git a/requirements-iree-pinned.txt b/requirements-iree-pinned.txt index 11d7e4258..6895fa7d4 100644 --- a/requirements-iree-pinned.txt +++ b/requirements-iree-pinned.txt @@ -3,8 +3,6 @@ # Keep these versions synced with SHORTFIN_IREE_GIT_TAG in shortfin/CMakeLists.txt --pre --find-links https://iree.dev/pip-release-links.html -iree-base-compiler==3.1.0rc20241206 -iree-base-runtime==3.1.0rc20241206 - -# TODO(#760): include iree-turbine in this requirements file too? -# iree-turbine==3.1.0rc20241206 +iree-base-compiler==3.1.0rc20250107 +iree-base-runtime==3.1.0rc20250107 +iree-turbine==3.1.0rc20250107 diff --git a/requirements-iree-unpinned.txt b/requirements-iree-unpinned.txt index 09d4688dd..878541a40 100644 --- a/requirements-iree-unpinned.txt +++ b/requirements-iree-unpinned.txt @@ -4,6 +4,4 @@ --find-links https://iree.dev/pip-release-links.html iree-base-compiler iree-base-runtime - -# TODO(#760): include iree-turbine in this requirements file too? -# iree-turbine +iree-turbine diff --git a/sharktank/requirements-tests.txt b/sharktank/requirements-tests.txt index a0ddf6117..e42d68c8c 100644 --- a/sharktank/requirements-tests.txt +++ b/sharktank/requirements-tests.txt @@ -1,5 +1,6 @@ datasets==3.0.0 +diffusers parameterized pytest==8.0.0 pytest-html -diffusers +pytest-xdist==3.5.0 From 12446b3c00f541850bf0217596d044035fccea17 Mon Sep 17 00:00:00 2001 From: IanNod <45800100+IanNod@users.noreply.github.com> Date: Tue, 7 Jan 2025 15:55:06 -0800 Subject: [PATCH 10/35] Mark sharktank vae iree test as xfail (#775) Xfails vae until iree runtime issue on the CI machine is fixed. Resolves sharktank data-dependent tests --- sharktank/tests/models/vae/vae_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sharktank/tests/models/vae/vae_test.py b/sharktank/tests/models/vae/vae_test.py index 99454f6cf..9b77d835e 100644 --- a/sharktank/tests/models/vae/vae_test.py +++ b/sharktank/tests/models/vae/vae_test.py @@ -100,6 +100,9 @@ def testCompareF16EagerVsHuggingface(self): torch.testing.assert_close(ref_results, results) + @pytest.mark.xfail( + reason="Waiting on fix for https://github.com/iree-org/iree/issues/19623" + ) def testVaeIreeVsHuggingFace(self): dtype = getattr(torch, "float32") inputs = get_random_inputs(dtype=dtype, device="cpu", bs=1) From 04b18192dd60d29f8c8822f4cfe4e7e26267b73c Mon Sep 17 00:00:00 2001 From: Max191 <44243577+Max191@users.noreply.github.com> Date: Tue, 7 Jan 2025 20:22:50 -0500 Subject: [PATCH 11/35] [tuner] Tweak tile and fuse constraints and test to reduce run time (#779) The `test_generate_tile_and_fuse_constraints*` tests have a long run time due to the complex constraint problem they are solving. This PR reduces the complexity of the problem to improve the run time of the test. 
Signed-off-by: Max Dawkins --- tuner/tuner/candidate_gen.py | 1 + tuner/tuner/dispatch_constraints.py | 37 +++++++------- tuner/tuner/dispatch_constraints_test.py | 64 ++++++++++++------------ 3 files changed, 50 insertions(+), 52 deletions(-) diff --git a/tuner/tuner/candidate_gen.py b/tuner/tuner/candidate_gen.py index ff7019ee0..d7b2da6f6 100644 --- a/tuner/tuner/candidate_gen.py +++ b/tuner/tuner/candidate_gen.py @@ -372,6 +372,7 @@ def main(): tuner_ctx, args.limit, args.num_subgroups, + iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse, ) for candidate_num, spec in enumerate(specs): spec_dir = Path(args.output) diff --git a/tuner/tuner/dispatch_constraints.py b/tuner/tuner/dispatch_constraints.py index 50a36d02f..c20325249 100644 --- a/tuner/tuner/dispatch_constraints.py +++ b/tuner/tuner/dispatch_constraints.py @@ -169,15 +169,14 @@ def generate_tile_and_fuse_constraints( m_tiles, n_tiles, k_tiles, subgroup_m_tiles, subgroup_n_tiles = tile_sizes intrinsic_mn, intrinsic_k = intrinsic_size wg_x, wg_y, wg_z = workgroup_size - wg_threads = z3.Int("wg_threads") - constraints = [wg_x == wg_threads, wg_y == 1, wg_z == 1] + wg_threads = wg_x + constraints = [wg_y == 1, wg_z == 1] constraints += [subgroup_size == 64, wg_threads <= 1024] constraints += [ get_mfma_intrinsic_constraints( problem_size, intrinsic_mn, intrinsic_mn, intrinsic_k, mma_intrinsics ) ] - subgroup_k_count = 1 constraints += [ m_tiles[-1] >= intrinsic_mn, @@ -192,9 +191,9 @@ def generate_tile_and_fuse_constraints( constraints += [m_shape % m == 0 for m, m_shape in zip(m_tiles, M)] constraints += [n_shape % n == 0 for n, n_shape in zip(n_tiles, N)] constraints += [k_shape % k == 0 for k, k_shape in zip(k_tiles[:-1], K[:-1])] - constraints += [m >= 0 for m in m_tiles] - constraints += [n >= 0 for n in n_tiles] - constraints += [k >= 0 for k in k_tiles] + constraints += [m >= 1 for m in m_tiles] + constraints += [n >= 1 for n in n_tiles] + constraints += [k >= 1 for k in k_tiles] constraints += [K[-1] % (k_tiles[-1] * intrinsic_k) == 0] constraints += [m <= m_shape for m, m_shape in zip(m_tiles, M)] constraints += [n <= n_shape for n, n_shape in zip(n_tiles, N)] @@ -203,29 +202,27 @@ def generate_tile_and_fuse_constraints( for x in (subgroup_m_count, subgroup_n_count): constraints += [x >= 1, x <= 32] - subgroup_m_tile_count = z3.Int("sg_m_tcnt") - subgroup_n_tile_count = z3.Int("sg_n_tcnt") - subgroup_k_tile_count = z3.Int("sg_k_tcnt") - for x in (subgroup_m_tile_count, subgroup_n_tile_count, subgroup_k_tile_count): - constraints += [x >= 1, x <= 32] - constraints += [math.prod(subgroup_m_tiles) == subgroup_m_tile_count] - constraints += [math.prod(subgroup_n_tiles) == subgroup_n_tile_count] constraints += [ - m % m_subgroup == 0 for m, m_subgroup in zip(m_tiles, subgroup_m_tiles) + m % m_subgroup == 0 + for m, m_subgroup in zip(m_tiles[:-1], subgroup_m_tiles[:-1]) ] constraints += [ - n % n_subgroup == 0 for n, n_subgroup in zip(n_tiles, subgroup_n_tiles) + n % n_subgroup == 0 + for n, n_subgroup in zip(n_tiles[:-1], subgroup_n_tiles[:-1]) ] - constraints += [m_subgroup > 0 for m_subgroup in subgroup_m_tiles] - constraints += [n_subgroup > 0 for n_subgroup in subgroup_n_tiles] + constraints += [m_tiles[-1] % (subgroup_m_tiles[-1] * intrinsic_mn) == 0] + constraints += [n_tiles[-1] % (subgroup_n_tiles[-1] * intrinsic_mn) == 0] + constraints += [m_subgroup >= 1 for m_subgroup in subgroup_m_tiles] + constraints += [n_subgroup >= 1 for n_subgroup in subgroup_n_tiles] constraints += [ - math.prod(m_tiles) == 
subgroup_m_count * subgroup_m_tile_count * intrinsic_mn + math.prod(m_tiles) + == math.prod(subgroup_m_tiles) * subgroup_m_count * intrinsic_mn ] constraints += [ - math.prod(n_tiles) == subgroup_n_count * subgroup_n_tile_count * intrinsic_mn + math.prod(n_tiles) + == math.prod(subgroup_n_tiles) * subgroup_n_count * intrinsic_mn ] - constraints += [math.prod(k_tiles) == subgroup_k_count * subgroup_k_tile_count] subgroups = subgroup_m_count * subgroup_n_count if num_subgroups > 0: constraints += [subgroups == num_subgroups] diff --git a/tuner/tuner/dispatch_constraints_test.py b/tuner/tuner/dispatch_constraints_test.py index d31a76e90..1116adac3 100644 --- a/tuner/tuner/dispatch_constraints_test.py +++ b/tuner/tuner/dispatch_constraints_test.py @@ -124,20 +124,20 @@ def test_generate_tile_and_fuse_constraints_valid_input( tuner_ctx: common.TunerContext, ) -> None: matmul_size = common.ContractionSizes( - M=[4, 32], - N=[6, 64], - K=[8, 128], - B=[2, 16], + M=[32], + N=[64], + K=[128], + B=[2], ) contraction_dims = common.ContractionDimensions( - m=[1, 5], - n=[2, 6], - k=[3, 7], - batch=[0, 4], + m=[1], + n=[2], + k=[3], + batch=[0], ) - lhs_type = common.ShapedType([2, 4, 8, 16, 32, 128], tuner_ctx.type.f16) - rhs_type = common.ShapedType([2, 6, 8, 16, 64, 128], tuner_ctx.type.f16) - res_type = common.ShapedType([2, 4, 6, 16, 32, 64], tuner_ctx.type.f32) + lhs_type = common.ShapedType([2, 32, 128], tuner_ctx.type.f16) + rhs_type = common.ShapedType([2, 64, 128], tuner_ctx.type.f16) + res_type = common.ShapedType([2, 32, 64], tuner_ctx.type.f32) problem_size = common.ProblemSize( matmul_size, lhs_type, @@ -148,13 +148,13 @@ def test_generate_tile_and_fuse_constraints_valid_input( ) # Define input parameters as z3 Ints m, n, k = ( - [z3.Int("m0"), z3.Int("m1")], - [z3.Int("n0"), z3.Int("n1")], - [z3.Int("k0"), z3.Int("k1")], + [z3.Int("m0")], + [z3.Int("n0")], + [z3.Int("k0")], ) subgroup_m, subgroup_n = ( - [z3.Int("subgroup_m0"), z3.Int("subgroup_m1")], - [z3.Int("subgroup_n0"), z3.Int("subgroup_n1")], + [z3.Int("subgroup_m0")], + [z3.Int("subgroup_n0")], ) subgroup_size = z3.Int("subgroup_size") intrinsic_mn = z3.Int("intrinsic_mn") @@ -198,20 +198,20 @@ def test_generate_tile_and_fuse_constraints_invalid_input( ) -> None: # Define input parameters that should lead to unsatisfiable constraints matmul_size = common.ContractionSizes( - M=[4, 32], - N=[6, 64], - K=[8, 128], - B=[2, 16], + M=[32], + N=[64], + K=[128], + B=[2], ) contraction_dims = common.ContractionDimensions( - m=[1, 5], - n=[2, 6], - k=[3, 7], - batch=[0, 4], + m=[1], + n=[2], + k=[3], + batch=[0], ) - lhs_type = common.ShapedType([2, 4, 8, 16, 32, 128], tuner_ctx.type.f16) - rhs_type = common.ShapedType([2, 6, 8, 16, 64, 128], tuner_ctx.type.f16) - res_type = common.ShapedType([2, 4, 6, 16, 32, 64], tuner_ctx.type.f32) + lhs_type = common.ShapedType([2, 32, 128], tuner_ctx.type.f16) + rhs_type = common.ShapedType([2, 64, 128], tuner_ctx.type.f16) + res_type = common.ShapedType([2, 32, 64], tuner_ctx.type.f32) problem_size = common.ProblemSize( matmul_size, lhs_type, @@ -222,13 +222,13 @@ def test_generate_tile_and_fuse_constraints_invalid_input( ) # Define input parameters as z3 Ints m, n, k = ( - [z3.Int("m0"), z3.Int("m1")], - [z3.Int("n0"), z3.Int("n1")], - [z3.Int("k0"), z3.Int("k1")], + [z3.Int("m0")], + [z3.Int("n0")], + [z3.Int("k0")], ) subgroup_m, subgroup_n = ( - [z3.Int("subgroup_m0"), z3.Int("subgroup_m1")], - [z3.Int("subgroup_n0"), z3.Int("subgroup_n1")], + [z3.Int("subgroup_m0")], + [z3.Int("subgroup_n0")], 
) subgroup_size = z3.Int("subgroup_size") intrinsic_mn = z3.Int("intrinsic_mn") From 64dfcb25d563d9ced8b1ec14fe1a440d395baa26 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Tue, 7 Jan 2025 21:10:53 -0500 Subject: [PATCH 12/35] [Tuner] Clean up sample tuner (#781) Rename it from 'test' to 'simple' to avoid mistaking it for a test: https://github.com/nod-ai/shark-ai/actions/runs/12659028888/job/35277223146#step:6:26 . Also: * Update and improve the README (account for the directory structure) * Make flag naming consistent * Handle previously missing `--stop-after` phases * Add git ignore for temporary files --- tuner/examples/simple/.gitignore | 1 + tuner/examples/{test => simple}/README.md | 26 ++++++++------- tuner/examples/{test => simple}/__init__.py | 0 tuner/examples/{test => simple}/__main__.py | 4 +-- .../examples/{test => simple}/double_mmt.mlir | 0 .../tuner_test.py => simple/simple_tuner.py} | 33 ++++++++++--------- 6 files changed, 36 insertions(+), 28 deletions(-) create mode 100644 tuner/examples/simple/.gitignore rename tuner/examples/{test => simple}/README.md (54%) rename tuner/examples/{test => simple}/__init__.py (100%) rename tuner/examples/{test => simple}/__main__.py (83%) rename tuner/examples/{test => simple}/double_mmt.mlir (100%) rename tuner/examples/{test/tuner_test.py => simple/simple_tuner.py} (83%) diff --git a/tuner/examples/simple/.gitignore b/tuner/examples/simple/.gitignore new file mode 100644 index 000000000..a9a5aecf4 --- /dev/null +++ b/tuner/examples/simple/.gitignore @@ -0,0 +1 @@ +tmp diff --git a/tuner/examples/test/README.md b/tuner/examples/simple/README.md similarity index 54% rename from tuner/examples/test/README.md rename to tuner/examples/simple/README.md index 850a161da..b1aeb12e1 100644 --- a/tuner/examples/test/README.md +++ b/tuner/examples/simple/README.md @@ -1,6 +1,6 @@ -# Example Tuner Test +# Simple Example Tuner -Example of tuning a dispatch and full model. +Example of tuning a dispatch and a full model. ## Environments Follow instructions in [`/tuner/README.md`](../README.md) @@ -15,27 +15,31 @@ Use the usual `iree-compile` command for your model, add `--iree-hal-dump-executable-files-to=dump --iree-config-add-tuner-attributes`, and get the dispatch benchmark that you want to tune. 
For example: ```shell +mkdir tmp iree-compile double_mmt.mlir --iree-hal-target-backends=rocm \ - --iree-hip-target=gfx942 --iree-hal-dump-executable-files-to=dump \ + --iree-hip-target=gfx942 --iree-hal-dump-executable-files-to=tmp/dump \ --iree-config-add-tuner-attributes -o /dev/null -cp dump/module_main_dispatch_0_rocm_hsaco_fb_benchmark.mlir mmt_benchmark.mlir +cp tmp/dump/module_main_dispatch_0_rocm_hsaco_fb_benchmark.mlir tmp/mmt_benchmark.mlir ``` ### Recommended Trial Run For an initial trial to test the tuning loop, use: ```shell -python -m examples.test double_mmt.mlir mmt_benchmark.mlir \ - --test_num_dispatch_candidates=5 --test_num_model_candidates=3 \ - --num-candidates=30 +cd ../../ +python -m examples.simple examples/simple/double_mmt.mlir \ + examples/simple/tmp/mmt_benchmark.mlir \ + --devices=hip://0 --num-candidates=30 \ + --simple-num-dispatch-candidates=5 --simple-num-model-candidates=3 \ ``` ### Basic Usage ```shell -python -m examples.test \ - --test_num_dispatch_candidates= \ - --test_num_model_candidates= \ - --test_hip_target= \ +python -m examples.simple \ + --devices=hip://0 --num-candidates=1024 \ + --test-num-dispatch-candidates= \ + --test-num-model-candidates= \ + --test-hip-target= \ --num-candidates= \ --codegen-pipeline= ``` diff --git a/tuner/examples/test/__init__.py b/tuner/examples/simple/__init__.py similarity index 100% rename from tuner/examples/test/__init__.py rename to tuner/examples/simple/__init__.py diff --git a/tuner/examples/test/__main__.py b/tuner/examples/simple/__main__.py similarity index 83% rename from tuner/examples/test/__main__.py rename to tuner/examples/simple/__main__.py index 4f426e110..7c02bb457 100644 --- a/tuner/examples/test/__main__.py +++ b/tuner/examples/simple/__main__.py @@ -4,6 +4,6 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from . import tuner_test +from . 
import simple_tuner -tuner_test.main() +simple_tuner.main() diff --git a/tuner/examples/test/double_mmt.mlir b/tuner/examples/simple/double_mmt.mlir similarity index 100% rename from tuner/examples/test/double_mmt.mlir rename to tuner/examples/simple/double_mmt.mlir diff --git a/tuner/examples/test/tuner_test.py b/tuner/examples/simple/simple_tuner.py similarity index 83% rename from tuner/examples/test/tuner_test.py rename to tuner/examples/simple/simple_tuner.py index 22a0d2f4d..d78ec5b53 100644 --- a/tuner/examples/test/tuner_test.py +++ b/tuner/examples/simple/simple_tuner.py @@ -65,22 +65,22 @@ def main(): parser = argparse.ArgumentParser(description="Autotune test script") test_args = parser.add_argument_group("Example Test Options") test_args.add_argument( - "test_model_file", type=Path, help="Path to the model file to tune (.mlir)" + "simple_model_file", type=Path, help="Path to the model file to tune (.mlir)" ) test_args.add_argument( - "--test_num_dispatch_candidates", + "--simple-num-dispatch-candidates", type=int, default=None, help="Number of dispatch candidates to keep for model benchmarks.", ) test_args.add_argument( - "--test_num_model_candidates", + "--simple-num-model-candidates", type=int, default=None, help="Number of model candidates to produce after tuning.", ) test_args.add_argument( - "--test_hip_target", + "--simple-hip-target", type=str, default="gfx942", help="Hip target for tuning.", @@ -98,42 +98,43 @@ def main(): libtuner.setup_logging(args, path_config) print(path_config.run_log, end="\n\n") - # TODO(Max191): Some bug seems to be causing OOM errors in benchmarking - # when device validation happens, so this is commented for now. Uncomment - # when the bug is fixed. if not args.dry_run: print("Validating devices") libtuner.validate_devices(args.devices) print("Validation successful!\n") - print("Generating candidates...") + print("Generating candidate tuning specs...") test_tuner = TestTuner() candidates = libtuner.generate_candidate_specs( args, path_config, candidate_trackers, test_tuner ) - print(f"Stored candidate specs in {path_config.specs_dir}\n") + print(f"Stored candidate tuning specs in {path_config.specs_dir}\n") if stop_after_phase == libtuner.ExecutionPhases.generate_candidates: return - print("Compiling candidates...") + print("Compiling dispatch candidates...") compiled_candidates = libtuner.compile( args, path_config, candidates, candidate_trackers, test_tuner ) + if stop_after_phase == libtuner.ExecutionPhases.compile_dispatches: + return - print("Benchmarking compiled candidates...") + print("Benchmarking compiled dispatch candidates...") top_candidates = libtuner.benchmark( args, path_config, compiled_candidates, candidate_trackers, test_tuner, - args.test_num_dispatch_candidates, + args.simple_num_dispatch_candidates, ) + if stop_after_phase == libtuner.ExecutionPhases.benchmark_dispatches: + return print("Compiling models with top candidates...") test_tuner.compile_flags = [ "--iree-hal-target-backends=rocm", - f"--iree-hip-target={args.test_hip_target}", + f"--iree-hip-target={args.simple_hip_target}", ] compiled_model_candidates = libtuner.compile( args, @@ -141,8 +142,10 @@ def main(): top_candidates, candidate_trackers, test_tuner, - args.test_model_file, + args.simple_model_file, ) + if stop_after_phase == libtuner.ExecutionPhases.compile_models: + return print("Benchmarking compiled model candidates...") test_tuner.benchmark_flags = [ @@ -156,7 +159,7 @@ def main(): compiled_model_candidates, candidate_trackers, test_tuner, - 
args.test_num_model_candidates, + args.simple_num_model_candidates, ) print(f"Top model candidates: {top_model_candidates}") From ed7906f0517425580031bcfdad8b09814a1e5ba7 Mon Sep 17 00:00:00 2001 From: Bangtian Liu Date: Wed, 8 Jan 2025 00:41:14 -0500 Subject: [PATCH 13/35] [Tuner] Fix context management (#770) This PR is about addressing the MLIR context management issue in the tuner detailed in https://github.com/nod-ai/shark-ai/issues/764. Although this is a work in progress, I am sending it to gather feedback and ensure I am heading in the right direction. --------- Signed-off-by: Bangtian Liu --- tuner/examples/simple/simple_tuner.py | 130 ++++++++++++----------- tuner/tuner/candidate_gen_test.py | 6 +- tuner/tuner/common.py | 23 ++-- tuner/tuner/common_test.py | 6 +- tuner/tuner/dispatch_constraints_test.py | 6 +- tuner/tuner/dispatch_parser_test.py | 6 +- tuner/tuner/libtuner.py | 26 ++--- tuner/tuner/op_matchers.py | 2 +- 8 files changed, 107 insertions(+), 98 deletions(-) diff --git a/tuner/examples/simple/simple_tuner.py b/tuner/examples/simple/simple_tuner.py index d78ec5b53..63421fdfe 100644 --- a/tuner/examples/simple/simple_tuner.py +++ b/tuner/examples/simple/simple_tuner.py @@ -7,11 +7,12 @@ import argparse from pathlib import Path from tuner import libtuner +from tuner.common import * class TestTuner(libtuner.TuningClient): - def __init__(self): - super().__init__() + def __init__(self, tuner_context: libtuner.TunerContext): + super().__init__(tuner_context) self.compile_flags = ["--compile-from=executable-sources"] self.benchmark_flags = ["--benchmark_repetitions=3", "--input=1"] @@ -104,68 +105,69 @@ def main(): print("Validation successful!\n") print("Generating candidate tuning specs...") - test_tuner = TestTuner() - candidates = libtuner.generate_candidate_specs( - args, path_config, candidate_trackers, test_tuner - ) - print(f"Stored candidate tuning specs in {path_config.specs_dir}\n") - if stop_after_phase == libtuner.ExecutionPhases.generate_candidates: - return - - print("Compiling dispatch candidates...") - compiled_candidates = libtuner.compile( - args, path_config, candidates, candidate_trackers, test_tuner - ) - if stop_after_phase == libtuner.ExecutionPhases.compile_dispatches: - return - - print("Benchmarking compiled dispatch candidates...") - top_candidates = libtuner.benchmark( - args, - path_config, - compiled_candidates, - candidate_trackers, - test_tuner, - args.simple_num_dispatch_candidates, - ) - if stop_after_phase == libtuner.ExecutionPhases.benchmark_dispatches: - return - - print("Compiling models with top candidates...") - test_tuner.compile_flags = [ - "--iree-hal-target-backends=rocm", - f"--iree-hip-target={args.simple_hip_target}", - ] - compiled_model_candidates = libtuner.compile( - args, - path_config, - top_candidates, - candidate_trackers, - test_tuner, - args.simple_model_file, - ) - if stop_after_phase == libtuner.ExecutionPhases.compile_models: - return - - print("Benchmarking compiled model candidates...") - test_tuner.benchmark_flags = [ - "--benchmark_repetitions=3", - "--input=2048x2048xf16", - "--input=2048x2048xf16", - ] - top_model_candidates = libtuner.benchmark( - args, - path_config, - compiled_model_candidates, - candidate_trackers, - test_tuner, - args.simple_num_model_candidates, - ) - - print(f"Top model candidates: {top_model_candidates}") - - print("Check the detailed execution logs in:") - print(path_config.run_log.resolve()) + with TunerContext() as tuner_context: + test_tuner = TestTuner(tuner_context) + 
candidates = libtuner.generate_candidate_specs( + args, path_config, candidate_trackers, test_tuner + ) + print(f"Stored candidate tuning specs in {path_config.specs_dir}\n") + if stop_after_phase == libtuner.ExecutionPhases.generate_candidates: + return + + print("Compiling dispatch candidates...") + compiled_candidates = libtuner.compile( + args, path_config, candidates, candidate_trackers, test_tuner + ) + if stop_after_phase == libtuner.ExecutionPhases.compile_dispatches: + return + + print("Benchmarking compiled dispatch candidates...") + top_candidates = libtuner.benchmark( + args, + path_config, + compiled_candidates, + candidate_trackers, + test_tuner, + args.simple_num_dispatch_candidates, + ) + if stop_after_phase == libtuner.ExecutionPhases.benchmark_dispatches: + return + + print("Compiling models with top candidates...") + test_tuner.compile_flags = [ + "--iree-hal-target-backends=rocm", + f"--iree-hip-target={args.simple_hip_target}", + ] + compiled_model_candidates = libtuner.compile( + args, + path_config, + top_candidates, + candidate_trackers, + test_tuner, + args.simple_model_file, + ) + if stop_after_phase == libtuner.ExecutionPhases.compile_models: + return + + print("Benchmarking compiled model candidates...") + test_tuner.benchmark_flags = [ + "--benchmark_repetitions=3", + "--input=2048x2048xf16", + "--input=2048x2048xf16", + ] + top_model_candidates = libtuner.benchmark( + args, + path_config, + compiled_model_candidates, + candidate_trackers, + test_tuner, + args.simple_num_model_candidates, + ) + + print(f"Top model candidates: {top_model_candidates}") + + print("Check the detailed execution logs in:") + print(path_config.run_log.resolve()) for candidate in candidate_trackers: libtuner.logging.debug(candidate) diff --git a/tuner/tuner/candidate_gen_test.py b/tuner/tuner/candidate_gen_test.py index 8b0ca58d3..6a62e90e4 100644 --- a/tuner/tuner/candidate_gen_test.py +++ b/tuner/tuner/candidate_gen_test.py @@ -27,9 +27,9 @@ def tuner_ctx() -> Generator[common.TunerContext, None, None]: from logging import Logger from unittest.mock import MagicMock - with ir.Context() as ctx: - logger: Logger = MagicMock(spec=Logger) - yield common.TunerContext(ctx, logger) + mock_logger = MagicMock(spec=Logger) + with common.TunerContext(logger=mock_logger) as ctx: + yield ctx def test_get_td_spec_contraction(tuner_ctx: common.TunerContext) -> None: diff --git a/tuner/tuner/common.py b/tuner/tuner/common.py index 45bcb0d75..8efac1653 100644 --- a/tuner/tuner/common.py +++ b/tuner/tuner/common.py @@ -4,17 +4,16 @@ # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import re import logging from dataclasses import astuple, dataclass, field from enum import Enum +from types import TracebackType from typing import Optional from typing import Any from iree.compiler import ir # type: ignore from iree.compiler.dialects import iree_gpu # type: ignore -from iree.compiler.dialects import iree_codegen # type: ignore class CommonTypes: @@ -38,10 +37,22 @@ def getI64(self, value: int) -> ir.IntegerAttr: class TunerContext: - def __init__(self, mlir_ctx: ir.Context, logger: logging.Logger): - self.mlir_ctx: ir.Context = mlir_ctx - self.logger: logging.Logger = logger - self.type: CommonTypes = CommonTypes(mlir_ctx) + def __init__(self, logger: Optional[logging.Logger] = None): + self.mlir_ctx: ir.Context = ir.Context() + self.logger: logging.Logger = logger or logging.getLogger("tune") + self.type: CommonTypes = CommonTypes(self.mlir_ctx) + + def __enter__(self) -> "TunerContext": + self.mlir_ctx.__enter__() + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> bool: + return self.mlir_ctx.__exit__(exc_type, exc_value, traceback) class DispatchKind(Enum): diff --git a/tuner/tuner/common_test.py b/tuner/tuner/common_test.py index eba5b35e1..a6c71026d 100644 --- a/tuner/tuner/common_test.py +++ b/tuner/tuner/common_test.py @@ -23,9 +23,9 @@ def tuner_ctx() -> Generator[common.TunerContext, None, None]: from logging import Logger from unittest.mock import MagicMock - with ir.Context() as ctx: - logger: Logger = MagicMock(spec=Logger) - yield common.TunerContext(ctx, logger) + mock_logger = MagicMock(spec=Logger) + with common.TunerContext(logger=mock_logger) as ctx: + yield ctx @pytest.fixture diff --git a/tuner/tuner/dispatch_constraints_test.py b/tuner/tuner/dispatch_constraints_test.py index 1116adac3..9a34e41db 100644 --- a/tuner/tuner/dispatch_constraints_test.py +++ b/tuner/tuner/dispatch_constraints_test.py @@ -25,9 +25,9 @@ def tuner_ctx() -> Generator[common.TunerContext, None, None]: from logging import Logger from unittest.mock import MagicMock - with ir.Context() as ctx: - logger: Logger = MagicMock(spec=Logger) - yield common.TunerContext(ctx, logger) + mock_logger = MagicMock(spec=Logger) + with common.TunerContext(logger=mock_logger) as ctx: + yield ctx def test_generate_solutions(tuner_ctx: common.TunerContext) -> None: diff --git a/tuner/tuner/dispatch_parser_test.py b/tuner/tuner/dispatch_parser_test.py index 7ddb0bb84..204f84b28 100644 --- a/tuner/tuner/dispatch_parser_test.py +++ b/tuner/tuner/dispatch_parser_test.py @@ -27,9 +27,9 @@ def tuner_ctx() -> Generator[common.TunerContext, None, None]: from logging import Logger from unittest.mock import MagicMock - with ir.Context() as ctx: - logger: Logger = MagicMock(spec=Logger) - yield common.TunerContext(ctx, logger) + mock_logger = MagicMock(spec=Logger) + with common.TunerContext(logger=mock_logger) as ctx: + yield ctx CONTRACTION_TEMPLATE = r""" diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py index fab86c369..b18736ffb 100644 --- a/tuner/tuner/libtuner.py +++ b/tuner/tuner/libtuner.py @@ -18,7 +18,6 @@ import math import signal -import subprocess import sys import shutil import logging @@ -26,7 +25,6 @@ from datetime import datetime from enum import Enum from pathlib import Path -import time import multiprocessing import queue from tqdm import tqdm @@ -37,6 +35,7 @@ import iree.runtime as ireert # type: ignore import 
iree.compiler as ireec # type: ignore from iree.compiler import ir # type: ignore +from iree.compiler.dialects import iree_codegen # type: ignore from . import candidate_gen from . import dispatch_parser from .op_matchers import * @@ -103,10 +102,8 @@ def get_candidate_vmfb_filename(self, candidate_id: int) -> str: class TuningClient(ABC): - def __init__(self): - mlir_ctx = ir.Context() - logger = logging.getLogger("tune") - self.tuner_context = TunerContext(mlir_ctx, logger) + def __init__(self, tuner_context: TunerContext): + self.tuner_context = tuner_context @abstractmethod def get_iree_compile_flags(self) -> list[str]: @@ -644,15 +641,14 @@ def generate_candidate_specs( # source mlir. mlir_text = candidate_gen.strip_compilation_info(path_config.template_mlir) mlir_module = dispatch_parser.parse_mlir(mlir_text, tuning_client.tuner_context) - with tuning_client.tuner_context.mlir_ctx: - logging.debug("Captured messages from candidate_gen.py:") - config_specs: list[ir.Module] = candidate_gen.generate_configs_and_td_specs( - input_module=mlir_module, - tuner_context=tuning_client.tuner_context, - limit=args.num_candidates, - num_subgroups=args.num_subgroups, - codegen_pipeline=get_iree_codegen_pipeline(args.codegen_pipeline), - ) + logging.debug("Captured messages from candidate_gen.py:") + config_specs: list[ir.Module] = candidate_gen.generate_configs_and_td_specs( + input_module=mlir_module, + tuner_context=tuning_client.tuner_context, + limit=args.num_candidates, + num_subgroups=args.num_subgroups, + codegen_pipeline=get_iree_codegen_pipeline(args.codegen_pipeline), + ) logging.debug("candidate_gen.py ends") handle_error( condition=(len(config_specs) <= 1), msg="Failed to generate any candidates" diff --git a/tuner/tuner/op_matchers.py b/tuner/tuner/op_matchers.py index f3966b97d..09fcd17ea 100644 --- a/tuner/tuner/op_matchers.py +++ b/tuner/tuner/op_matchers.py @@ -125,7 +125,7 @@ def get_map_result_dim_positions(map: ir.AffineMap): class ContractionOpInterfaceMatcher(GenericOpMatcher): - def __init__(self): + def __init__(self) -> None: super().__init__() self.contraction_dimensions: Optional[ContractionDimensions] = None self.lhs_dims: Optional[list[int]] = None From 4a025d48ebe66940b3a77d8c2137f52b7a80cb2f Mon Sep 17 00:00:00 2001 From: Marius Brehler Date: Wed, 8 Jan 2025 17:00:46 +0100 Subject: [PATCH 14/35] Update pyenv to 2.5.0 and Python version (#784) Updates python to 2.5.0 and sets the Python version to 3.12.8. Furthermore, follows the recommendation in the v2.5.0 release notes to use `pyenv init - ` instead of `pyenv init -` as this should show better performance. 
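For reference, the resulting shell hook (as used in the developer guide and the ASan workflow below) is a sketch along these lines:

```bash
export PYENV_ROOT="$HOME/.pyenv"
command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"
# Passing the shell name explicitly is the form recommended in the pyenv 2.5.0
# release notes and skips the shell detection done by plain `pyenv init -`.
eval "$(pyenv init - bash)"
```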
--- .github/workflows/ci_linux_x64_asan-libshortfin.yml | 12 ++++++------ docs/developer_guide.md | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci_linux_x64_asan-libshortfin.yml b/.github/workflows/ci_linux_x64_asan-libshortfin.yml index 3f2cc0d41..121b236fd 100644 --- a/.github/workflows/ci_linux_x64_asan-libshortfin.yml +++ b/.github/workflows/ci_linux_x64_asan-libshortfin.yml @@ -32,8 +32,8 @@ concurrency: env: PYENV_ROOT: ${{ github.workspace }}/pyenv - PYENV_REF: 96b3fb2fc3bee85650cb22e2cb06c83c24509a6d # v2.4.17 - PYTHON_VER: 3.12.7 + PYENV_REF: 2aff0df76deacd9b6c49de8870f0255d24d3b31b # v2.5.0 + PYTHON_VER: 3.12.8 CACHE_ASAN_VER: 2 CACHE_DEPS_VER: 1 LIBSHORTFIN_DIR: ${{ github.workspace }}/shortfin/ @@ -76,7 +76,7 @@ jobs: working-directory: ${{ env.PYENV_ROOT }} run: | src/configure && make -C src - export PATH=${{ env.PYENV_ROOT }}/bin:$PATH && eval "$(pyenv init -)" + export PATH=${{ env.PYENV_ROOT }}/bin:$PATH && eval "$(pyenv init - bash)" CC=clang-18 CXX=clang++-18 LDFLAGS="-lstdc++" PYTHON_CONFIGURE_OPTS="--with-address-sanitizer" pyenv install -v ${{ env.PYTHON_VER }} pyenv global ${{ env.PYTHON_VER }} @@ -122,7 +122,7 @@ jobs: if: steps.cache-python-deps-restore.outputs.cache-hit != 'true' working-directory: ${{ env.LIBSHORTFIN_DIR }} run: | - eval "$(pyenv init -)" + eval "$(pyenv init - bash)" pip install -r requirements-tests.txt pip install -r ../requirements-iree-pinned.txt pip freeze @@ -138,7 +138,7 @@ jobs: - name: Build shortfin working-directory: ${{ env.LIBSHORTFIN_DIR }} run: | - eval "$(pyenv init -)" + eval "$(pyenv init - bash)" SHORTFIN_ENABLE_ASAN=ON \ SHORTFIN_DEV_MODE=ON \ SHORTFIN_RUN_CTESTS=ON \ @@ -151,5 +151,5 @@ jobs: ASAN_OPTIONS: detect_odr_violation=0 working-directory: ${{ env.LIBSHORTFIN_DIR }} run: | - eval "$(pyenv init -)" + eval "$(pyenv init - bash)" pytest -s --durations=10 --timeout=30 diff --git a/docs/developer_guide.md b/docs/developer_guide.md index 73aee61f7..c347b9dac 100644 --- a/docs/developer_guide.md +++ b/docs/developer_guide.md @@ -40,7 +40,7 @@ Then, make pyenv available by adding the below to your `~/.bashrc`: ```bash export PYENV_ROOT="$HOME/.pyenv" command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH" -eval "$(pyenv init -)" +eval "$(pyenv init - bash)" ``` Finally, install a pyenv-managed version of python From 3bef80f7cc67b761775f5d1eb19b1607d86b1c71 Mon Sep 17 00:00:00 2001 From: Max191 <44243577+Max191@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:10:49 -0500 Subject: [PATCH 15/35] [tuner] Clean up simple tuner example script (#785) This PR addresses the TODO in `tuner/examples/simple/simple_tuner.py` to remove the unused abstract function implementations in the tuner client. The PR also renames some variables to be more consistent with the name of the example. 
Signed-off-by: Max Dawkins --- tuner/examples/simple/simple_tuner.py | 67 +++++++-------------------- 1 file changed, 16 insertions(+), 51 deletions(-) diff --git a/tuner/examples/simple/simple_tuner.py b/tuner/examples/simple/simple_tuner.py index 63421fdfe..a7f718e9b 100644 --- a/tuner/examples/simple/simple_tuner.py +++ b/tuner/examples/simple/simple_tuner.py @@ -10,7 +10,7 @@ from tuner.common import * -class TestTuner(libtuner.TuningClient): +class SimpleTuner(libtuner.TuningClient): def __init__(self, tuner_context: libtuner.TunerContext): super().__init__(tuner_context) self.compile_flags = ["--compile-from=executable-sources"] @@ -25,62 +25,27 @@ def get_iree_benchmark_module_flags(self) -> list[str]: def get_benchmark_timeout_s(self) -> int: return 10 - # TODO(Max191): Remove the following unused abstract functions once they - # are removed from the TuningClient definition. - def get_dispatch_benchmark_timeout_s(self) -> int: - return 0 - - def get_dispatch_compile_timeout_s(self) -> int: - return 0 - - def get_dispatch_compile_command( - self, candidate_tracker: libtuner.CandidateTracker - ) -> list[str]: - return [] - - def get_dispatch_benchmark_command( - self, - candidate_tracker: libtuner.CandidateTracker, - ) -> list[str]: - return [] - - def get_model_compile_timeout_s(self) -> int: - return 0 - - def get_model_compile_command( - self, candidate_tracker: libtuner.CandidateTracker - ) -> list[str]: - return [] - - def get_model_benchmark_timeout_s(self) -> int: - return 0 - - def get_model_benchmark_command( - self, candidate_tracker: libtuner.CandidateTracker - ) -> list[str]: - return [] - def main(): - # Custom arguments for the test file. - parser = argparse.ArgumentParser(description="Autotune test script") - test_args = parser.add_argument_group("Example Test Options") - test_args.add_argument( + # Custom arguments for the example tuner file. 
+ parser = argparse.ArgumentParser(description="Autotune sample script") + client_args = parser.add_argument_group("Simple Example Tuner Options") + client_args.add_argument( "simple_model_file", type=Path, help="Path to the model file to tune (.mlir)" ) - test_args.add_argument( + client_args.add_argument( "--simple-num-dispatch-candidates", type=int, default=None, help="Number of dispatch candidates to keep for model benchmarks.", ) - test_args.add_argument( + client_args.add_argument( "--simple-num-model-candidates", type=int, default=None, help="Number of model candidates to produce after tuning.", ) - test_args.add_argument( + client_args.add_argument( "--simple-hip-target", type=str, default="gfx942", @@ -106,9 +71,9 @@ def main(): print("Generating candidate tuning specs...") with TunerContext() as tuner_context: - test_tuner = TestTuner(tuner_context) + simple_tuner = SimpleTuner(tuner_context) candidates = libtuner.generate_candidate_specs( - args, path_config, candidate_trackers, test_tuner + args, path_config, candidate_trackers, simple_tuner ) print(f"Stored candidate tuning specs in {path_config.specs_dir}\n") if stop_after_phase == libtuner.ExecutionPhases.generate_candidates: @@ -116,7 +81,7 @@ def main(): print("Compiling dispatch candidates...") compiled_candidates = libtuner.compile( - args, path_config, candidates, candidate_trackers, test_tuner + args, path_config, candidates, candidate_trackers, simple_tuner ) if stop_after_phase == libtuner.ExecutionPhases.compile_dispatches: return @@ -127,14 +92,14 @@ def main(): path_config, compiled_candidates, candidate_trackers, - test_tuner, + simple_tuner, args.simple_num_dispatch_candidates, ) if stop_after_phase == libtuner.ExecutionPhases.benchmark_dispatches: return print("Compiling models with top candidates...") - test_tuner.compile_flags = [ + simple_tuner.compile_flags = [ "--iree-hal-target-backends=rocm", f"--iree-hip-target={args.simple_hip_target}", ] @@ -143,14 +108,14 @@ def main(): path_config, top_candidates, candidate_trackers, - test_tuner, + simple_tuner, args.simple_model_file, ) if stop_after_phase == libtuner.ExecutionPhases.compile_models: return print("Benchmarking compiled model candidates...") - test_tuner.benchmark_flags = [ + simple_tuner.benchmark_flags = [ "--benchmark_repetitions=3", "--input=2048x2048xf16", "--input=2048x2048xf16", @@ -160,7 +125,7 @@ def main(): path_config, compiled_model_candidates, candidate_trackers, - test_tuner, + simple_tuner, args.simple_num_model_candidates, ) From 5888624c947c064a2b3a6990b0ac4a0944bc8aba Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 8 Jan 2025 13:39:12 -0500 Subject: [PATCH 16/35] [Tuner] Add support for flag files in the simple tuner (#786) This generalizes the simple tuner such that it can be used for real-world models without having to modify the python code. 
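As a usage sketch (the flag names, file names, and invocation below mirror the README and example flag files added in this change; the device URI is a placeholder):

```bash
# compile_flags.txt holds one iree-compile flag per line, e.g.:
#   --iree-hal-target-backends=rocm
#   --iree-hip-target=gfx942
python -m examples.simple examples/simple/double_mmt.mlir \
  examples/simple/tmp/mmt_benchmark.mlir \
  --simple-compile-flags-file=examples/simple/compile_flags.txt \
  --simple-model-benchmark-flags-file=examples/simple/model_benchmark_flags.txt \
  --devices=hip://0 --num-candidates=30 \
  --simple-num-dispatch-candidates=5 --simple-num-model-candidates=3
```

Keeping the target- and model-specific flags in plain text files means switching to a new model or GPU target only requires editing those files, not the tuner script.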
--- tuner/examples/simple/README.md | 11 +++-- tuner/examples/simple/compile_flags.txt | 2 + .../examples/simple/model_benchmark_flags.txt | 3 ++ tuner/examples/simple/simple_tuner.py | 44 +++++++++++++------ 4 files changed, 42 insertions(+), 18 deletions(-) create mode 100644 tuner/examples/simple/compile_flags.txt create mode 100644 tuner/examples/simple/model_benchmark_flags.txt diff --git a/tuner/examples/simple/README.md b/tuner/examples/simple/README.md index b1aeb12e1..bc8b1bb45 100644 --- a/tuner/examples/simple/README.md +++ b/tuner/examples/simple/README.md @@ -29,6 +29,8 @@ For an initial trial to test the tuning loop, use: cd ../../ python -m examples.simple examples/simple/double_mmt.mlir \ examples/simple/tmp/mmt_benchmark.mlir \ + --simple-compile-flags-file=examples/simple/compile_flags.txt \ + --simple-model-benchmark-flags-file=examples/simple/model_benchmark_flags.txt \ --devices=hip://0 --num-candidates=30 \ --simple-num-dispatch-candidates=5 --simple-num-model-candidates=3 \ ``` @@ -36,10 +38,11 @@ python -m examples.simple examples/simple/double_mmt.mlir \ ### Basic Usage ```shell python -m examples.simple \ - --devices=hip://0 --num-candidates=1024 \ - --test-num-dispatch-candidates= \ - --test-num-model-candidates= \ - --test-hip-target= \ + --devices=hip://0,hip://1 --num-candidates=1024 \ + --simple-compile-flags-file= \ + --simple-model-benchmark-flags-file= \ + --simple-num-dispatch-candidates= \ + --simple-num-model-candidates= \ --num-candidates= \ --codegen-pipeline= ``` diff --git a/tuner/examples/simple/compile_flags.txt b/tuner/examples/simple/compile_flags.txt new file mode 100644 index 000000000..7ab70517b --- /dev/null +++ b/tuner/examples/simple/compile_flags.txt @@ -0,0 +1,2 @@ +--iree-hal-target-backends=rocm +--iree-hip-target=gfx942 diff --git a/tuner/examples/simple/model_benchmark_flags.txt b/tuner/examples/simple/model_benchmark_flags.txt new file mode 100644 index 000000000..4b9ad2480 --- /dev/null +++ b/tuner/examples/simple/model_benchmark_flags.txt @@ -0,0 +1,3 @@ +--device_allocator=caching +--input=2048x2048xf16 +--input=2048x2048xf16 diff --git a/tuner/examples/simple/simple_tuner.py b/tuner/examples/simple/simple_tuner.py index a7f718e9b..bd5b2eca1 100644 --- a/tuner/examples/simple/simple_tuner.py +++ b/tuner/examples/simple/simple_tuner.py @@ -13,8 +13,8 @@ class SimpleTuner(libtuner.TuningClient): def __init__(self, tuner_context: libtuner.TunerContext): super().__init__(tuner_context) - self.compile_flags = ["--compile-from=executable-sources"] - self.benchmark_flags = ["--benchmark_repetitions=3", "--input=1"] + self.compile_flags: list[str] = [] + self.benchmark_flags: list[str] = [] def get_iree_compile_flags(self) -> list[str]: return self.compile_flags @@ -26,6 +26,14 @@ def get_benchmark_timeout_s(self) -> int: return 10 +def read_flags_file(flags_file: str) -> list[str]: + if not flags_file: + return [] + + with open(flags_file) as file: + return file.read().splitlines() + + def main(): # Custom arguments for the example tuner file. 
parser = argparse.ArgumentParser(description="Autotune sample script") @@ -46,10 +54,16 @@ def main(): help="Number of model candidates to produce after tuning.", ) client_args.add_argument( - "--simple-hip-target", + "--simple-compile-flags-file", + type=str, + default="", + help="Path to the flags file for iree-compile.", + ) + client_args.add_argument( + "--simple-model-benchmark-flags-file", type=str, - default="gfx942", - help="Hip target for tuning.", + default="", + help="Path to the flags file for iree-benchmark-module for model benchmarking.", ) # Remaining arguments come from libtuner args = libtuner.parse_arguments(parser) @@ -69,6 +83,11 @@ def main(): libtuner.validate_devices(args.devices) print("Validation successful!\n") + compile_flags: list[str] = read_flags_file(args.simple_compile_flags_file) + model_benchmark_flags: list[str] = read_flags_file( + args.simple_model_benchmark_flags_file + ) + print("Generating candidate tuning specs...") with TunerContext() as tuner_context: simple_tuner = SimpleTuner(tuner_context) @@ -80,6 +99,9 @@ def main(): return print("Compiling dispatch candidates...") + simple_tuner.compile_flags = compile_flags + [ + "--compile-from=executable-sources" + ] compiled_candidates = libtuner.compile( args, path_config, candidates, candidate_trackers, simple_tuner ) @@ -87,6 +109,7 @@ def main(): return print("Benchmarking compiled dispatch candidates...") + simple_tuner.benchmark_flags = ["--input=1", "--benchmark_repetitions=3"] top_candidates = libtuner.benchmark( args, path_config, @@ -99,10 +122,7 @@ def main(): return print("Compiling models with top candidates...") - simple_tuner.compile_flags = [ - "--iree-hal-target-backends=rocm", - f"--iree-hip-target={args.simple_hip_target}", - ] + simple_tuner.compile_flags = compile_flags compiled_model_candidates = libtuner.compile( args, path_config, @@ -115,11 +135,7 @@ def main(): return print("Benchmarking compiled model candidates...") - simple_tuner.benchmark_flags = [ - "--benchmark_repetitions=3", - "--input=2048x2048xf16", - "--input=2048x2048xf16", - ] + simple_tuner.benchmark_flags = model_benchmark_flags top_model_candidates = libtuner.benchmark( args, path_config, From 74f03e3b8328abf70990dd637c4a8770b0271255 Mon Sep 17 00:00:00 2001 From: Archana Ramalingam <98564406+archana-ramalingam@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:10:39 -0800 Subject: [PATCH 17/35] [sharktank] Temporarily disable IREE perplexity tests (#782) Temporarily disable IREE perplexity tests till #752 is investigated and merged. --------- Co-authored-by: Scott Todd --- sharktank/tests/evaluate/perplexity_iree_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sharktank/tests/evaluate/perplexity_iree_test.py b/sharktank/tests/evaluate/perplexity_iree_test.py index 1e42bde9c..dc655af59 100644 --- a/sharktank/tests/evaluate/perplexity_iree_test.py +++ b/sharktank/tests/evaluate/perplexity_iree_test.py @@ -34,6 +34,7 @@ def setUp(self): with open(self.baseline_perplexity_scores, "r") as f: self.baseline_perplexity = json.load(f) + @pytest.mark.xfail(reason="Runtime segfault", run=False) def test_llama3_8B_f16_decomposed(self): # Llama 3.1 8B decomposed From 0c53fb0c011b3f80f9177c5d1107598c31062965 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Wed, 8 Jan 2025 11:16:31 -0800 Subject: [PATCH 18/35] Add missing requirements for sharktank t5 tests. (#788) Fixes https://github.com/nod-ai/shark-ai/issues/787. 
See the workflow passing now: https://github.com/nod-ai/shark-ai/actions/runs/12676783895/job/35330696315?pr=788. We could also install another package that transitively includes `protobuf`, like `transformers[sentencepiece]` or `onnx`. I'm also not sure if this should be a test requirement or a release requirement, given how limited and indirect the usage is, so just starting simple and explicit. --- sharktank/requirements-tests.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/sharktank/requirements-tests.txt b/sharktank/requirements-tests.txt index e42d68c8c..3532fb50c 100644 --- a/sharktank/requirements-tests.txt +++ b/sharktank/requirements-tests.txt @@ -1,6 +1,7 @@ datasets==3.0.0 diffusers parameterized +protobuf pytest==8.0.0 pytest-html pytest-xdist==3.5.0 From 262d8f18b3f3942e24303040446027e5a56ce489 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Wed, 8 Jan 2025 14:30:34 -0800 Subject: [PATCH 19/35] Patch version computations in pypi_deploy.sh. (#795) Adding `-stable` fixes ``` Computing local versions for sharktank and shortfin... usage: compute_local_version.py [-h] [--write-json] (-stable | -rc | -dev | --version-suffix VERSION_SUFFIX) path compute_local_version.py: error: one of the arguments -stable/--stable-release -rc/--nightly-release -dev/--development-release --version-suffix is required ``` Adding `--write-json` fixes the `write_requirements.py` step from either * using previously written JSON files if present (resulting in old version pins in the meta package, oops!) * failing to open files that don't exist (failing to release, but at least not using the wrong versions) Future changes should put more logging in place and add a confirmation step. I only caught this because I ran _part_ of the script and paused to verify manually. --- build_tools/python_deploy/pypi_deploy.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/python_deploy/pypi_deploy.sh b/build_tools/python_deploy/pypi_deploy.sh index 63f123ac0..28b3df303 100755 --- a/build_tools/python_deploy/pypi_deploy.sh +++ b/build_tools/python_deploy/pypi_deploy.sh @@ -95,8 +95,8 @@ function build_shark_ai_meta_package() { # TODO: rework `write_requirements.py` to use the versions from the downloaded whls? echo "Computing local versions for sharktank and shortfin..." - ${SCRIPT_DIR}/compute_local_version.py ${REPO_ROOT}/sharktank - ${SCRIPT_DIR}/compute_local_version.py ${REPO_ROOT}/shortfin + ${SCRIPT_DIR}/compute_local_version.py ${REPO_ROOT}/sharktank -stable --write-json + ${SCRIPT_DIR}/compute_local_version.py ${REPO_ROOT}/shortfin -stable --write-json echo "Computing common version for shark-ai meta package..." ${SCRIPT_DIR}/compute_common_version.py --stable-release --write-json From 79b0595976510ffacc677c3e9196ca6cbad03a3a Mon Sep 17 00:00:00 2001 From: Marius Brehler Date: Wed, 8 Jan 2025 23:32:05 +0100 Subject: [PATCH 20/35] Point to iree-turbine nightly releases (#794) Advise to install the latest nightly release instead of cloning and installing and editable version of iree-turbine. --- docs/developer_guide.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/developer_guide.md b/docs/developer_guide.md index c347b9dac..a202a96e1 100644 --- a/docs/developer_guide.md +++ b/docs/developer_guide.md @@ -93,8 +93,15 @@ different variant, run one of these commands first: # Install editable local projects. 
pip install -r requirements.txt -e sharktank/ shortfin/ -# Optionally clone and install the latest editable iree-turbine dep in deps/, -# along with nightly versions of iree-base-compiler and iree-base-runtime. +# Install the latest nightly release of iree-turbine, alond with +# nightly versions of iree-base-compiler and iree-base-runtime. +pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \ + iree-base-compiler iree-base-runtime iree-turbine +``` + +You can also install an editable iree-turbine dep: +```bash +# Optionally clone and install the latest editable iree-turbine dep in deps/. pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \ iree-base-compiler iree-base-runtime --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" From 080b0909b94cfc28ee634e7d6f1086172828bd13 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Wed, 8 Jan 2025 14:51:07 -0800 Subject: [PATCH 21/35] Bump version to 3.2.0 after releasing 3.1.0. (#796) We just published version 3.1.0: https://github.com/nod-ai/shark-ai/releases/tag/v3.1.0. --- sharktank/version.json | 2 +- shortfin/version.json | 2 +- tuner/version.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sharktank/version.json b/sharktank/version.json index 9519501ae..4d2648408 100644 --- a/sharktank/version.json +++ b/sharktank/version.json @@ -1,3 +1,3 @@ { - "package-version": "3.1.0.dev" + "package-version": "3.2.0.dev" } diff --git a/shortfin/version.json b/shortfin/version.json index 9519501ae..4d2648408 100644 --- a/shortfin/version.json +++ b/shortfin/version.json @@ -1,3 +1,3 @@ { - "package-version": "3.1.0.dev" + "package-version": "3.2.0.dev" } diff --git a/tuner/version.json b/tuner/version.json index 9519501ae..4d2648408 100644 --- a/tuner/version.json +++ b/tuner/version.json @@ -1,3 +1,3 @@ { - "package-version": "3.1.0.dev" + "package-version": "3.2.0.dev" } From ec73934c0447e1066aed1c708b15bb6279e0c20a Mon Sep 17 00:00:00 2001 From: Max191 <44243577+Max191@users.noreply.github.com> Date: Wed, 8 Jan 2025 17:54:35 -0500 Subject: [PATCH 22/35] [tuner] Allow tuning of pipeline options (#791) This PR adds some command line arguments to tune pipeline options. The new flags are `--prefetch-shared-memory-options`, `--no-reduce-shared-memory-bank-conflicts-options`, and `--waves-per-eu-options`. The flags take a comma separated list of values (bool or int), representing the possible values that the corresponding pipeline options can take in tuning. This PR also adds `promote_operands = [0, 1]` to the TileAndFuse tuning configurations. This could potentially be a tuned parameter in the future, but for now it is typically good to promote the lhs and rhs operands. 
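A hedged invocation sketch, assuming `candidate_gen.py` is run directly as a script (the entry point and input path shown are placeholders; the flag names and comma-separated value format come from this change):

```bash
# Each flag lists the values the search may choose from; candidate generation
# then enumerates the cross product of pipeline options and waves_per_eu settings.
python candidate_gen.py dispatch_benchmark.mlir \
  --prefetch-shared-memory-options=true,false \
  --no-reduce-shared-memory-bank-conflicts-options=true,false \
  --waves-per-eu-options=2,4
```

The boolean flags parse their values case-insensitively against the string "true", while `--waves-per-eu-options` takes a list of positive integers.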
Signed-off-by: Max Dawkins --- tuner/tuner/candidate_gen.py | 37 +++++- tuner/tuner/common.py | 2 +- tuner/tuner/dispatch_constraints.py | 139 +++++++++++++++-------- tuner/tuner/dispatch_constraints_test.py | 8 -- tuner/tuner/libtuner.py | 25 ++++ 5 files changed, 153 insertions(+), 58 deletions(-) diff --git a/tuner/tuner/candidate_gen.py b/tuner/tuner/candidate_gen.py index d7b2da6f6..1e1e48d66 100644 --- a/tuner/tuner/candidate_gen.py +++ b/tuner/tuner/candidate_gen.py @@ -194,6 +194,8 @@ def generate_configs_and_td_specs( tuner_context: TunerContext, limit: int = 4096, # Max candidates to be generated num_subgroups: int = 4, # GPU spec, used to determine candidate generation constraints + allowed_waves_per_eu: list[int] = [2], + pipeline_options_search_space: PipelineOptionsSearchSpace = PipelineOptionsSearchSpace(), codegen_pipeline: iree_codegen.DispatchLoweringPassPipeline = iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute, ) -> list[ir.Module]: dispatch_tuner_registry = DispatchTunerRegistry(check_translation_info=False) @@ -223,7 +225,13 @@ def generate_configs_and_td_specs( mma_list = iree_codegen.query_mma_intrinsics(variant_op) for i, config in enumerate( generate_solutions( - tuner_context, problem_size, num_subgroups, mma_list, codegen_pipeline + tuner_context, + problem_size, + num_subgroups, + mma_list, + allowed_waves_per_eu, + pipeline_options_search_space, + codegen_pipeline, ) ): if i >= limit: @@ -348,6 +356,24 @@ def main(): type=int, default=-1, ) + parser.add_argument( + "--prefetch-shared-memory-options", + type=lambda t: [s.strip().lower() == "true" for s in t.split(",")], + default=[True], + help="Comma-separated list of allowed values for the prefetch_shared_memory pipeline option. Possible values: [True, False]", + ) + parser.add_argument( + "--no-reduce-shared-memory-bank-conflicts-options", + type=lambda t: [s.strip().lower() == "true" for s in t.split(",")], + default=[None], + help="Comma-separated list of allowed values for the no_reduce_shared_memory_bank_conflicts pipeline option. Possible values: [True, False]", + ) + parser.add_argument( + "--waves-per-eu-options", + type=lambda t: [int(s) for s in t.split(",")], + default=[2], + help="Comma-separated list of allowed values for the waves_per_eu config option. Possible values: Any positive integer value", + ) parser.add_argument( "--verbose", "-v", action="store_true", help="Enable verbose output to stdout" ) @@ -363,15 +389,20 @@ def main(): console_handler.setFormatter(formatter) tune_logger.addHandler(console_handler) - with ir.Context() as ctx: - tuner_ctx = TunerContext(ctx, tune_logger) + with TunerContext() as tuner_ctx: mlir_text = strip_compilation_info(args.input) mlir_module = parse_mlir(mlir_text, tuner_ctx) + pipeline_options_search_space = PipelineOptionsSearchSpace( + prefetch_shared_memory=args.prefetch_shared_memory_options, + no_reduce_shared_memory_bank_conflicts=args.no_reduce_shared_memory_bank_conflicts_options, + ) specs = generate_configs_and_td_specs( mlir_module, tuner_ctx, args.limit, args.num_subgroups, + args.waves_per_eu_options, + pipeline_options_search_space, iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse, ) for candidate_num, spec in enumerate(specs): diff --git a/tuner/tuner/common.py b/tuner/tuner/common.py index 8efac1653..05820526b 100644 --- a/tuner/tuner/common.py +++ b/tuner/tuner/common.py @@ -174,7 +174,7 @@ def get_lowering_config( # A local variable to hold the transformed value. 
promoted_value = value match key: - case "workgroup" | "reduction" | "subgroup": + case "workgroup" | "reduction" | "subgroup" | "promote_operands": if isinstance(value, list): promoted_value = ir.ArrayAttr.get( [tuner_ctx.type.getI64(x) for x in value] diff --git a/tuner/tuner/dispatch_constraints.py b/tuner/tuner/dispatch_constraints.py index c20325249..b41d808b9 100644 --- a/tuner/tuner/dispatch_constraints.py +++ b/tuner/tuner/dispatch_constraints.py @@ -86,7 +86,6 @@ def generate_vector_distribute_constraints( workgroup_size: list[z3.ArithRef], subgroup_m_count: z3.ArithRef, subgroup_n_count: z3.ArithRef, - waves_per_eu: z3.ArithRef, mma_intrinsics: list[iree_gpu.MMAIntrinsic], ): M, N, K = ( @@ -142,9 +141,6 @@ def generate_vector_distribute_constraints( else: constraints += [subgroups >= 1, subgroups <= 10] - constraints += [waves_per_eu == 2] - # constraints += [z3.Or(waves_per_eu == 2, waves_per_eu == 3, waves_per_eu == 4)] - shared_memory = calculate_shared_memory_usage_in_bytes(problem_size, [m], [n], [k]) constraints += [shared_memory <= 65536] @@ -162,7 +158,6 @@ def generate_tile_and_fuse_constraints( workgroup_size: list[z3.ArithRef], subgroup_m_count: z3.ArithRef, subgroup_n_count: z3.ArithRef, - waves_per_eu: z3.ArithRef, mma_intrinsics: list[iree_gpu.MMAIntrinsic], ): M, N, K = problem_size.MNK @@ -230,8 +225,6 @@ def generate_tile_and_fuse_constraints( constraints += [subgroups >= 1, subgroups <= 10] constraints += [wg_threads == subgroups * subgroup_size] - constraints += [waves_per_eu == 2] - shared_memory = calculate_shared_memory_usage_in_bytes( problem_size, m_tiles, n_tiles, k_tiles ) @@ -269,11 +262,89 @@ def getMMAAttr( ) +@dataclass +class PipelineOptionsSearchSpace: + prefetch_shared_memory: list[Optional[bool]] = field(default_factory=lambda: [None]) + no_reduce_shared_memory_bank_conflicts: list[Optional[bool]] = field( + default_factory=lambda: [None] + ) + + +def generate_allowed_pipeline_options( + pipeline_options_search_space: PipelineOptionsSearchSpace, +) -> list[iree_gpu.PipelineOptionsAttr]: + pipeline_options_list = [] + for psm in pipeline_options_search_space.prefetch_shared_memory: + for ( + nrbc + ) in pipeline_options_search_space.no_reduce_shared_memory_bank_conflicts: + pipeline_options_list.append( + iree_gpu.PipelineOptionsAttr.get( + prefetch_shared_memory=psm, + no_reduce_shared_memory_bank_conflicts=nrbc, + ) + ) + return pipeline_options_list + + +def generate_compilation_infos( + tuner_ctx: TunerContext, + mma_attr: iree_gpu.MMAAttr, + workgroup_tile_sizes: list[int], + reduction_tile_sizes: list[int], + subgroup_tile_sizes: list[int], + workgroup_sizes: tuple[int, int, int], + subgroup_size: int, + subgroup_m_count: int, + subgroup_n_count: int, + codegen_pipeline: iree_codegen.DispatchLoweringPassPipeline, + pipeline_options_search_space: PipelineOptionsSearchSpace, + allowed_waves_per_eu: list[int], +) -> list[iree_codegen.CompilationInfoAttr]: + # Create the LoweringConfigAttr. 
+ lowering_config_args = { + "tuner_ctx": tuner_ctx, + "mma_kind": mma_attr, + "workgroup": workgroup_tile_sizes, + "reduction": reduction_tile_sizes, + "subgroup_m_count": subgroup_m_count, + "subgroup_n_count": subgroup_n_count, + } + if codegen_pipeline == iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse: + lowering_config_args["subgroup"] = subgroup_tile_sizes + lowering_config_args["promote_operands"] = [0, 1] + lowering_config = get_lowering_config(**lowering_config_args) + + # Create the TranslationInfoAttr + pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get(codegen_pipeline) + pipeline_options_list = generate_allowed_pipeline_options( + pipeline_options_search_space + ) + wg_x, wg_y, wg_z = workgroup_sizes + compilation_infos = [] + for pipeline_options in pipeline_options_list: + for waves_per_eu in allowed_waves_per_eu: + config_dict = get_translation_info_config(pipeline_options, waves_per_eu) + translation_info = iree_codegen.TranslationInfoAttr.get( + pipeline_attr, + None, + [wg_x, wg_y, wg_z], + subgroup_size, + config_dict, + ) + compilation_infos.append( + iree_codegen.CompilationInfoAttr.get(lowering_config, translation_info) + ) + return compilation_infos + + def generate_solutions( tuner_ctx: TunerContext, problem_size: ProblemSize, num_subgrups: int, mma_intrinsics: list[iree_gpu.MMAIntrinsic], + allowed_waves_per_eu: list[int] = [2], + pipeline_options_search_space: PipelineOptionsSearchSpace = PipelineOptionsSearchSpace(), codegen_pipeline: iree_codegen.DispatchLoweringPassPipeline = iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute, ) -> Iterator[iree_codegen.CompilationInfoAttr]: M, N, K = problem_size.MNK @@ -290,7 +361,6 @@ def generate_solutions( wg_x, wg_y, wg_z = z3.Int("wg_x"), z3.Int("wg_y"), z3.Int("wg_z") sg_m_cnt = z3.Int("sg_m_cnt") sg_n_cnt = z3.Int("sg_n_cnt") - waves_per_eu = z3.Int("waves_per_eu") all_vars = ( m_vars + n_vars @@ -304,7 +374,6 @@ def generate_solutions( wg_z, sg_m_cnt, sg_n_cnt, - waves_per_eu, ] ) @@ -320,7 +389,6 @@ def generate_solutions( [wg_x, wg_y, wg_z], sg_m_cnt, sg_n_cnt, - waves_per_eu, mma_intrinsics, ) constraints += [v == 0 for v in subgroup_m_vars + subgroup_n_vars] @@ -334,7 +402,6 @@ def generate_solutions( [wg_x, wg_y, wg_z], sg_m_cnt, sg_n_cnt, - waves_per_eu, mma_intrinsics, ) solver.add(z3.simplify(z3.And(constraints))) @@ -407,43 +474,23 @@ def set_cdim_tile_sizes(tile_sizes, contraction_dims, csizes): [lookup(v) for v in k_vars], ) - # Create the LoweringConfigAttr. 
- lowering_config_args = { - "tuner_ctx": tuner_ctx, - "mma_kind": mma_attr, - "workgroup": workgroup_tile_sizes, - "reduction": reduction_tile_sizes, - "subgroup_m_count": lookup(sg_m_cnt), - "subgroup_n_count": lookup(sg_n_cnt), - } - if ( - codegen_pipeline - == iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse - ): - lowering_config_args["subgroup"] = subgroup_tile_sizes - lowering_config = get_lowering_config(**lowering_config_args) - - # Create the TranslationInfoAttr - pipeline_attr = iree_codegen.DispatchLoweringPassPipelineAttr.get( - codegen_pipeline - ) - pipeline_options = iree_gpu.PipelineOptionsAttr.get() - config_dict = get_translation_info_config( - pipeline_options, lookup(waves_per_eu) - ) - translation_info = iree_codegen.TranslationInfoAttr.get( - pipeline_attr, - None, - [lookup(wg_x), lookup(wg_y), lookup(wg_z)], + compilation_infos = generate_compilation_infos( + tuner_ctx, + mma_attr, + workgroup_tile_sizes, + reduction_tile_sizes, + subgroup_tile_sizes, + (lookup(wg_x), lookup(wg_y), lookup(wg_z)), lookup(subgroup_size), - config_dict, - ) - - # Create the CompilationInfoAttr. - compilation_info = iree_codegen.CompilationInfoAttr.get( - lowering_config, translation_info + lookup(sg_m_cnt), + lookup(sg_n_cnt), + codegen_pipeline, + pipeline_options_search_space, + allowed_waves_per_eu, ) solver.add(z3.simplify(z3.Not(z3.And(list(x == model[x] for x in all_vars))))) i += 1 - yield compilation_info + + for compilation_info in compilation_infos: + yield compilation_info diff --git a/tuner/tuner/dispatch_constraints_test.py b/tuner/tuner/dispatch_constraints_test.py index 9a34e41db..45f6c767e 100644 --- a/tuner/tuner/dispatch_constraints_test.py +++ b/tuner/tuner/dispatch_constraints_test.py @@ -166,7 +166,6 @@ def test_generate_tile_and_fuse_constraints_valid_input( ) sg_m_cnt = z3.Int("sg_m_cnt") sg_n_cnt = z3.Int("sg_n_cnt") - waves_per_eu = z3.Int("waves_per_eu") constraints = dispatch_constraints.generate_tile_and_fuse_constraints( problem_size, @@ -177,7 +176,6 @@ def test_generate_tile_and_fuse_constraints_valid_input( [wg_x, wg_y, wg_z], sg_m_cnt, sg_n_cnt, - waves_per_eu, [ iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, @@ -240,7 +238,6 @@ def test_generate_tile_and_fuse_constraints_invalid_input( ) sg_m_cnt = z3.Int("sg_m_cnt") sg_n_cnt = z3.Int("sg_n_cnt") - waves_per_eu = z3.Int("waves_per_eu") constraints = dispatch_constraints.generate_tile_and_fuse_constraints( problem_size, @@ -251,7 +248,6 @@ def test_generate_tile_and_fuse_constraints_invalid_input( [wg_x, wg_y, wg_z], sg_m_cnt, sg_n_cnt, - waves_per_eu, [ iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, @@ -300,7 +296,6 @@ def test_generate_vector_distribute_constraints_valid_input( ) sg_m_cnt = z3.Int("sg_m_cnt") sg_n_cnt = z3.Int("sg_n_cnt") - waves_per_eu = z3.Int("waves_per_eu") constraints = dispatch_constraints.generate_vector_distribute_constraints( problem_size, @@ -311,7 +306,6 @@ def test_generate_vector_distribute_constraints_valid_input( [wg_x, wg_y, wg_z], sg_m_cnt, sg_n_cnt, - waves_per_eu, [ iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, @@ -359,7 +353,6 @@ def test_generate_vector_distribute_constraints_invalid_input( ) sg_m_cnt = z3.Int("sg_m_cnt") sg_n_cnt = z3.Int("sg_n_cnt") - waves_per_eu = z3.Int("waves_per_eu") constraints = dispatch_constraints.generate_vector_distribute_constraints( problem_size, @@ -370,7 +363,6 @@ def 
test_generate_vector_distribute_constraints_invalid_input( [wg_x, wg_y, wg_z], sg_m_cnt, sg_n_cnt, - waves_per_eu, [ iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, iree_gpu.MMAIntrinsic.MFMA_F32_32x32x8_F16, diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py index b18736ffb..bd0f1b27a 100644 --- a/tuner/tuner/libtuner.py +++ b/tuner/tuner/libtuner.py @@ -40,6 +40,7 @@ from . import dispatch_parser from .op_matchers import * from .common import * +from .dispatch_constraints import * # Default values for num_candidates and devices, change it as needed @@ -300,6 +301,24 @@ def parse_arguments( candidate_gen_args.add_argument( "--tile-dims", help="Map of tile size matmul dims", type=str, default="mnk" ) + candidate_gen_args.add_argument( + "--prefetch-shared-memory-options", + type=lambda t: [s.strip().lower() == "true" for s in t.split(",")], + default=[True], + help="Comma-separated list of allowed values for the prefetch_shared_memory pipeline option. Possible values: [True, False]", + ) + candidate_gen_args.add_argument( + "--no-reduce-shared-memory-bank-conflicts-options", + type=lambda t: [s.strip().lower() == "true" for s in t.split(",")], + default=[None], + help="Comma-separated list of allowed values for the no_reduce_shared_memory_bank_conflicts pipeline option. Possible values: [True, False]", + ) + candidate_gen_args.add_argument( + "--waves-per-eu-options", + type=lambda t: [int(s) for s in t.split(",")], + default=[2], + help="Comma-separated list of allowed values for the waves_per_eu config option. Possible values: Any positive integer value", + ) general_args.add_argument( "--codegen-pipeline", choices=[x.value for x in CodegenPipelines], @@ -642,11 +661,17 @@ def generate_candidate_specs( mlir_text = candidate_gen.strip_compilation_info(path_config.template_mlir) mlir_module = dispatch_parser.parse_mlir(mlir_text, tuning_client.tuner_context) logging.debug("Captured messages from candidate_gen.py:") + pipeline_options_search_space = PipelineOptionsSearchSpace( + prefetch_shared_memory=args.prefetch_shared_memory_options, + no_reduce_shared_memory_bank_conflicts=args.no_reduce_shared_memory_bank_conflicts_options, + ) config_specs: list[ir.Module] = candidate_gen.generate_configs_and_td_specs( input_module=mlir_module, tuner_context=tuning_client.tuner_context, limit=args.num_candidates, num_subgroups=args.num_subgroups, + allowed_waves_per_eu=args.waves_per_eu_options, + pipeline_options_search_space=pipeline_options_search_space, codegen_pipeline=get_iree_codegen_pipeline(args.codegen_pipeline), ) logging.debug("candidate_gen.py ends") From f7153225818b53f02c964e60c769a3eaf5087ae7 Mon Sep 17 00:00:00 2001 From: Archana Ramalingam <98564406+archana-ramalingam@users.noreply.github.com> Date: Wed, 8 Jan 2025 15:20:01 -0800 Subject: [PATCH 23/35] [sharktank] Update perplexity status badge (#790) Update perplexity status badge to reflect latest run on main branch [here](https://github.com/nod-ai/shark-ai/tree/main/sharktank#project-status). --- sharktank/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sharktank/README.md b/sharktank/README.md index 7770595ed..bfdac8b4d 100644 --- a/sharktank/README.md +++ b/sharktank/README.md @@ -12,7 +12,7 @@ tooling. 
## Project Status -[![CI - Perplexity](https://github.com/nod-ai/shark-ai/actions/workflows/ci_eval.yaml/badge.svg?branch=main&event=schedule)](https://github.com/nod-ai/shark-ai/actions/workflows/ci_eval.yaml) +[![CI - sharktank perplexity](https://github.com/nod-ai/shark-ai/actions/workflows/ci_eval.yaml/badge.svg?branch=main)](https://github.com/nod-ai/shark-ai/actions/workflows/ci_eval.yaml) ## Examples From 7849f8eb49c7519da48aa794322962211bc9b091 Mon Sep 17 00:00:00 2001 From: Bangtian Liu Date: Wed, 8 Jan 2025 21:12:33 -0500 Subject: [PATCH 24/35] [tuner] clean up candidate gen (#797) - removed unused function `apply_params` - removed unused function `validate translation` Signed-off-by: Bangtian Liu --- tuner/tuner/candidate_gen.py | 45 ++---------------------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/tuner/tuner/candidate_gen.py b/tuner/tuner/candidate_gen.py index 1e1e48d66..07a694131 100644 --- a/tuner/tuner/candidate_gen.py +++ b/tuner/tuner/candidate_gen.py @@ -37,17 +37,6 @@ class DispatchTuner(DispatchParser): - # TODO(https://github.com/nod-ai/shark-ai/issues/453): Remove this in favor of configuring using transform dialect. - @abstractmethod - def apply_params( - self, - problem_size: ProblemSize, - template: list[str], - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> MLIRTransformation: - """Apply parameter transformations to the operation.""" - pass - @abstractmethod def get_td_spec( self, @@ -59,25 +48,13 @@ def get_td_spec( class DispatchTunerRegistry: - def __init__(self, check_translation_info=True): - self.check_translation_info = check_translation_info + def __init__(self): self.registry = set() def register(self, dispatch_tuners: list[DispatchTuner]) -> None: for dispatch_tuner in dispatch_tuners: self.registry.add(dispatch_tuner) - # TODO(Max191): Remove translation info validation. 
- def validate_translation(self, attrs: list[ir.NamedAttribute]) -> bool: - if not self.check_translation_info: - return True - for attr in attrs: - if (attr.name == "translation_info") and ( - "LLVMGPUVectorDistribute" in str(attr.attr) - ): - return True - assert False, "Translation info not supported" - def find_handler(self, op_name: str) -> DispatchTuner: for dispatch_tuner in self.registry: if dispatch_tuner.supports(op_name): @@ -86,14 +63,6 @@ def find_handler(self, op_name: str) -> DispatchTuner: class ContractionOpInterfaceTuner(DispatchTuner, ContractionOpInterfaceParser): - def apply_params( - self, - problem_size: ProblemSize, - template: list[str], - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> MLIRTransformation: - raise NotImplementedError - def get_td_spec( self, ir_module: ir.Module, @@ -114,14 +83,6 @@ def get_td_spec( class ConvolutionOpInterfaceTuner(DispatchTuner, ConvolutionOpInterfaceParser): - def apply_params( - self, - problem_size: ProblemSize, - template: list[str], - compilation_info: iree_codegen.CompilationInfoAttr, - ) -> MLIRTransformation: - raise NotImplementedError - def get_td_spec( self, ir_module: ir.Module, @@ -158,8 +119,6 @@ def walk_callback_get_fn( walk_result: OpWalkResult, dispatch_tuner_registry: DispatchTunerRegistry, ) -> ir.WalkResult: - if op.name == "func.func": - dispatch_tuner_registry.validate_translation([a for a in op.opview.attributes]) if op.name == "util.func": func_name = str(op.opview.sym_name) walk_result.was_interrupted = True @@ -198,7 +157,7 @@ def generate_configs_and_td_specs( pipeline_options_search_space: PipelineOptionsSearchSpace = PipelineOptionsSearchSpace(), codegen_pipeline: iree_codegen.DispatchLoweringPassPipeline = iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute, ) -> list[ir.Module]: - dispatch_tuner_registry = DispatchTunerRegistry(check_translation_info=False) + dispatch_tuner_registry = DispatchTunerRegistry() dispatch_tuner_registry.register( [ ContractionOpInterfaceTuner(), From c2ad576463e1b6b10b706a82ef57082ed9eb60d6 Mon Sep 17 00:00:00 2001 From: Max191 <44243577+Max191@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:30:37 -0500 Subject: [PATCH 25/35] [tuner] Write candidate specs with local scope (#800) This PR changes the format of generated TD specs to inline the compilation info into the matcher function. This makes splitting the TD spec and combining with others after tuning much easier. 
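For reference, a minimal sketch of what local-scope printing does, assuming the
`iree.compiler` Python bindings are available (the tiny module below is a
placeholder, not a real TD spec):

```python
from iree.compiler import ir

with ir.Context():
    module = ir.Module.parse("module { func.func @f() { return } }")
    # With use_local_scope=True, attribute aliases (such as the generated
    # compilation info) are printed inline at their use sites instead of being
    # hoisted to the top of the module, which is what makes the emitted specs
    # easy to split apart and recombine.
    print(module.operation.get_asm(use_local_scope=True))
```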
Signed-off-by: Max Dawkins --- tuner/tuner/candidate_gen.py | 5 +++-- tuner/tuner/libtuner.py | 6 +++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tuner/tuner/candidate_gen.py b/tuner/tuner/candidate_gen.py index 07a694131..a1ee421d0 100644 --- a/tuner/tuner/candidate_gen.py +++ b/tuner/tuner/candidate_gen.py @@ -355,7 +355,7 @@ def main(): prefetch_shared_memory=args.prefetch_shared_memory_options, no_reduce_shared_memory_bank_conflicts=args.no_reduce_shared_memory_bank_conflicts_options, ) - specs = generate_configs_and_td_specs( + specs: list[ir.Module] = generate_configs_and_td_specs( mlir_module, tuner_ctx, args.limit, @@ -369,7 +369,8 @@ def main(): spec_path = spec_dir / f"{candidate_num}_spec.mlir" spec_dir.mkdir(parents=True, exist_ok=True) with open(spec_path, "w") as f: - f.write(str(spec)) + local_scope_spec_str: str = spec.operation.get_asm(use_local_scope=True) + f.write(local_scope_spec_str) if __name__ == "__main__": diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py index bd0f1b27a..6184d0c95 100644 --- a/tuner/tuner/libtuner.py +++ b/tuner/tuner/libtuner.py @@ -688,7 +688,11 @@ def generate_candidate_specs( candidate_num ) with open(spec_path, "w") as f: - f.write(str(spec)) + # Write the module with local scope so that compilation info + # attributes are inlined. This makes it easier to split up the + # TD spec and combine with other specs after tuning. + local_scope_spec_str: str = spec.operation.get_asm(use_local_scope=True) + f.write(local_scope_spec_str) new_candidate = CandidateTracker( mlir_path=path_config.template_mlir, candidate_id=candidate_num, From 1e23a74fd1bcad87e0af3c4e2b32a8323c16c7ff Mon Sep 17 00:00:00 2001 From: Max191 <44243577+Max191@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:56:19 -0500 Subject: [PATCH 26/35] [tuner] Add timeout for compilation in TuningClient (#798) This PR adds a timeout for compilation to prevent the tuner from hanging when iree-compile hangs. The `iree.compiler.compile_file` binding is no longer used, since it does not support a timeout. Instead, the command is run as a regular subprocess. 
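A simplified sketch of the pattern (the real code goes through the
`RunPack`/`run_command` plumbing); it assumes `iree-compile` is on `PATH`:

```python
import subprocess

def compile_candidate(input_mlir: str, td_spec: str, output_vmfb: str, timeout_s: int) -> bool:
    cmd = [
        "iree-compile",
        input_mlir,
        f"--iree-codegen-tuning-spec-path={td_spec}",
        f"-o={output_vmfb}",
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout_s)
    except subprocess.TimeoutExpired:
        # A hung iree-compile invocation is treated as a failed candidate
        # instead of blocking the whole tuning run.
        return False
    return proc.returncode == 0
```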
--------- Signed-off-by: Max Dawkins --- tuner/examples/simple/simple_tuner.py | 5 ++++ tuner/tuner/libtuner.py | 40 +++++++++++++++------------ 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/tuner/examples/simple/simple_tuner.py b/tuner/examples/simple/simple_tuner.py index bd5b2eca1..d4ec089fe 100644 --- a/tuner/examples/simple/simple_tuner.py +++ b/tuner/examples/simple/simple_tuner.py @@ -15,10 +15,14 @@ def __init__(self, tuner_context: libtuner.TunerContext): super().__init__(tuner_context) self.compile_flags: list[str] = [] self.benchmark_flags: list[str] = [] + self.compile_timeout: int = 10 def get_iree_compile_flags(self) -> list[str]: return self.compile_flags + def get_iree_compile_timeout_s(self) -> int: + return self.compile_timeout + def get_iree_benchmark_module_flags(self) -> list[str]: return self.benchmark_flags @@ -123,6 +127,7 @@ def main(): print("Compiling models with top candidates...") simple_tuner.compile_flags = compile_flags + simple_tuner.compile_timeout = 60 compiled_model_candidates = libtuner.compile( args, path_config, diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py index 6184d0c95..63740ee9b 100644 --- a/tuner/tuner/libtuner.py +++ b/tuner/tuner/libtuner.py @@ -110,6 +110,10 @@ def __init__(self, tuner_context: TunerContext): def get_iree_compile_flags(self) -> list[str]: pass + @abstractmethod + def get_iree_compile_timeout_s(self) -> int: + pass + @abstractmethod def get_iree_benchmark_module_flags(self) -> list[str]: pass @@ -122,6 +126,7 @@ def get_benchmark_timeout_s(self) -> int: @dataclass class CompilePack: iree_compile_flags: list[str] + iree_compile_timeout: int candidate_tracker: CandidateTracker @@ -440,30 +445,29 @@ def run_iree_compile_command(compile_pack: CompilePack) -> Optional[int]: logging.debug( f"Compiling candidate {candidate_tracker.candidate_id} with spec: {td_spec_path}" ) - extra_flags = [ - f"--iree-codegen-tuning-spec-path={td_spec_path}", - ] - extra_flags += compile_pack.iree_compile_flags assert candidate_tracker.compiled_vmfb_path, "expected output vmfb path" output_path = candidate_tracker.compiled_vmfb_path.as_posix() crash_dump_path = f"{output_path}.crash_report.mlir" assert candidate_tracker.mlir_path, "expected input mlir file path" input_file = candidate_tracker.mlir_path.as_posix() - # TODO(Max191): Make the device in `traget_backends` a command line option - # instead of hardcoding in ireec.compile_str. 
- try: - ireec.compile_file( - input_file=input_file, - target_backends=["rocm"], - output_file=output_path, - extra_args=extra_flags, - crash_reproducer_path=crash_dump_path, + iree_compile = ireec.binaries.find_tool("iree-compile") + compile_command = [ + iree_compile, + input_file, + f"-o={output_path}", + f"--mlir-pass-pipeline-crash-reproducer={crash_dump_path}", + f"--iree-codegen-tuning-spec-path={td_spec_path}", + ] + compile_command += compile_pack.iree_compile_flags + result = candidate_gen.run_command( + candidate_gen.RunPack( + command=compile_command, + check=False, + timeout_seconds=compile_pack.iree_compile_timeout, ) - except ireec.CompilerToolError as e: - logging.info(f"Compilation returned non-zero exit status.") - logging.debug(e) + ) + if result.process_res is None or result.is_timeout: return None - return candidate_tracker.candidate_id @@ -775,6 +779,7 @@ def compile( task_list = [ CompilePack( iree_compile_flags=tuning_client.get_iree_compile_flags(), + iree_compile_timeout=tuning_client.get_iree_compile_timeout_s(), candidate_tracker=candidate_trackers[i], ) for i in candidates @@ -783,6 +788,7 @@ def compile( task_list.append( CompilePack( iree_compile_flags=tuning_client.get_iree_compile_flags(), + iree_compile_timeout=tuning_client.get_iree_compile_timeout_s(), candidate_tracker=candidate_trackers[0], ) ) From 500df5808cd51859203e0451d79d809d916c4b8c Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Thu, 9 Jan 2025 09:30:35 -0800 Subject: [PATCH 27/35] Bump IREE version pins to 3.2.0rc20250109. (#802) See https://github.com/nod-ai/shark-ai/issues/760 for context. We want to stay close to the latest versions while still pinning versions for predictability. Updating version pins is currently a manual process but we plan on automating it in the future. We can decide how noisy we want these dependency updates to be: * new PRs daily or less frequently * do or don't reuse existing PRs * merge ASAP or let them sit for multiple days --- requirements-iree-pinned.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements-iree-pinned.txt b/requirements-iree-pinned.txt index 6895fa7d4..fa7b1fe05 100644 --- a/requirements-iree-pinned.txt +++ b/requirements-iree-pinned.txt @@ -3,6 +3,6 @@ # Keep these versions synced with SHORTFIN_IREE_GIT_TAG in shortfin/CMakeLists.txt --pre --find-links https://iree.dev/pip-release-links.html -iree-base-compiler==3.1.0rc20250107 -iree-base-runtime==3.1.0rc20250107 -iree-turbine==3.1.0rc20250107 +iree-base-compiler==3.2.0rc20250109 +iree-base-runtime==3.2.0rc20250109 +iree-turbine==3.2.0rc20250109 From 35ad7d02234b24e61bff6927affcf859b86eff06 Mon Sep 17 00:00:00 2001 From: Max191 <44243577+Max191@users.noreply.github.com> Date: Thu, 9 Jan 2025 13:32:59 -0500 Subject: [PATCH 28/35] [tuner] Filter out non finite benchmark times (#799) This PR fixes a bug where math.inf benchmark times can be selected as the top candidates. Any non finite times are now filtered out before selecting top candidates. 
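In essence (a simplified sketch, not the full `select_best_benchmark_results`
logic with per-device baselines):

```python
import math

def pick_top_candidates(times_by_id: dict[int, float], k: int) -> list[int]:
    # Drop failed or timed-out candidates whose recorded time is inf/NaN.
    finite = {cid: t for cid, t in times_by_id.items() if math.isfinite(t)}
    return sorted(finite, key=finite.get)[:k]

# Candidate 2 timed out, so it can no longer be selected as a top candidate.
print(pick_top_candidates({1: 0.5, 2: math.inf, 3: 0.2}, k=2))  # -> [3, 1]
```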
--------- Signed-off-by: Max Dawkins --- tuner/tuner/libtuner.py | 89 +++++++++++++++++++++++++++--------- tuner/tuner/libtuner_test.py | 56 +++++++++++++++++++++++ 2 files changed, 123 insertions(+), 22 deletions(-) diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py index 63740ee9b..f5a316a65 100644 --- a/tuner/tuner/libtuner.py +++ b/tuner/tuner/libtuner.py @@ -530,9 +530,7 @@ def run_iree_benchmark_module_command(benchmark_pack: BenchmarkPack): ) times = [] - logging.debug(f"candidate {candidate_id} benchmark_results: {benchmark_results}") for benchmark_result in benchmark_results: - logging.debug(f"candidate {candidate_id} benchmark_result: {benchmark_result}") benchmark_name = benchmark_result.benchmark_name # With multiple benchmark results, there will be `real_time_mean`, but # not with single iteration benchmark results, so ignore the mean time @@ -818,6 +816,63 @@ def compile( return compiled_candidates +def select_best_benchmark_results( + candidate_results: list[BenchmarkResult], + baseline_results: list[BenchmarkResult], + num_candidates: Optional[int], +) -> list[BenchmarkResult]: + filtered_candidate_results = [r for r in candidate_results if math.isfinite(r.time)] + if len(filtered_candidate_results) == 0: + logging.error("No successful candidate benchmarks.") + return [] + fallback_baseline_time: Optional[float] = None + filtered_baseline_results: list[BenchmarkResult] = [] + for r in baseline_results: + if math.isfinite(r.time): + filtered_baseline_results.append(r) + fallback_baseline_time = r.time + else: + logging.warning(f"Baseline on device {r.device_id} failed.") + if fallback_baseline_time is None: + logging.warning( + f"All baseline benchmarks failed. Baselines will not be used to select top candidates" + ) + baseline_times_by_device = {} + for r in filtered_baseline_results: + baseline_times_by_device[r.device_id] = r.time + + # Select top candidates + def get_speedup(result: BenchmarkResult) -> float: + if result.device_id in baseline_times_by_device: + return result.time / baseline_times_by_device[result.device_id] + assert fallback_baseline_time is not None, "expected fallback_baseline_time" + return result.time / fallback_baseline_time + + num_top_candidates = len(filtered_candidate_results) + if num_candidates is not None: + num_top_candidates = num_candidates + + # Sort by the speedup over baseline on the same device. If a device failed + # the baseline benchmark, then use the fallback baseline. If there is no + # successful baseline, then the best we can do is to sort by the actual + # time. 
+ sorting_key = get_speedup + if fallback_baseline_time is None: + sorting_key = lambda result: result.time + best_results = sorted(filtered_candidate_results, key=sorting_key)[ + :num_top_candidates + ] + logging.info(f"Selected top[{len(best_results)}]:") + + for r in best_results: + if fallback_baseline_time is not None: + speedup = f"{round(get_speedup(r) * 100, 2)}% of baseline" + else: + speedup = "baseline unavailable" + logging.info(f"Candidate {r.candidate_id} time: {r.time} ({speedup})") + return best_results + + def benchmark( args: argparse.Namespace, path_config: PathConfig, @@ -827,6 +882,9 @@ def benchmark( num_candidates: Optional[int] = None, ): logging.debug("benchmark()") + if len(compiled_candidates) == 0: + logging.warning("No candidates to benchmark.") + return [] task_list = [ BenchmarkPack( @@ -838,7 +896,7 @@ def benchmark( if i != 0 ] worker_context_queue = create_worker_context_queue(args.devices) - candidate_results = multiprocess_progress_wrapper( + candidate_results: list[BenchmarkResult] = multiprocess_progress_wrapper( num_worker=len(args.devices), task_list=task_list, function=run_iree_benchmark_module_command, @@ -855,32 +913,19 @@ def benchmark( candidate_tracker=candidate_trackers[0], ) ] * len(args.devices) - baseline_results = multiprocess_progress_wrapper( + baseline_results: list[BenchmarkResult] = multiprocess_progress_wrapper( num_worker=len(args.devices), task_list=baseline_task_list, function=run_iree_benchmark_module_command, initializer=init_worker_context, initializer_inputs=(worker_context_queue,), ) - baseline_times_by_device = {} - for r in baseline_results: - baseline_times_by_device[r.device_id] = r.time - # Select top candidates - def get_speedup(result: BenchmarkResult) -> float: - return result.time / baseline_times_by_device[result.device_id] - - num_top_candidates = len(candidate_results) - if num_candidates is not None: - num_top_candidates = num_candidates - best_results = sorted(candidate_results, key=get_speedup)[:num_top_candidates] - logging.info(f"Selected top[{len(best_results)}]:") - - for r in best_results: - speedup = round(get_speedup(r) * 100, 2) - logging.info( - f"Candidate {r.candidate_id} time: {r.time} ({speedup}% of baseline)" - ) + best_results: list[BenchmarkResult] = select_best_benchmark_results( + candidate_results=candidate_results, + baseline_results=baseline_results, + num_candidates=num_candidates, + ) top_candidates = [result.candidate_id for result in best_results] return top_candidates diff --git a/tuner/tuner/libtuner_test.py b/tuner/tuner/libtuner_test.py index 767a6aff4..cad57a3cd 100644 --- a/tuner/tuner/libtuner_test.py +++ b/tuner/tuner/libtuner_test.py @@ -5,6 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import argparse +import math import pytest import json from subprocess import CompletedProcess @@ -175,5 +176,60 @@ def test_validate_devices_with_invalid_device() -> None: assert expected_call in mock_handle_error.call_args_list +def test_select_best_benchmark_results() -> None: + candidate_results = [ + libtuner.BenchmarkResult(1, 0.5, "hip://0"), + libtuner.BenchmarkResult(2, 0.3, "hip://1"), + libtuner.BenchmarkResult(3, 0.2, "hip://2"), + libtuner.BenchmarkResult(4, 0.1, "hip://3"), + ] + baseline_results = [ + libtuner.BenchmarkResult(0, 1.0, "hip://0"), + libtuner.BenchmarkResult(0, 0.1, "hip://1"), + libtuner.BenchmarkResult(0, 0.1, "hip://2"), + libtuner.BenchmarkResult(0, 0.1, "hip://3"), + ] + best_results: list[ + libtuner.BenchmarkResult + ] = 
libtuner.select_best_benchmark_results( + candidate_results=candidate_results, + baseline_results=baseline_results, + num_candidates=3, + ) + assert best_results[0].candidate_id == 1 + assert best_results[1].candidate_id == 4 + assert best_results[2].candidate_id == 3 + + baseline_results = [ + libtuner.BenchmarkResult(0, math.inf, "hip://0"), + libtuner.BenchmarkResult(0, 0.1, "hip://1"), + libtuner.BenchmarkResult(0, 0.1, "hip://2"), + libtuner.BenchmarkResult(0, 0.1, "hip://3"), + ] + best_results = libtuner.select_best_benchmark_results( + candidate_results=candidate_results, + baseline_results=baseline_results, + num_candidates=3, + ) + assert best_results[0].candidate_id == 4 + assert best_results[1].candidate_id == 3 + assert best_results[2].candidate_id == 2 + + baseline_results = [ + libtuner.BenchmarkResult(0, math.inf, "hip://0"), + libtuner.BenchmarkResult(0, math.inf, "hip://1"), + libtuner.BenchmarkResult(0, math.inf, "hip://2"), + libtuner.BenchmarkResult(0, math.inf, "hip://3"), + ] + best_results = libtuner.select_best_benchmark_results( + candidate_results=candidate_results, + baseline_results=baseline_results, + num_candidates=3, + ) + assert best_results[0].candidate_id == 4 + assert best_results[1].candidate_id == 3 + assert best_results[2].candidate_id == 2 + + def test_enum_collision(): from iree.compiler.dialects import linalg, vector, iree_gpu, iree_codegen, iree_input # type: ignore From 64359b4e35112f3690bf33b766fb654fbf6c3443 Mon Sep 17 00:00:00 2001 From: Marius Brehler Date: Thu, 9 Jan 2025 20:39:28 +0100 Subject: [PATCH 29/35] Drop unneeded pip flags and switch torch channel (#792) Drops `--pre` and `--upgrade` where dependencies are pinned to specific versions. Further switches to unstall torch cpu form the stable release channel. --- .github/workflows/ci-sglang-benchmark.yml | 2 +- pytorch-cpu-requirements.txt | 3 +-- requirements-iree-pinned.txt | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml index c958c1f9c..111535712 100644 --- a/.github/workflows/ci-sglang-benchmark.yml +++ b/.github/workflows/ci-sglang-benchmark.yml @@ -69,7 +69,7 @@ jobs: pip install --no-compile -r requirements.txt -e sharktank/ shortfin/ # Pin to known-working versions. - pip install -f https://iree.dev/pip-release-links.html --pre --upgrade \ + pip install -f https://iree.dev/pip-release-links.html iree-base-compiler==3.1.0rc20241220 \ iree-base-runtime==3.1.0rc20241220 \ "numpy<2.0" diff --git a/pytorch-cpu-requirements.txt b/pytorch-cpu-requirements.txt index aae0297db..4447d2385 100644 --- a/pytorch-cpu-requirements.txt +++ b/pytorch-cpu-requirements.txt @@ -1,3 +1,2 @@ ---pre ---index-url https://download.pytorch.org/whl/test/cpu +--index-url https://download.pytorch.org/whl/cpu/ torch==2.3.0 diff --git a/requirements-iree-pinned.txt b/requirements-iree-pinned.txt index fa7b1fe05..a721d7ebb 100644 --- a/requirements-iree-pinned.txt +++ b/requirements-iree-pinned.txt @@ -1,7 +1,6 @@ # Pinned versions of IREE dependencies. 
# Keep these versions synced with SHORTFIN_IREE_GIT_TAG in shortfin/CMakeLists.txt ---pre --find-links https://iree.dev/pip-release-links.html iree-base-compiler==3.2.0rc20250109 iree-base-runtime==3.2.0rc20250109 From 45322d432e14cd126234909f0633e89ded254fc9 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Thu, 9 Jan 2025 16:05:16 -0500 Subject: [PATCH 30/35] [Tuner] Fix handling of compilation failures (#807) Also misc fixes for logging --- tuner/tuner/libtuner.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py index f5a316a65..8c5b15761 100644 --- a/tuner/tuner/libtuner.py +++ b/tuner/tuner/libtuner.py @@ -466,7 +466,11 @@ def run_iree_compile_command(compile_pack: CompilePack) -> Optional[int]: timeout_seconds=compile_pack.iree_compile_timeout, ) ) - if result.process_res is None or result.is_timeout: + + # We need to check if the output vmfb exists as iree-compile returns a success + # status code when crash reproducers are dumped. + output_vmfb_exists = candidate_tracker.compiled_vmfb_path.is_file() + if result.process_res is None or result.is_timeout or not output_vmfb_exists: return None return candidate_tracker.candidate_id @@ -520,7 +524,7 @@ def run_iree_benchmark_module_command(benchmark_pack: BenchmarkPack): **extra_flags, ) except ireert.benchmark.BenchmarkTimeoutError as e: - logging.warning( + logging.info( f"Benchmark of candidate {candidate_id} timed out after {timeout} seconds." ) return BenchmarkResult( @@ -557,7 +561,9 @@ def run_iree_benchmark_module_command(benchmark_pack: BenchmarkPack): ) mean_benchmark_time = sum(times) / float(len(times)) - logging.debug(f"Benchmark time of candidate {candidate_id}: {mean_benchmark_time}") + logging.debug( + f"Benchmark time of candidate {candidate_id}: {mean_benchmark_time:.2f}" + ) return BenchmarkResult( candidate_id=candidate_id, time=mean_benchmark_time, From b920696ed393fc887245bef3d710f2dcb20b65eb Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Thu, 9 Jan 2025 20:32:02 -0500 Subject: [PATCH 31/35] [Tuner] Fix large model benchmarking (#808) * Add model-specific benchmark timeout * Fix benchmark argument parsing to allow for `=` in command line argument values * Don't print candidate trackers at the very end (too much noise) * Always promote operands --- tuner/examples/simple/simple_tuner.py | 11 +++++------ tuner/tuner/dispatch_constraints.py | 3 ++- tuner/tuner/libtuner.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tuner/examples/simple/simple_tuner.py b/tuner/examples/simple/simple_tuner.py index d4ec089fe..3b8af79f9 100644 --- a/tuner/examples/simple/simple_tuner.py +++ b/tuner/examples/simple/simple_tuner.py @@ -15,7 +15,8 @@ def __init__(self, tuner_context: libtuner.TunerContext): super().__init__(tuner_context) self.compile_flags: list[str] = [] self.benchmark_flags: list[str] = [] - self.compile_timeout: int = 10 + self.compile_timeout: int = 16 + self.benchmark_timeout: int = 16 def get_iree_compile_flags(self) -> list[str]: return self.compile_flags @@ -27,7 +28,7 @@ def get_iree_benchmark_module_flags(self) -> list[str]: return self.benchmark_flags def get_benchmark_timeout_s(self) -> int: - return 10 + return self.benchmark_timeout def read_flags_file(flags_file: str) -> list[str]: @@ -127,7 +128,7 @@ def main(): print("Compiling models with top candidates...") simple_tuner.compile_flags = compile_flags - simple_tuner.compile_timeout = 60 + simple_tuner.compile_timeout = 120 
compiled_model_candidates = libtuner.compile( args, path_config, @@ -141,6 +142,7 @@ def main(): print("Benchmarking compiled model candidates...") simple_tuner.benchmark_flags = model_benchmark_flags + simple_tuner.benchmark_timeout = 60 top_model_candidates = libtuner.benchmark( args, path_config, @@ -154,6 +156,3 @@ def main(): print("Check the detailed execution logs in:") print(path_config.run_log.resolve()) - - for candidate in candidate_trackers: - libtuner.logging.debug(candidate) diff --git a/tuner/tuner/dispatch_constraints.py b/tuner/tuner/dispatch_constraints.py index b41d808b9..df7151002 100644 --- a/tuner/tuner/dispatch_constraints.py +++ b/tuner/tuner/dispatch_constraints.py @@ -309,10 +309,11 @@ def generate_compilation_infos( "reduction": reduction_tile_sizes, "subgroup_m_count": subgroup_m_count, "subgroup_n_count": subgroup_n_count, + "promote_operands": [0, 1], } if codegen_pipeline == iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse: lowering_config_args["subgroup"] = subgroup_tile_sizes - lowering_config_args["promote_operands"] = [0, 1] + lowering_config = get_lowering_config(**lowering_config_args) # Create the TranslationInfoAttr diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py index 8c5b15761..9942187ec 100644 --- a/tuner/tuner/libtuner.py +++ b/tuner/tuner/libtuner.py @@ -497,10 +497,10 @@ def run_iree_benchmark_module_command(benchmark_pack: BenchmarkPack): assert flag[:2] == "--", "iree_benchmark_module_flags should begin with '--'" split_key_value = flag[2:].split("=") assert ( - len(split_key_value) == 2 + len(split_key_value) >= 1 ), "iree_benchmark_module_flags should have the format --=" key = split_key_value[0] - value = split_key_value[1] + value = "=".join(split_key_value[1:]) # Allow the tuning client to pass `--function=@func_name`. if key == "function": func_name = value From 371ef6e8b9ae81d842098368e337c8cb17bf76d9 Mon Sep 17 00:00:00 2001 From: Bangtian Liu Date: Thu, 9 Jan 2025 21:37:22 -0500 Subject: [PATCH 32/35] [tuner] reduce log file size (#809) This PR aims to the log size detailed in: https://github.com/nod-ai/shark-ai/issues/806 - Avoid printing the stdout and stderr of running the command - Reduce the precision of fp constants printed Signed-off-by: Bangtian Liu --- tuner/tuner/candidate_gen.py | 5 ----- tuner/tuner/libtuner.py | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/tuner/tuner/candidate_gen.py b/tuner/tuner/candidate_gen.py index a1ee421d0..b3cd31079 100644 --- a/tuner/tuner/candidate_gen.py +++ b/tuner/tuner/candidate_gen.py @@ -237,11 +237,6 @@ def run_command(run_pack: RunPack) -> RunResult: text=True, timeout=timeout_seconds, ) - - if result.stdout: - logging.debug(f"stdout: {result.stdout}") - if result.stderr: - logging.debug(f"stderr: {result.stderr}") except subprocess.TimeoutExpired as e: logging.warning( f"Command '{command_str}' timed out after {timeout_seconds} seconds." diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py index 9942187ec..cdd589022 100644 --- a/tuner/tuner/libtuner.py +++ b/tuner/tuner/libtuner.py @@ -803,7 +803,7 @@ def compile( compiled_candidates = [c for c in compiled_candidates if c is not None] success_rate = float(len(compiled_candidates)) / float(len(candidates)) logging.info( - f"Successfully compiled [{len(compiled_candidates)}] candidates. Success rate: {success_rate}" + f"Successfully compiled [{len(compiled_candidates)}] candidates. Success rate: {success_rate:.2f}" ) # Remove duplicate vmfbs from the candidate list. 
@@ -875,7 +875,7 @@ def get_speedup(result: BenchmarkResult) -> float: speedup = f"{round(get_speedup(r) * 100, 2)}% of baseline" else: speedup = "baseline unavailable" - logging.info(f"Candidate {r.candidate_id} time: {r.time} ({speedup})") + logging.info(f"Candidate {r.candidate_id} time: {r.time:.2f} ({speedup})") return best_results From 8de90a540f4a9e411fd50658f20f289d54b862a5 Mon Sep 17 00:00:00 2001 From: "Xida Ren (Cedar)" Date: Thu, 9 Jan 2025 23:17:28 -0500 Subject: [PATCH 33/35] Convert `ci-shark-ai.yml` to use `pkgci_shark_ai.yml` so that we only build packages once (#780) This builds on #625, #589 to make progress on issue #584. This adds a pkgci.yml to run multiple package-based CI tasks after building package using Scott's changes in #667. This gives us the following benefits: * Integration test workflows are faster because they now use dev packages, without needing to build them from source or use editable installs. Also, if more integration tests are added, they can reuse the built packages. * Users and developers can access the same dev packages to reproduce CI results * Only one runner needs the build requirements (potentially including clang, ninja, CMake, Rust, etc.), other runners only need Python. This also switches to using uv to create venvs, which is faster. This PR brings shortfin CPU LLM CI time to roughly half an hour on the mi250 runner to a few seconds of package build (fast due to caching) and around 5 minutes of testing. --------- Co-authored-by: Scott Todd --- .github/workflows/ci-shark-ai.yml | 66 ----- .github/workflows/pkgci.yml | 39 +++ .github/workflows/pkgci_shark_ai.yml | 95 +++++++ build_tools/pkgci/setup_venv.py | 380 +++++++++++++++++++++++++++ pytorch-rocm-requirements.txt | 3 +- 5 files changed, 515 insertions(+), 68 deletions(-) delete mode 100644 .github/workflows/ci-shark-ai.yml create mode 100644 .github/workflows/pkgci.yml create mode 100644 .github/workflows/pkgci_shark_ai.yml create mode 100755 build_tools/pkgci/setup_venv.py diff --git a/.github/workflows/ci-shark-ai.yml b/.github/workflows/ci-shark-ai.yml deleted file mode 100644 index 3957b6d11..000000000 --- a/.github/workflows/ci-shark-ai.yml +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc. -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -name: CI - shark-ai - -on: - workflow_dispatch: - pull_request: - push: - branches: - - main - -concurrency: - # A PR number if a pull request and otherwise the commit hash. This cancels - # queued and in-progress runs for the same PR (presubmit) or commit - # (postsubmit). The workflow name is prepended to avoid conflicts between - # different workflows. 
- group: ${{ github.workflow }}-${{ github.event.number || github.sha }} - cancel-in-progress: true - -jobs: - test_shortfin_llm_server: - name: "Integration Tests - Shortfin LLM Server" - strategy: - matrix: - version: [3.11] - fail-fast: false - runs-on: nodai-amdgpu-mi250-x86-64 - defaults: - run: - shell: bash - env: - VENV_DIR: ${{ github.workspace }}/.venv - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: "Setting up Python" - id: setup_python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{matrix.version}} - - name: Create Python venv - run: python -m venv ${VENV_DIR} - - - name: Install pip deps - run: | - source ${VENV_DIR}/bin/activate - python -m pip install --no-compile --upgrade pip - - # Note: We install in three steps in order to satisfy requirements - # from non default locations first. - pip install --no-compile -r pytorch-cpu-requirements.txt - pip install -r requirements-iree-pinned.txt - pip install --no-compile \ - -r requirements.txt \ - -e sharktank/ shortfin/ - - pip freeze - - - name: Run LLM Integration Tests - run: | - source ${VENV_DIR}/bin/activate - pytest -v -s app_tests/integration_tests/llm/shortfin --log-cli-level=INFO diff --git a/.github/workflows/pkgci.yml b/.github/workflows/pkgci.yml new file mode 100644 index 000000000..9b1b50033 --- /dev/null +++ b/.github/workflows/pkgci.yml @@ -0,0 +1,39 @@ +# Copyright 2024 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +name: PkgCI + +on: + workflow_dispatch: + pull_request: + push: + branches: + - main + +permissions: + contents: read + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. + group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + build_packages: + name: Build Packages + uses: ./.github/workflows/build_packages.yml + permissions: + contents: write + with: + build_type: "dev" + + test_shark_ai: + name: Test shark-ai + needs: [build_packages] + uses: ./.github/workflows/pkgci_shark_ai.yml diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml new file mode 100644 index 000000000..d7c040c42 --- /dev/null +++ b/.github/workflows/pkgci_shark_ai.yml @@ -0,0 +1,95 @@ +# Copyright 2024 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +name: PkgCI - shark-ai + +on: + workflow_call: + inputs: + artifact_run_id: + type: string + default: "" + workflow_dispatch: + inputs: + artifact_run_id: + type: string + description: "Id for a workflow run that produced dev packages" + default: "" + +jobs: + test_shortfin_llm_server: + name: "Integration Tests - Shortfin LLM Server" + strategy: + matrix: + version: [3.11] + fail-fast: false + runs-on: mi300x-4 + # runs-on: ubuntu-latest # everything else works but this throws an "out of resources" during model loading + # TODO: make a copy of this that runs on standard runners with tiny llama instead of a 8b model + defaults: + run: + shell: bash + env: + PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages + VENV_DIR: ${{ github.workspace }}/.venv + steps: + - name: "Checkout Code" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: "Setting up Python" + id: setup_python + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{matrix.version}} + + - name: Set Python version without dot + run: | + echo "PY_VERSION_NO_DOT=$(echo ${{ matrix.version }} | tr -d '.')" >> $GITHUB_ENV + + - name: Setup UV caching + run: | + CACHE_DIR="${GITHUB_WORKSPACE}/.uv-cache" + echo "UV_CACHE_DIR=${CACHE_DIR}" >> $GITHUB_ENV + mkdir -p "${CACHE_DIR}" + + - name: Cache UV packages + uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 + with: + path: .uv-cache + key: ${{ runner.os }}-uv-py${{ matrix.version }}-${{ hashFiles('requirements-iree-pinned.txt', 'pytorch-cpu-requirements.txt', 'sharktank/requirements.txt', 'sharktank/requirements-tests.txt', 'shortfin/requirements-tests.txt') }} + + - name: Download sharktank artifacts + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: snapshot-sharktank-linux-x86_64-cp${{ env.PY_VERSION_NO_DOT }}-cp${{ env.PY_VERSION_NO_DOT }} + path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + + - name: Download shortfin artifacts + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: snapshot-shortfin-linux-x86_64-cp${{ env.PY_VERSION_NO_DOT }}-cp${{ env.PY_VERSION_NO_DOT }} + path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + + - name: Download shark-ai artifacts + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: snapshot-shark-ai-linux-x86_64-cp${{ env.PY_VERSION_NO_DOT }}-cp${{ env.PY_VERSION_NO_DOT }} + path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + + - name: Setup venv + run: | + ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ + --artifact-path=${PACKAGE_DOWNLOAD_DIR} \ + --fetch-gh-workflow=${{ inputs.artifact_run_id }} + + - name: Install pinned IREE packages + run: | + source ${VENV_DIR}/bin/activate + uv pip install -r requirements-iree-pinned.txt + + - name: Run LLM Integration Tests + run: | + source ${VENV_DIR}/bin/activate + pytest -v -s app_tests/integration_tests/llm/shortfin --log-cli-level=INFO diff --git a/build_tools/pkgci/setup_venv.py b/build_tools/pkgci/setup_venv.py new file mode 100755 index 000000000..19bc96505 --- /dev/null +++ b/build_tools/pkgci/setup_venv.py @@ -0,0 +1,380 @@ +#!/usr/bin/env python3 +# Copyright 2024 Advanced Micro Devices, Inc. +# Copyright 2023 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +"""Sets up a Python venv with shark-ai packages from a workflow run. + +There are several modes in which to use this script: + +* Within a workflow triggered by `workflow_call`, an artifact action will + typically be used to fetch relevant package artifacts. Specify the fetched + location with `--artifact-path=`: + + ```yml + - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: linux_x86_64_release_packages + path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + - name: Setup venv + run: | + ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ + --artifact-path=${PACKAGE_DOWNLOAD_DIR} + ``` + +* Within a workflow triggered by `workflow_dispatch`, pass `artifact_run_id` as + an input that developers must specify when running the workflow: + + ```yml + on: + workflow_dispatch: + inputs: + artifact_run_id: + type: string + default: "" + + ... + steps: + - name: Setup venv + run: | + ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ + --fetch-gh-workflow=${{ inputs.artifact_run_id }} + ``` + + (Note that these two modes are often combined to allow for workflow testing) + +* Locally, the `--fetch-gh-workflow=WORKFLOW_ID` can be used to download and + setup the venv from a specific workflow run in one step: + + + ```bash + python3.11 ./build_tools/pkgci/setup_venv.py /tmp/.venv --fetch-gh-workflow=12056182052 + ``` + +* Locally, the `--fetch-git-ref=GIT_REF` can be used to download and setup the + venv from the latest workflow run for a given ref (commit) in one step: + + ```bash + python3.11 ./build_tools/pkgci/setup_venv.py /tmp/.venv --fetch-git-ref=main + ``` + +You must have the `gh` command line tool installed and authenticated if you +will be fetching artifacts. +""" + +from glob import glob +from pathlib import Path +from typing import Optional, Dict, Tuple + +import argparse +import functools +import json +import os +import platform +import subprocess +import sys +import tempfile +import zipfile + +THIS_DIR = Path(__file__).parent.resolve() +REPO_ROOT = THIS_DIR.parent.parent + + +def parse_arguments(argv=None): + parser = argparse.ArgumentParser(description="Setup venv") + parser.add_argument( + "venv_dir", type=Path, help="Directory in which to create the venv" + ) + parser.add_argument("--artifact-path", help="Path in which to find/fetch artifacts") + parser.add_argument( + "--packages", + help="Comma-delimited list of packages to install, in order", + default="shark-ai,shortfin,sharktank", + ) + parser.add_argument( + "--install-using-index", + help="The default mode installs with `--no-index` to be sure that only " + "our packages are installed. 
Setting this flag removes that option, " + "more closely matching the behavior that users will see when they " + "install published packages.", + action="store_true", + ) + + fetch_group = parser.add_mutually_exclusive_group() + fetch_group.add_argument( + "--fetch-gh-workflow", help="Fetch artifacts from a GitHub workflow" + ) + fetch_group.add_argument("--fetch-git-ref", help="Fetch artifacts for a git ref") + + args = parser.parse_args(argv) + return args + + +def get_latest_workflow_run_id_for_ref(ref: str) -> int: + print(f"Normalizing ref: {ref}") + normalized_ref = ( + subprocess.check_output(["git", "rev-parse", ref], cwd=REPO_ROOT) + .decode() + .strip() + ) + + print(f"Fetching artifacts for normalized ref: {normalized_ref}") + base_path = f"/repos/nod-ai/shark-ai" + workflow_run_args = [ + "gh", + "api", + "-H", + "Accept: application/vnd.github+json", + "-H", + "X-GitHub-Api-Version: 2022-11-28", + f"{base_path}/actions/workflows/pkgci.yml/runs?head_sha={normalized_ref}", + ] + print(f"Running command to list workflow runs:\n {' '.join(workflow_run_args)}") + workflow_run_output = subprocess.check_output(workflow_run_args) + workflow_run_json_output = json.loads(workflow_run_output) + if workflow_run_json_output["total_count"] == 0: + raise RuntimeError("Workflow did not run at this commit") + + latest_run = workflow_run_json_output["workflow_runs"][-1] + print(f"Found workflow run: {latest_run['html_url']}") + return latest_run["id"] + + +@functools.lru_cache +def list_gh_artifacts(run_id: str) -> Dict[str, str]: + print(f"Fetching artifacts for workflow run {run_id}") + base_path = f"/repos/nod-ai/shark-ai" + output = subprocess.check_output( + [ + "gh", + "api", + "-H", + "Accept: application/vnd.github+json", + "-H", + "X-GitHub-Api-Version: 2022-11-28", + f"{base_path}/actions/runs/{run_id}/artifacts", + ] + ) + data = json.loads(output) + # Uncomment to debug: + # print(json.dumps(data, indent=2)) + artifacts = { + rec["name"]: f"{base_path}/actions/artifacts/{rec['id']}/zip" + for rec in data["artifacts"] + } + print("Found artifacts:") + for k, v in artifacts.items(): + print(f" {k}: {v}") + return artifacts + + +def fetch_gh_artifact(api_path: str, file: Path): + print(f"Downloading artifact {api_path}") + contents = subprocess.check_output( + [ + "gh", + "api", + "-H", + "Accept: application/vnd.github+json", + "-H", + "X-GitHub-Api-Version: 2022-11-28", + api_path, + ] + ) + file.write_bytes(contents) + + +def find_venv_python(venv_path: Path) -> Optional[Path]: + paths = [venv_path / "bin" / "python", venv_path / "Scripts" / "python.exe"] + for p in paths: + if p.exists(): + return p + return None + + +def install_with_index(python_exe, wheels): + # Install each of the built wheels, allowing dependencies and an index. + # Note that --pre pulls in prerelease versions of dependencies too, like + # numpy. We could try a solution like https://stackoverflow.com/a/76124424. + for artifact_path, package_name in wheels: + cmd = [ + "uv", + "pip", + "install", + "--pre", + "-f", + str(artifact_path), + package_name, + "--python", + str(python_exe), + ] + print(f"\nRunning command: {' '.join([str(c) for c in cmd])}") + subprocess.check_call(cmd) + + +def install_without_index(python_exe, packages, wheels): + # Install each of the built wheels without deps or consulting an index. + # This is because we absolutely don't want this falling back to anything + # but what we said. 
+ for artifact_path, package_name in wheels: + cmd = [ + "uv", + "pip", + "install", + "--no-deps", + "--no-index", + "-f", + str(artifact_path), + "--force-reinstall", + package_name, + "--python", + str(python_exe), + ] + print(f"\nRunning command: {' '.join([str(c) for c in cmd])}") + subprocess.check_call(cmd) + + # Install requirements for the requested packages. + # Note that not all of these are included in the package dependencies, but + # developers usually want the test requirements too. + requirements_files = [] + if "sharktank" in packages: + requirements_files.append("sharktank/requirements.txt") + requirements_files.append("sharktank/requirements-tests.txt") + if "shortfin" in packages: + requirements_files.append("shortfin/requirements-tests.txt") + + for requirements_file in requirements_files: + cmd = [ + "uv", + "pip", + "install", + "-r", + str(REPO_ROOT / requirements_file), + "--python", + str(python_exe), + ] + print(f"\nRunning command: {' '.join([str(c) for c in cmd])}") + subprocess.check_call(cmd) + + +def find_wheel(args, artifact_prefix: str, package_name: str) -> Tuple[Path, str]: + artifact_path = Path(args.artifact_path) + + def has_package(): + norm_package_name = package_name.replace("-", "_") + pattern = str(artifact_path / f"{norm_package_name}-*.whl") + files = glob(pattern) + return bool(files) + + if has_package(): + return (artifact_path, package_name) + + if not args.fetch_gh_workflow: + raise RuntimeError( + f"Could not find package {package_name} to install from {artifact_path}" + ) + + # Fetch. + artifact_path.mkdir(parents=True, exist_ok=True) + artifact_name = f"{artifact_prefix}_dev_packages" + artifact_file = artifact_path / f"{artifact_name}.zip" + if not artifact_file.exists(): + print(f"Package {package_name} not found. Fetching from {artifact_name}...") + artifacts = list_gh_artifacts(args.fetch_gh_workflow) + if artifact_name not in artifacts: + raise RuntimeError( + f"Could not find required artifact {artifact_name} in run {args.fetch_gh_workflow}" + ) + fetch_gh_artifact(artifacts[artifact_name], artifact_file) + print(f"Extracting {artifact_file}") + with zipfile.ZipFile(artifact_file) as zip_ref: + zip_ref.extractall(artifact_path) + + # Try again. + if not has_package(): + raise RuntimeError(f"Could not find {package_name} in {artifact_path}") + return (artifact_path, package_name) + + +def main(args): + # Look up the workflow run for a ref. + if args.fetch_git_ref: + latest_gh_workflow = get_latest_workflow_run_id_for_ref(args.fetch_git_ref) + args.fetch_git_ref = "" + args.fetch_gh_workflow = str(latest_gh_workflow) + return main(args) + + # Make sure we have an artifact path if fetching. + if not args.artifact_path and args.fetch_gh_workflow: + with tempfile.TemporaryDirectory() as td: + args.artifact_path = td + return main(args) + + # Parse command-delimited list of packages from args. + packages = args.packages.split(",") + print("Installing packages:", packages) + + artifact_prefix = f"{platform.system().lower()}_{platform.machine()}" + wheels = [] + for package_name in packages: + wheels.append(find_wheel(args, artifact_prefix, package_name)) + print("Installing wheels:", wheels) + + # Set up venv using 'uv' (https://docs.astral.sh/uv/). + # We could use 'pip', but 'uv' is much faster at installing packages. 
+    venv_path = args.venv_dir
+    python_exe = find_venv_python(venv_path)
+
+    if not python_exe:
+        print(f"Creating venv at {str(venv_path)}")
+
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "uv"])
+        subprocess.check_call(
+            ["uv", "venv", str(venv_path), "--python", sys.executable]
+        )
+        python_exe = find_venv_python(venv_path)
+        if not python_exe:
+            raise RuntimeError("Error creating venv")
+
+    # Install the PyTorch CPU wheels first to save multiple minutes and a lot of bandwidth.
+    cmd = [
+        "uv",
+        "pip",
+        "install",
+        "-r",
+        str(REPO_ROOT / "pytorch-cpu-requirements.txt"),
+        "--python",
+        str(python_exe),
+    ]
+    print(f"\nRunning command: {' '.join([str(c) for c in cmd])}")
+    subprocess.check_call(cmd)
+
+    if args.install_using_index:
+        install_with_index(python_exe, wheels)
+    else:
+        install_without_index(python_exe, packages, wheels)
+
+    # Log which packages are installed.
+    print("")
+    print(f"Checking packages with 'uv pip freeze':")
+    subprocess.check_call(
+        [
+            "uv",
+            "pip",
+            "freeze",
+            "--python",
+            str(python_exe),
+        ]
+    )
+
+    print("")
+    print(f"venv setup using uv, activate with:\n source {venv_path}/bin/activate")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(parse_arguments()))
diff --git a/pytorch-rocm-requirements.txt b/pytorch-rocm-requirements.txt
index 85116cfbb..0b1d480f5 100644
--- a/pytorch-rocm-requirements.txt
+++ b/pytorch-rocm-requirements.txt
@@ -1,3 +1,2 @@
---pre
---index-url https://download.pytorch.org/whl/nightly/rocm6.0
+--index-url https://download.pytorch.org/whl/rocm6.2
 torch>=2.3.0

From d02d6e05d68c2d8f0dbda61d68c33c1418bff570 Mon Sep 17 00:00:00 2001
From: Scott Todd
Date: Thu, 9 Jan 2025 20:40:10 -0800
Subject: [PATCH 34/35] Bump IREE runtime pin used in shortfin close to latest. (#803)

This updates to
https://github.com/iree-org/iree/commit/9055c9d1f6342a061a6747ef9b385816b96a0a8f.
Diff: https://github.com/iree-org/iree/compare/iree-3.1.0rc20241220...9055c9d1f6342a061a6747ef9b385816b96a0a8f.

Notably, this pulls in many performance improvements and bug fixes to
https://github.com/iree-org/iree/tree/main/runtime/src/iree/hal/drivers/hip.

See discussion on https://github.com/nod-ai/shark-ai/pull/773 and
https://github.com/nod-ai/shark-ai/pull/802 .
---
 shortfin/CMakeLists.txt | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/shortfin/CMakeLists.txt b/shortfin/CMakeLists.txt
index dbc871591..c6368f2c5 100644
--- a/shortfin/CMakeLists.txt
+++ b/shortfin/CMakeLists.txt
@@ -47,7 +47,10 @@ add_compile_options("$<$:/utf-8>")
 # Prefer to keep the IREE git tag synced with the Python package version in the
 # requirements-iree-pinned.txt file. At a minimum, the compiler from those
 # packages must be compatible with the runtime at this source ref.
-set(SHORTFIN_IREE_GIT_TAG "iree-3.1.0rc20241220")
+# TODO: switch back to iree-3.2.0rcYYYYMMDD style tag matching Python package
+# pin after future nightly releases
+set(SHORTFIN_IREE_GIT_TAG "9055c9d1f6342a061a6747ef9b385816b96a0a8f")
+
 
 # build options
 option(SHORTFIN_BUILD_PYTHON_BINDINGS "Builds Python Bindings" OFF)
@@ -243,7 +246,7 @@ else()
     GIT_REPOSITORY https://github.com/iree-org/iree.git
     GIT_TAG "${SHORTFIN_IREE_GIT_TAG}"
     GIT_SUBMODULES ${IREE_SUBMODULES}
-    GIT_SHALLOW TRUE
+    GIT_SHALLOW FALSE # TODO: switch back to TRUE when SHORTFIN_IREE_GIT_TAG is a tag and not a commit
     SYSTEM
     EXCLUDE_FROM_ALL
 )

From 2edde8d7ffb17f25c19edb83aab9f65267ebe9e3 Mon Sep 17 00:00:00 2001
From: Marius Brehler
Date: Fri, 10 Jan 2025 19:07:44 +0100
Subject: [PATCH 35/35] Fix broken pip install command

---
 .github/workflows/ci-sglang-benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 111535712..3af1ad725 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -69,7 +69,7 @@ jobs:
           pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
 
           # Pin to known-working versions.
-          pip install -f https://iree.dev/pip-release-links.html
+          pip install -f https://iree.dev/pip-release-links.html \
             iree-base-compiler==3.1.0rc20241220 \
             iree-base-runtime==3.1.0rc20241220 \
             "numpy<2.0"
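
A usage note on the pkgci venv script added earlier in this series: the exact CLI surface (script path, positional venv argument, package list default) is not shown in this excerpt, so the sketch below is a hypothetical invocation under those assumptions. Only the `--fetch-git-ref`, `--fetch-gh-workflow`, and `--install-using-index` flags, the `uv`-based install flow, and the activation hint come from the diff itself.

```bash
# Hypothetical script path, venv argument, and workflow run ID; the flags are from the diff above.
# Build a dev venv from the pkgci artifacts of a given git ref:
python build_tools/pkgci/setup_venv.py /tmp/shark-ai.venv \
    --fetch-git-ref=origin/main

# Or fetch artifacts from a specific GitHub Actions workflow run (placeholder ID):
python build_tools/pkgci/setup_venv.py /tmp/shark-ai.venv \
    --fetch-gh-workflow=1234567890

# Activate the venv, as the script's final message suggests:
source /tmp/shark-ai.venv/bin/activate
```

By default the built wheels are installed with `--no-deps --no-index`; passing `--install-using-index` instead lets `uv` resolve dependencies from an index, more closely matching what users of the published packages see.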