AIROptMemtileDmaBDs: Add memtile dma bd generator (#887)

* Fix up stride calculation math when folding for loops; pass in maxSize for wrap-and-stride canonicalization
* Update the MLIR IR test to reflect the optimized DMA BDs generated
* (Test) Revert removing const check for offset[i-1]
* Revert "(Test) Revert removing const check for offset[i-1]"
  This reverts commit 0e4059f.
* Add support for arith addi/muli ops in wrap-and-stride canonicalization
* Add air-opt-memtile-dma-bds pass, which generates AIE memtile DMA BDs from logical air.channel ops
* Create a new affine apply op for each offset composition
* Change foldForLoopNestAsExtendedSizesAndStrides to not directly mutate any existing ops; leave the loop folding and mutation for later, so that the code can fall back
* When tiling an illegal wrap dim, attempt to erase one dummy wrap-and-stride dimension, if any
* Replace uses of air-specialize-channel-wrap-and-stride with air-opt-memtile-dma-bds in xrt tests
Xilinx · Feb 6, 2025 · c0ac177 · c0ac177
1 parent 264fec6
commit c0ac177
Show file tree

Hide file tree

Showing 36 changed files with 494 additions and 122 deletions.
diff --git a/mlir/include/air/Transform/AIRDependencyScheduleOpt.h b/mlir/include/air/Transform/AIRDependencyScheduleOpt.h
@@ -63,6 +63,10 @@ std::unique_ptr<mlir::Pass> createAIROptimizeShimDMABDs();
 std::unique_ptr<Pass>
 createAIROptimizeShimDMABDs(AIROptimizeShimDMABDsOptions options);
 
+std::unique_ptr<mlir::Pass> createAIROptimizeMemtileDMABDs();
+std::unique_ptr<Pass>
+createAIROptimizeMemtileDMABDs(AIROptimizeMemtileDMABDsOptions options);
+
 std::unique_ptr<mlir::Pass> createAIRFuseAllocDealloc();
 
 std::unique_ptr<mlir::Pass> createAIRShrinkMemrefSizesByAccess();
@@ -74,7 +78,7 @@ void populateAIRLoopIndexCanonicalizationPatterns(RewritePatternSet &patterns);
 
 // Apply AIRSpecializeChannelWrapAndStridePattern on region.
 void applyAIRSpecializeChannelWrapAndStridePattern(Region *region,
-                                                   int maxNumDims,
+                                                   int maxNumDims, int maxSize,
                                                    bool enableForLoopUnrolling);
 
 // Populate patterns for fusing scf.for loops within air.launch.

diff --git a/mlir/include/air/Transform/PassDetail.h b/mlir/include/air/Transform/PassDetail.h
@@ -68,6 +68,7 @@ namespace air {
 #define GEN_PASS_DEF_AFFINELOOPOPTPASS
 #define GEN_PASS_DEF_AIRLOOPFUSION
 #define GEN_PASS_DEF_AIROPTIMIZESHIMDMABDS
+#define GEN_PASS_DEF_AIROPTIMIZEMEMTILEDMABDS
 #define GEN_PASS_DEF_AIRFUSEALLOCDEALLOC
 #define GEN_PASS_DEF_AIRSHRINKMEMREFSIZESBYACCESS
 #define GEN_PASS_DEF_AIRSPLITL2MEMREFFORBUFFERCONSTRAINTPASS

diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
@@ -1069,7 +1069,7 @@ def AIRLoopFusion: Pass<"air-loop-fusion", "func::FuncOp"> {
 }
 
 def AIROptimizeShimDMABDs: Pass<"air-opt-shim-dma-bds", "func::FuncOp"> {
-  let summary = "Optimize logical air.channel.put/get op into efficient shim dma block descriptor (BD)";
+  let summary = "Optimize logical air.channel.put/get op into efficient AIE shim dma block descriptor (BD)";
   let constructor = "xilinx::air::createAIROptimizeShimDMABDs()";
   let description = [{
     Optimize the logical data movement by transforming them, represented as air.channel.put/get operations, into explicit representation of physical data movement block descriptors (BDs), also represented as air.channel.put/get operations.
@@ -1081,6 +1081,19 @@ def AIROptimizeShimDMABDs: Pass<"air-opt-shim-dma-bds", "func::FuncOp"> {
   ];
 }
 
+def AIROptimizeMemtileDMABDs: Pass<"air-opt-memtile-dma-bds", "func::FuncOp"> {
+  let summary = "Optimize logical air.channel.put/get op into efficient AIE memtile dma block descriptor (BD)";
+  let constructor = "xilinx::air::createAIROptimizeMemtileDMABDs()";
+  let description = [{
+    Optimize the logical data movement by transforming them, represented as air.channel.put/get operations, into explicit representation of physical data movement block descriptors (BDs), also represented as air.channel.put/get operations.
+  }];
+  let options = [
+    Option<"clDevice", "device", "std::string",
+          /*default=*/"\"xcvc1902\"",
+           "AIE device to target.">,
+  ];
+}
+
 def AIRFuseAllocDealloc: Pass<"air-fuse-alloc-dealloc", "func::FuncOp"> {
   let summary = "Fuse pairs of memref.alloc/dealloc ops into the inner-most region containing all uses of the memref.";
   let constructor = "xilinx::air::createAIRFuseAllocDealloc()";

diff --git a/mlir/include/air/Util/Util.h b/mlir/include/air/Util/Util.h
@@ -160,11 +160,9 @@ LogicalResult foldForLoopNestAsExtendedSizesAndStrides(
     SmallVector<Value> &strides, Value memref);
 
 // Canonicalize wrap and stride lists, by removing redundant dimensions.
-LogicalResult canonicalizeWrapAndStrideList(OpBuilder builder,
-                                            SmallVector<Value> &offsets,
-                                            SmallVector<Value> &sizes,
-                                            SmallVector<Value> &strides,
-                                            int memref_volume);
+LogicalResult canonicalizeWrapAndStrideList(
+    OpBuilder builder, SmallVector<Value> &offsets, SmallVector<Value> &sizes,
+    SmallVector<Value> &strides, int memref_volume, int maxSize = -1);
 
 // If wrap-and-stride lists are empty, populate them with default data access
 // layout (contiguous, row-major).
@@ -278,6 +276,18 @@ bool isRegionEquivalentTo(Region *lhs, Region *rhs);
 // const value equivalences.
 bool isEquivalentTo(Operation *lhs, Operation *rhs);
 
+// Generate composed affine apply op from arith addi op operating on Index
+// values.
+affine::AffineApplyOp
+consructComposedAffineApplyOpFromArithAddI(OpBuilder &builder,
+                                           arith::AddIOp addOp);
+
+// Generate composed affine apply op from arith muli op operating on Index
+// values.
+affine::AffineApplyOp
+consructComposedAffineApplyOpFromArithMulI(OpBuilder &builder,
+                                           arith::MulIOp mulOp);
+
 } // namespace air
 } // namespace xilinx
 

diff --git a/mlir/lib/Conversion/AIRRtToNpuPass.cpp b/mlir/lib/Conversion/AIRRtToNpuPass.cpp
@@ -666,6 +666,22 @@ void tileIllegalWrapDim(airrt::DmaMemcpyNdOp memcpy_op) {
                      builder.create<arith::ConstantOp>(
                          loc, builder.getI64Type(),
                          IntegerAttr::get(builder.getI64Type(), 0)));
+      // Attempt to find one dummy dimension in the wrap-and-stride list and
+      // erase.
+      auto offsetWrapZip = llvm::zip_equal(offsets, wraps);
+      auto it =
+          llvm::find_if(offsetWrapZip, [](std::tuple<Value, Value> entry) {
+            auto off = getConstantIntValue(std::get<0>(entry));
+            auto siz = getConstantIntValue(std::get<1>(entry));
+            return off && siz && *off == 0 && *siz == 1;
+          });
+      if (it != offsetWrapZip.end()) {
+        offsets.erase(offsets.begin() +
+                      std::distance(offsetWrapZip.begin(), it));
+        wraps.erase(wraps.begin() + std::distance(offsetWrapZip.begin(), it));
+        strides.erase(strides.begin() +
+                      std::distance(offsetWrapZip.begin(), it));
+      }
       i++;
     }
   }
@@ -674,51 +690,48 @@ void tileIllegalWrapDim(airrt::DmaMemcpyNdOp memcpy_op) {
   // goes beyond 4.
   SmallVector<affine::AffineForOp> for_loop_nest;
   Value inner_affine_for_iv = nullptr;
-  if (wraps.size() > AIE2_DIM_COUNT) {
+  while (wraps.size() > AIE2_DIM_COUNT) {
     affine::AffineForOp inner_affine_for = nullptr;
-    while (wraps.size() > AIE2_DIM_COUNT) {
-      auto const_offset = *getConstantIntValue(offsets[0]);
-      auto const_lowest_offset = *getConstantIntValue(offsets.back());
-      auto const_wrap = *getConstantIntValue(wraps[0]);
-      auto const_stride = *getConstantIntValue(strides[0]);
-
-      // Convert the outer dimension into an affine.for loop.
-      int const_lower_bound =
-          const_stride ? (const_offset * const_stride + const_lowest_offset)
-                       : 0;
-      auto const_upper_bound =
-          const_stride ? (const_offset * const_stride +
-                          const_wrap * const_stride + const_lowest_offset)
-                       : const_wrap;
-      int const_step = const_stride ? const_stride : 1;
-      auto new_for_op =
-          (inner_affine_for_iv)
-              ? (builder.create<affine::AffineForOp>(
-                    loc,
-                    SmallVector<Value>{builder.create<arith::AddIOp>(
-                        loc, inner_affine_for_iv,
-                        builder.create<arith::ConstantIndexOp>(
-                            loc, const_lower_bound))},
-                    AffineMap::get(ctx),
-                    SmallVector<Value>{builder.create<arith::AddIOp>(
-                        loc, inner_affine_for_iv,
-                        builder.create<arith::ConstantIndexOp>(
-                            loc, const_upper_bound))},
-                    AffineMap::get(ctx), const_step))
-              : (builder.create<affine::AffineForOp>(
-                    loc, const_lower_bound, const_upper_bound, const_step));
-      for_loop_nest.push_back(new_for_op);
-      inner_affine_for = new_for_op;
-
-      // Pop front.
-      offsets.erase(offsets.begin());
-      wraps.erase(wraps.begin());
-      strides.erase(strides.begin());
-
-      builder.setInsertionPointToStart(inner_affine_for.getBody());
-      if (const_stride)
-        inner_affine_for_iv = inner_affine_for.getInductionVar();
-    }
+    auto const_offset = *getConstantIntValue(offsets[0]);
+    auto const_lowest_offset = *getConstantIntValue(offsets.back());
+    auto const_wrap = *getConstantIntValue(wraps[0]);
+    auto const_stride = *getConstantIntValue(strides[0]);
+
+    // Convert the outer dimension into an affine.for loop.
+    int const_lower_bound =
+        const_stride ? (const_offset * const_stride + const_lowest_offset) : 0;
+    auto const_upper_bound =
+        const_stride ? (const_offset * const_stride +
+                        const_wrap * const_stride + const_lowest_offset)
+                     : const_wrap;
+    int const_step = const_stride ? const_stride : 1;
+    auto new_for_op =
+        (inner_affine_for_iv)
+            ? (builder.create<affine::AffineForOp>(
+                  loc,
+                  SmallVector<Value>{builder.create<arith::AddIOp>(
+                      loc, inner_affine_for_iv,
+                      builder.create<arith::ConstantIndexOp>(
+                          loc, const_lower_bound))},
+                  AffineMap::get(ctx),
+                  SmallVector<Value>{builder.create<arith::AddIOp>(
+                      loc, inner_affine_for_iv,
+                      builder.create<arith::ConstantIndexOp>(
+                          loc, const_upper_bound))},
+                  AffineMap::get(ctx), const_step))
+            : (builder.create<affine::AffineForOp>(
+                  loc, const_lower_bound, const_upper_bound, const_step));
+    for_loop_nest.push_back(new_for_op);
+    inner_affine_for = new_for_op;
+
+    // Pop front.
+    offsets.erase(offsets.begin());
+    wraps.erase(wraps.begin());
+    strides.erase(strides.begin());
+
+    builder.setInsertionPointToStart(inner_affine_for.getBody());
+    if (const_stride)
+      inner_affine_for_iv = inner_affine_for.getInductionVar();
   }
 
   // Stride field implicit last element one, pop.
@@ -869,6 +882,9 @@ struct AIRSpecializeAIRRtDmaWrapAndStrideInAffineFor
         tmp = strides[0];
         strides[0] = strides[i];
         strides[i] = tmp;
+        tmp = offsets[0];
+        offsets[0] = offsets[i];
+        offsets[i] = tmp;
       } else {
         (void)loopUnrollFull(for_op);
         return success();
@@ -915,6 +931,9 @@ struct AIRSpecializeAIRRtDmaWrapAndStrideInAffineFor
         loc, tys, air::lookupOrDefaultRange(opers, remap));
     new_dma->setAttrs(memcpy_op->getDiscardableAttrDictionary());
 
+    rewriter.replaceAllUsesWith(for_op.getInductionVar(),
+                                rewriter.create<arith::ConstantIndexOp>(
+                                    loc, for_op.getConstantLowerBound()));
     rewriter.eraseOp(for_op.getOperation());
 
     return success();