Skip to content

Commit

Permalink
AIROptMemtileDmaBDs: Add memtile dma bd generator (#887)
Browse files Browse the repository at this point in the history
* Fixup stride calculation math when folding for loops; pass in maxSize for wrap-and-stride canonicalization

* Update the mlir ir test to reflect the optimized dma bds generated

* (Test) Revert removing const check for offset[i-1]

* Revert "(Test) Revert removing const check for offset[i-1]"

This reverts commit 0e4059f.

* Add support for arith addi/muli ops in wrap-and-stride canonicalization

* Add air-opt-memtile-dma-bds pass which generates AIE memtile dma bds from logical air.channel ops

* Create a new affine apply op for each offset composition

* Change foldForLoopNestAsExtendedSizesAndStrides so that it does not directly mutate any existing ops; defer the loop folding and mutation until later, so that the code can fall back

* When tiling illegal wrap dim, attempt to erase one dummy wrap-and-stride dimension, if any

* Replace uses of air-specialize-channel-wrap-and-stride with air-opt-memtile-dma-bds in xrt tests
  • Loading branch information
erwei-xilinx authored Feb 6, 2025
1 parent 264fec6 commit c0ac177
Show file tree
Hide file tree
Showing 36 changed files with 494 additions and 122 deletions.
6 changes: 5 additions & 1 deletion mlir/include/air/Transform/AIRDependencyScheduleOpt.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ std::unique_ptr<mlir::Pass> createAIROptimizeShimDMABDs();
std::unique_ptr<Pass>
createAIROptimizeShimDMABDs(AIROptimizeShimDMABDsOptions options);

// Factory functions for the memtile DMA BD optimization pass. The second
// overload takes an explicit AIROptimizeMemtileDMABDsOptions; the first takes
// none (presumably falling back to default options; confirm in the
// definition).
std::unique_ptr<mlir::Pass> createAIROptimizeMemtileDMABDs();
std::unique_ptr<Pass>
createAIROptimizeMemtileDMABDs(AIROptimizeMemtileDMABDsOptions options);

std::unique_ptr<mlir::Pass> createAIRFuseAllocDealloc();

std::unique_ptr<mlir::Pass> createAIRShrinkMemrefSizesByAccess();
Expand All @@ -74,7 +78,7 @@ void populateAIRLoopIndexCanonicalizationPatterns(RewritePatternSet &patterns);

// Apply AIRSpecializeChannelWrapAndStridePattern on region.
void applyAIRSpecializeChannelWrapAndStridePattern(Region *region,
int maxNumDims,
int maxNumDims, int maxSize,
bool enableForLoopUnrolling);

// Populate patterns for fusing scf.for loops within air.launch.
Expand Down
1 change: 1 addition & 0 deletions mlir/include/air/Transform/PassDetail.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ namespace air {
#define GEN_PASS_DEF_AFFINELOOPOPTPASS
#define GEN_PASS_DEF_AIRLOOPFUSION
#define GEN_PASS_DEF_AIROPTIMIZESHIMDMABDS
#define GEN_PASS_DEF_AIROPTIMIZEMEMTILEDMABDS
#define GEN_PASS_DEF_AIRFUSEALLOCDEALLOC
#define GEN_PASS_DEF_AIRSHRINKMEMREFSIZESBYACCESS
#define GEN_PASS_DEF_AIRSPLITL2MEMREFFORBUFFERCONSTRAINTPASS
Expand Down
15 changes: 14 additions & 1 deletion mlir/include/air/Transform/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -1069,7 +1069,7 @@ def AIRLoopFusion: Pass<"air-loop-fusion", "func::FuncOp"> {
}

def AIROptimizeShimDMABDs: Pass<"air-opt-shim-dma-bds", "func::FuncOp"> {
let summary = "Optimize logical air.channel.put/get op into efficient shim dma block descriptor (BD)";
let summary = "Optimize logical air.channel.put/get op into efficient AIE shim dma block descriptor (BD)";
let constructor = "xilinx::air::createAIROptimizeShimDMABDs()";
let description = [{
Optimize the logical data movement by transforming them, represented as air.channel.put/get operations, into explicit representation of physical data movement block descriptors (BDs), also represented as air.channel.put/get operations.
Expand All @@ -1081,6 +1081,19 @@ def AIROptimizeShimDMABDs: Pass<"air-opt-shim-dma-bds", "func::FuncOp"> {
];
}

// Declares the `air-opt-memtile-dma-bds` pass, anchored on func::FuncOp.
// It is the memtile counterpart of `air-opt-shim-dma-bds` and takes the same
// style of `device` option.
def AIROptimizeMemtileDMABDs: Pass<"air-opt-memtile-dma-bds", "func::FuncOp"> {
let summary = "Optimize logical air.channel.put/get op into efficient AIE memtile dma block descriptor (BD)";
// Default (option-less) factory used to instantiate the pass.
let constructor = "xilinx::air::createAIROptimizeMemtileDMABDs()";
let description = [{
Optimize the logical data movement by transforming them, represented as air.channel.put/get operations, into explicit representation of physical data movement block descriptors (BDs), also represented as air.channel.put/get operations.
}];
// `device=<name>` selects the AIE device to target; it is stored in the
// `clDevice` member and defaults to "xcvc1902".
let options = [
Option<"clDevice", "device", "std::string",
/*default=*/"\"xcvc1902\"",
"AIE device to target.">,
];
}

def AIRFuseAllocDealloc: Pass<"air-fuse-alloc-dealloc", "func::FuncOp"> {
let summary = "Fuse pairs of memref.alloc/dealloc ops into the inner-most region containing all uses of the memref.";
let constructor = "xilinx::air::createAIRFuseAllocDealloc()";
Expand Down
20 changes: 15 additions & 5 deletions mlir/include/air/Util/Util.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,9 @@ LogicalResult foldForLoopNestAsExtendedSizesAndStrides(
SmallVector<Value> &strides, Value memref);

// Canonicalize wrap and stride lists, by removing redundant dimensions.
LogicalResult canonicalizeWrapAndStrideList(OpBuilder builder,
SmallVector<Value> &offsets,
SmallVector<Value> &sizes,
SmallVector<Value> &strides,
int memref_volume);
LogicalResult canonicalizeWrapAndStrideList(
OpBuilder builder, SmallVector<Value> &offsets, SmallVector<Value> &sizes,
SmallVector<Value> &strides, int memref_volume, int maxSize = -1);

// If wrap-and-stride lists are empty, populate them with default data access
// layout (contiguous, row-major).
Expand Down Expand Up @@ -278,6 +276,18 @@ bool isRegionEquivalentTo(Region *lhs, Region *rhs);
// const value equivalences.
bool isEquivalentTo(Operation *lhs, Operation *rhs);

// Generate composed affine apply op from arith addi op operating on Index
// values.
affine::AffineApplyOp
consructComposedAffineApplyOpFromArithAddI(OpBuilder &builder,
arith::AddIOp addOp);

// Generate composed affine apply op from arith muli op operating on Index
// values.
affine::AffineApplyOp
consructComposedAffineApplyOpFromArithMulI(OpBuilder &builder,
arith::MulIOp mulOp);

} // namespace air
} // namespace xilinx

Expand Down
107 changes: 63 additions & 44 deletions mlir/lib/Conversion/AIRRtToNpuPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,22 @@ void tileIllegalWrapDim(airrt::DmaMemcpyNdOp memcpy_op) {
builder.create<arith::ConstantOp>(
loc, builder.getI64Type(),
IntegerAttr::get(builder.getI64Type(), 0)));
// Attempt to find one dummy dimension (constant offset 0 and wrap 1) in the
// wrap-and-stride list and erase it.
auto offsetWrapZip = llvm::zip_equal(offsets, wraps);
auto it =
llvm::find_if(offsetWrapZip, [](std::tuple<Value, Value> entry) {
auto off = getConstantIntValue(std::get<0>(entry));
auto siz = getConstantIntValue(std::get<1>(entry));
return off && siz && *off == 0 && *siz == 1;
});
if (it != offsetWrapZip.end()) {
offsets.erase(offsets.begin() +
std::distance(offsetWrapZip.begin(), it));
wraps.erase(wraps.begin() + std::distance(offsetWrapZip.begin(), it));
strides.erase(strides.begin() +
std::distance(offsetWrapZip.begin(), it));
}
i++;
}
}
Expand All @@ -674,51 +690,48 @@ void tileIllegalWrapDim(airrt::DmaMemcpyNdOp memcpy_op) {
// goes beyond 4.
SmallVector<affine::AffineForOp> for_loop_nest;
Value inner_affine_for_iv = nullptr;
if (wraps.size() > AIE2_DIM_COUNT) {
while (wraps.size() > AIE2_DIM_COUNT) {
affine::AffineForOp inner_affine_for = nullptr;
while (wraps.size() > AIE2_DIM_COUNT) {
auto const_offset = *getConstantIntValue(offsets[0]);
auto const_lowest_offset = *getConstantIntValue(offsets.back());
auto const_wrap = *getConstantIntValue(wraps[0]);
auto const_stride = *getConstantIntValue(strides[0]);

// Convert the outer dimension into an affine.for loop.
int const_lower_bound =
const_stride ? (const_offset * const_stride + const_lowest_offset)
: 0;
auto const_upper_bound =
const_stride ? (const_offset * const_stride +
const_wrap * const_stride + const_lowest_offset)
: const_wrap;
int const_step = const_stride ? const_stride : 1;
auto new_for_op =
(inner_affine_for_iv)
? (builder.create<affine::AffineForOp>(
loc,
SmallVector<Value>{builder.create<arith::AddIOp>(
loc, inner_affine_for_iv,
builder.create<arith::ConstantIndexOp>(
loc, const_lower_bound))},
AffineMap::get(ctx),
SmallVector<Value>{builder.create<arith::AddIOp>(
loc, inner_affine_for_iv,
builder.create<arith::ConstantIndexOp>(
loc, const_upper_bound))},
AffineMap::get(ctx), const_step))
: (builder.create<affine::AffineForOp>(
loc, const_lower_bound, const_upper_bound, const_step));
for_loop_nest.push_back(new_for_op);
inner_affine_for = new_for_op;

// Pop front.
offsets.erase(offsets.begin());
wraps.erase(wraps.begin());
strides.erase(strides.begin());

builder.setInsertionPointToStart(inner_affine_for.getBody());
if (const_stride)
inner_affine_for_iv = inner_affine_for.getInductionVar();
}
auto const_offset = *getConstantIntValue(offsets[0]);
auto const_lowest_offset = *getConstantIntValue(offsets.back());
auto const_wrap = *getConstantIntValue(wraps[0]);
auto const_stride = *getConstantIntValue(strides[0]);

// Convert the outer dimension into an affine.for loop.
int const_lower_bound =
const_stride ? (const_offset * const_stride + const_lowest_offset) : 0;
auto const_upper_bound =
const_stride ? (const_offset * const_stride +
const_wrap * const_stride + const_lowest_offset)
: const_wrap;
int const_step = const_stride ? const_stride : 1;
auto new_for_op =
(inner_affine_for_iv)
? (builder.create<affine::AffineForOp>(
loc,
SmallVector<Value>{builder.create<arith::AddIOp>(
loc, inner_affine_for_iv,
builder.create<arith::ConstantIndexOp>(
loc, const_lower_bound))},
AffineMap::get(ctx),
SmallVector<Value>{builder.create<arith::AddIOp>(
loc, inner_affine_for_iv,
builder.create<arith::ConstantIndexOp>(
loc, const_upper_bound))},
AffineMap::get(ctx), const_step))
: (builder.create<affine::AffineForOp>(
loc, const_lower_bound, const_upper_bound, const_step));
for_loop_nest.push_back(new_for_op);
inner_affine_for = new_for_op;

// Pop front.
offsets.erase(offsets.begin());
wraps.erase(wraps.begin());
strides.erase(strides.begin());

builder.setInsertionPointToStart(inner_affine_for.getBody());
if (const_stride)
inner_affine_for_iv = inner_affine_for.getInductionVar();
}

// The last element of the stride field is implicitly one; pop it.
Expand Down Expand Up @@ -869,6 +882,9 @@ struct AIRSpecializeAIRRtDmaWrapAndStrideInAffineFor
tmp = strides[0];
strides[0] = strides[i];
strides[i] = tmp;
tmp = offsets[0];
offsets[0] = offsets[i];
offsets[i] = tmp;
} else {
(void)loopUnrollFull(for_op);
return success();
Expand Down Expand Up @@ -915,6 +931,9 @@ struct AIRSpecializeAIRRtDmaWrapAndStrideInAffineFor
loc, tys, air::lookupOrDefaultRange(opers, remap));
new_dma->setAttrs(memcpy_op->getDiscardableAttrDictionary());

rewriter.replaceAllUsesWith(for_op.getInductionVar(),
rewriter.create<arith::ConstantIndexOp>(
loc, for_op.getConstantLowerBound()));
rewriter.eraseOp(for_op.getOperation());

return success();
Expand Down
Loading

0 comments on commit c0ac177

Please sign in to comment.