Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add amd-aie-direct HAL target (3/n) #420

Merged
merged 3 commits into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions build_tools/ci/print_ir_aie2xclbin/basic_dma_transpose.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// CHECK: Generating:{{.*}}aie_cdo_elfs.bin
// CHECK: Generating:{{.*}}aie_cdo_init.bin
// CHECK: Generating:{{.*}}aie_cdo_enable.bin
// Test input for the amd-aie-direct HAL target: a module whose only device
// target is "amd-aie-direct" with the "amdaie-xclbin-fb" executable format.
module attributes {hal.device.targets = [#hal.device.target<"amd-aie-direct", [#hal.executable.target<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>]>]} {
// Executable @dummy1 with a single public variant, @amdaie_xclbin_fb.
hal.executable private @dummy1 {
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
// Export @dummy2 (ordinal 0) with one read-only storage buffer binding; the
// workgroup count is taken from flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @dummy2 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
// AIE device npu1_4col using tiles (0, 0) and (0, 2).
aie.device(npu1_4col) {
%tile_0_0 = aie.tile(0, 0)
%tile_0_2 = aie.tile(0, 2)
// Object FIFOs @in (%tile_0_0 -> %tile_0_2) and @out (%tile_0_2 -> %tile_0_0),
// both over memref<64x64xi32>; the link op connects @in to @out.
aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
aie.objectfifo.link [@in] -> [@out]()
// Core on %tile_0_2: the body only defines %c0 and ends; the memref ops
// below are commented out.
%core_0_2 = aie.core(%tile_0_2) {
%c0 = arith.constant 0 : index
// %0 = memref.alloc() : memref<10xf32>
// %1 = memref.load %0[%c0] : memref<10xf32>
// memref.store %1, %0[%c0] : memref<10xf32>
aie.end
}
// Function named like the export above. It issues two DMA copies and one sync;
// %arg1 is not used.
func.func @dummy2(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) {
// %arg2 via @out: sizes [1, 1, 1, 4096] with all-zero strides.
aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<4096xi32>
// %arg0 via @in: sizes [1, 64, 64, 1] with strides [1, 1, 64]; presumably this
// is the transposed access pattern the file name refers to - confirm.
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 64, 64, 1][1, 1, 64]) {id = 1 : i64, metadata = @in} : memref<4096xi32>
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
return
}
}
}
}
}
// Host-side stub that dispatches @dummy1::@amdaie_xclbin_fb::@dummy2.
util.func public @dummy3(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = ""}} {
// The ops below are not meant to be meaningful (e.g. the imported and exported
// tensor types do not match); they exist only so that compilation reaches
// serializeExecutable.
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%element_type_i8 = hal.element_type<i8> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c1, %c1]) type(%element_type_i8) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1024x512xi8> in !stream.resource<external>{%c1}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c1} => !stream.timepoint

%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c1}) {
stream.cmd.dispatch @dummy1::@amdaie_xclbin_fb::@dummy2 {
ro %arg2[%c0 for %c1] : !stream.resource<external>{%c1}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c1}
%4 = stream.tensor.export %3 : tensor<1024x1024xi32> in !stream.resource<external>{%c1} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
150 changes: 150 additions & 0 deletions build_tools/ci/print_ir_aie2xclbin/buffers_xclbin.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
// CHECK: {
// CHECK: "ps-kernels": {
// CHECK: "kernels": [
// CHECK: {
// CHECK: "arguments": [
// CHECK: {
// CHECK: "address-qualifier": "SCALAR",
// CHECK: "name": "opcode",
// CHECK: "offset": "0x00",
// CHECK: "type": "uint64_t"
// CHECK: },
// CHECK: {
// CHECK: "address-qualifier": "GLOBAL",
// CHECK: "memory-connection": "SRAM",
// CHECK: "name": "instr",
// CHECK: "offset": "0x08",
// CHECK: "type": "char *"
// CHECK: },
// CHECK: {
// CHECK: "address-qualifier": "SCALAR",
// CHECK: "name": "ninstr",
// CHECK: "offset": "0x10",
// CHECK: "type": "uint32_t"
// CHECK: },
// CHECK: {
// CHECK: "address-qualifier": "GLOBAL",
// CHECK: "memory-connection": "HOST",
// CHECK: "name": "bo0",
// CHECK: "offset": "0x14",
// CHECK: "type": "void*"
// CHECK: },
// CHECK: {
// CHECK: "address-qualifier": "GLOBAL",
// CHECK: "memory-connection": "HOST",
// CHECK: "name": "bo1",
// CHECK: "offset": "0x1c",
// CHECK: "type": "void*"
// CHECK: },
// CHECK: {
// CHECK: "address-qualifier": "GLOBAL",
// CHECK: "memory-connection": "HOST",
// CHECK: "name": "bo2",
// CHECK: "offset": "0x24",
// CHECK: "type": "void*"
// CHECK: },
// CHECK: {
// CHECK: "address-qualifier": "GLOBAL",
// CHECK: "memory-connection": "HOST",
// CHECK: "name": "bo3",
// CHECK: "offset": "0x2c",
// CHECK: "type": "void*"
// CHECK: },
// CHECK: {
// CHECK: "address-qualifier": "GLOBAL",
// CHECK: "memory-connection": "HOST",
// CHECK: "name": "bo4",
// CHECK: "offset": "0x34",
// CHECK: "type": "void*"
// CHECK: },
// CHECK: {
// CHECK: "address-qualifier": "GLOBAL",
// CHECK: "memory-connection": "HOST",
// CHECK: "name": "bo5",
// CHECK: "offset": "0x3c",
// CHECK: "type": "void*"
// CHECK: }
// CHECK: ],
// CHECK: "extended-data": {
// CHECK: "dpu_kernel_id": "0x0",
// CHECK: "functional": "0",
// CHECK: "subtype": "DPU"
// CHECK: },
// CHECK: "instances": [
// CHECK: {
// CHECK: "name": "dummy2_0"
// CHECK: }
// CHECK: ],
// CHECK: "name": "dummy2_0",
// CHECK: "type": "dpu"
// CHECK: }
// CHECK: ]
// CHECK: }
// CHECK: }

// Test input for the amd-aie-direct HAL target: a module whose only device
// target is "amd-aie-direct" with the "amdaie-xclbin-fb" executable format.
module attributes {hal.device.targets = [#hal.device.target<"amd-aie-direct", [#hal.executable.target<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>]>]} {
// Executable @dummy1 with a single public variant, @amdaie_xclbin_fb.
hal.executable private @dummy1 {
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
// Export @dummy2 (ordinal 0) with one read-only storage buffer binding; the
// workgroup count is taken from flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @dummy2 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
aie.device(npu1_4col) {
// Three input/output pairs of public 1024 x i32 globals; each name is matched
// by an aie.shim_dma_allocation below.
memref.global "public" @in0 : memref<1024xi32>
memref.global "public" @out0 : memref<1024xi32>
memref.global "public" @in1 : memref<1024xi32>
memref.global "public" @out1 : memref<1024xi32>
memref.global "public" @in2 : memref<1024xi32>
memref.global "public" @out2 : memref<1024xi32>
// Tiles (0, 2), (1, 2) and (2, 2); only %12 is used, by the empty core below.
%02 = aie.tile(0, 2)
%12 = aie.tile(1, 2)
%22 = aie.tile(2, 2)

aie.core(%12) {
aie.end
}
// One MM2S allocation per @inN and one S2MM allocation per @outN.
aie.shim_dma_allocation @in0(MM2S, 0, 0)
aie.shim_dma_allocation @out0(S2MM, 0, 0)
aie.shim_dma_allocation @in1(MM2S, 1, 0)
aie.shim_dma_allocation @out1(S2MM, 1, 0)
aie.shim_dma_allocation @in2(MM2S, 2, 0)
aie.shim_dma_allocation @out2(S2MM, 2, 0)

// Function named like the export above, taking six memref<1024xi32> arguments;
// presumably these correspond to the bo0..bo5 kernel arguments expected in
// kernels.json at the top of this file - confirm.
// Each argument gets one linear 1024-element DMA copy, with a sync after each
// in/out pair.
func.func @dummy2(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>, %arg3: memref<1024xi32>, %arg4: memref<1024xi32>, %arg5: memref<1024xi32>) {
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 0 : i64, metadata = @in0} : memref<1024xi32>
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 1 : i64, metadata = @out0} : memref<1024xi32>
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 2 : i64, metadata = @in1} : memref<1024xi32>
aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 3 : i64, metadata = @out1} : memref<1024xi32>
aiex.npu.sync {channel = 1 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
// NOTE(review): ids 2 and 3 are reused for @in2/@out2, and the sync below uses
// channel = 0, column = 2 whereas the two syncs above use column = 0 with
// channels 0 and 1 - confirm this is intended.
aiex.npu.dma_memcpy_nd(0, 0, %arg4[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 2 : i64, metadata = @in2} : memref<1024xi32>
aiex.npu.dma_memcpy_nd(0, 0, %arg5[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 3 : i64, metadata = @out2} : memref<1024xi32>
aiex.npu.sync {channel = 0 : i32, column = 2 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
return
}
}
}
}
}
// Host-side stub that dispatches @dummy1::@amdaie_xclbin_fb::@dummy2.
util.func public @dummy3(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = ""}} {
// The ops below are not meant to be meaningful (e.g. the imported and exported
// tensor types do not match); they exist only so that compilation reaches
// serializeExecutable.
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%element_type_i8 = hal.element_type<i8> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c1, %c1]) type(%element_type_i8) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1024x512xi8> in !stream.resource<external>{%c1}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c1} => !stream.timepoint

%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c1}) {
stream.cmd.dispatch @dummy1::@amdaie_xclbin_fb::@dummy2 {
ro %arg2[%c0 for %c1] : !stream.resource<external>{%c1}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c1}
%4 = stream.tensor.export %3 : tensor<1024x1024xi32> in !stream.resource<external>{%c1} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
50 changes: 50 additions & 0 deletions build_tools/ci/print_ir_aie2xclbin/print_ir_aie2xclbin.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ if [ "$#" -eq 2 ]; then
MLIR_AIE=`realpath .venv/lib/python3.10/site-packages/mlir_aie`
fi

# Diagnostic only: report where the Vitis compiler drivers live under ${VITIS}.
# The path is quoted so an install directory containing spaces is searched as a
# single path. The report is skipped when VITIS is not set at this point (the
# set-paths-manually case below appears to assign its variables only after
# these lines - confirm), since an empty path would make find scan the current
# directory instead.
if [ -n "${VITIS:-}" ]; then
  echo "chess-clang: $(find "${VITIS}" -name chess-clang)"
  echo "xchesscc: $(find "${VITIS}" -name xchesscc)"
fi

# The local set-paths-manually case:
if [ "$#" -eq 6 ]; then
PEANO="$3"
Expand Down Expand Up @@ -183,4 +186,51 @@ ${FILECHECK_EXE} --input-file ${STDERR_FULLPATH} ${0} --check-prefix=CHECK-STDER
# CHECK-STDOUT-DAG: MEM_TOPOLOGY
${FILECHECK_EXE} --input-file ${STDOUT_FULLPATH} ${0} --check-prefix=CHECK-STDOUT

# Compiles the MLIR file given as $1 with the amd-aie-direct HAL backend in
# hal-executable mode and writes the compiler's stdout to ${STDOUT_FULLPATH}.
#
# Sets the globals SOURCE_MLIR_FILE and IREE_COMPILE_COMMAND, as the previously
# inlined code did. Exits the script with status 1 when the compile command
# fails or the stdout file is missing.
compile_with_amd_aie_direct() {
  SOURCE_MLIR_FILE="$1"

  IREE_COMPILE_COMMAND="${IREE_COMPILE_EXE} \
    ${SOURCE_MLIR_FILE} \
    --compile-mode=hal-executable \
    --iree-hal-target-backends=amd-aie-direct \
    --iree-amd-aie-peano-install-dir=${PEANO} \
    --iree-amd-aie-mlir-aie-install-dir=${MLIR_AIE} \
    --iree-amd-aie-vitis-install-dir=${VITIS} \
    --iree-hal-dump-executable-intermediates-to=${OUTPUT} \
    --iree-hal-dump-executable-files-to=${OUTPUT} \
    --mlir-disable-threading \
    --iree-amd-aie-show-invoked-commands"

  echo "Executing command: $IREE_COMPILE_COMMAND"
  # The redirection creates the stdout file even when the compiler fails, so a
  # file-existence test alone cannot detect a failed compile; check the exit
  # status of the command itself.
  if ! eval "$IREE_COMPILE_COMMAND" 1> "${STDOUT_FULLPATH}"; then
    echo "compile command failed for: ${SOURCE_MLIR_FILE}"
    exit 1
  fi
  if [ ! -f "${STDOUT_FULLPATH}" ]; then
    echo "stdout file was not created: ${STDOUT_FULLPATH}"
    exit 1
  fi
}

# basic_dma_transpose.mlir: its FileCheck directives are matched against the
# compiler's stdout.
compile_with_amd_aie_direct "${THIS}/basic_dma_transpose.mlir"
${FILECHECK_EXE} --input-file "${STDOUT_FULLPATH}" "${SOURCE_MLIR_FILE}"

# buffers_xclbin.mlir: its FileCheck directives are matched against the
# kernels.json file dumped under ${OUTPUT}.
compile_with_amd_aie_direct "${THIS}/buffers_xclbin.mlir"
${FILECHECK_EXE} --input-file "${OUTPUT}/module_dummy1_amdaie_xclbin_fb/kernels.json" "${SOURCE_MLIR_FILE}"

echo "Test of printing in aie2xclbin passed."
92 changes: 85 additions & 7 deletions compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,67 @@ iree_tablegen_library(
-gen-enum-defs Dialect/AIEVec/IR/AIEVecEnums.cpp.inc
)

# TableGen target for AIEVecOps.td: emits the op declarations and definitions
# (-gen-op-decls / -gen-op-defs) under Dialect/AIEVec/IR.
iree_tablegen_library(
NAME
AIEVecOpsGen
TD_FILE
"${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Dialect/AIEVec/IR/AIEVecOps.td"
OUTS
-gen-op-decls Dialect/AIEVec/IR/AIEVecOps.h.inc
-gen-op-defs Dialect/AIEVec/IR/AIEVecOps.cpp.inc
)

# TableGen target for aie/Conversion/Passes.td: emits the pass declarations and
# the enum declarations/definitions under Conversion/.
iree_tablegen_library(
NAME
AIEVecConversionPassIncGen
TD_FILE
"${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Conversion/Passes.td"
OUTS
# this doesn't follow the correct naming convention but it's burned in downstream
-gen-pass-decls Conversion/Passes.h.inc
-gen-enum-decls Conversion/PassesEnums.h.inc
-gen-enum-defs Conversion/PassesEnums.cpp.inc
)

# TableGen target for the AIEVec Transforms/Passes.td: emits the pass
# declarations only.
iree_tablegen_library(
NAME
AIEVecPassIncGen
TD_FILE
"${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Dialect/AIEVec/Transforms/Passes.td"
OUTS
-gen-pass-decls Dialect/AIEVec/Transforms/Passes.h.inc
)

# TableGen target for the AIEVec Analysis/Passes.td: emits the pass
# declarations only.
iree_tablegen_library(
NAME
AIEVecAnalysisPassesIncGen
TD_FILE
"${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Dialect/AIEVec/Analysis/Passes.td"
OUTS
-gen-pass-decls Dialect/AIEVec/Analysis/Passes.h.inc
)

# TableGen target for XLLVMOps.td: emits the xllvm dialect declarations and
# definitions plus the op declarations and definitions.
iree_tablegen_library(
NAME
AIEVecXLLVMOpsGen
TD_FILE
"${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Dialect/XLLVM/IR/XLLVMOps.td"
OUTS
-gen-dialect-decls -dialect=xllvm Dialect/XLLVM/IR/XLLVMDialect.h.inc
-gen-dialect-defs -dialect=xllvm Dialect/XLLVM/IR/XLLVMDialect.cpp.inc
-gen-op-decls Dialect/XLLVM/IR/XLLVMOps.h.inc
-gen-op-defs Dialect/XLLVM/IR/XLLVMOps.cpp.inc
)

# Second TableGen target over the same XLLVMOps.td: emits the LLVM IR
# conversions include (-gen-llvmir-conversions).
iree_tablegen_library(
NAME
AIEVecXLLVMConversionPassIncGen
TD_FILE
"${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Dialect/XLLVM/IR/XLLVMOps.td"
OUTS
-gen-llvmir-conversions Dialect/XLLVM/IR/XLLVMConversions.inc
)

iree_tablegen_library(
NAME
AIEInterfacesGen
Expand Down Expand Up @@ -169,6 +230,7 @@ iree_cc_library(
SRCS
${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/IR/AIEVecOps.cpp
${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/IR/AIEVecTypes.cpp
${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/XLLVM/XLLVMOps.cpp
DEPS
::defs
::AIEVecOpsGen
Expand All @@ -177,14 +239,30 @@ iree_cc_library(
MLIRIR
)

iree_tablegen_library(
iree_cc_library(
NAME
AIEVecOpsGen
TD_FILE
"${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Dialect/AIEVec/IR/AIEVecOps.td"
OUTS
-gen-op-decls Dialect/AIEVec/IR/AIEVecOps.h.inc
-gen-op-defs Dialect/AIEVec/IR/AIEVecOps.cpp.inc
AIEVecConvertToLLVM
SRCS
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp"
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Target/LLVMIR/Dialect/XLLVM/XLLVMToLLVMIRTranslation.cpp"
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/IntervalReuse.cpp"
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/AIEVectorize.cpp"
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/ConvertVectorToAIEVec.cpp"
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp"
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp"
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp"
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp"
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/CopyRemoval.cpp"
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/DynamicSizeNoImplicitBroadcast.cpp"
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Utils/Utils.cpp"
DEPS
::defs
::AIEVecDialectIR
::AIEVecAnalysisPassesIncGen
::AIEVecConversionPassIncGen
::AIEVecPassIncGen
::AIEVecXLLVMConversionPassIncGen
::AIEVecXLLVMOpsGen
)

###############################################################################
Expand Down
Loading
Loading