Skip to content

Commit

Permalink
Update aircc.py and some board tests (#364)
Browse files (browse the repository at this point in the history)
Co-authored-by: root <[email protected]>
  • Loading branch information
erwei-xilinx and root authored Dec 14, 2023
1 parent faa6f97 commit 7caefd2
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 115 deletions.
4 changes: 2 additions & 2 deletions python/air/compiler/aircc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from joblib import Parallel, delayed
import shutil

-from air.mlir.passmanager import PassManager
-from air.mlir.ir import Module, Context, Location
+from air.passmanager import PassManager
+from air.ir import Module, Context, Location
from air.dialects import air as airdialect

import air.compiler.aircc.cl_arguments as cl_arguments
Expand Down
82 changes: 37 additions & 45 deletions test/44_air_mmult_2x2/air.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -6,55 +6,47 @@
//
//===----------------------------------------------------------------------===//

#map = affine_map<()[s0] -> (s0 * 32)>
module attributes {torch.debug_module_name = "mmult"} {
func.func @forward(%a0: memref<64x64xi32>, %a1: memref<64x64xi32>, %a2: memref<64x64xi32>) {
air.segment @segment0 args(%arg0=%a0, %arg1=%a1, %arg2=%a2) : memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> {
%c2 = arith.constant 2 : index
%c0_i32 = arith.constant 0 : i32
%0 = memref.alloc() {alignment = 128 : i64} : memref<64x64xi32>
%1 = memref.alloc() {alignment = 128 : i64} : memref<64x64xi32>
affine.for %arg3 = 0 to 64 {
affine.for %arg4 = 0 to 64 {
affine.store %c0_i32, %0[%arg3, %arg4] : memref<64x64xi32>
}
}
memref.copy %0, %1 : memref<64x64xi32> to memref<64x64xi32>
air.herd tile (%arg3, %arg4) in (%arg5=%c2, %arg6=%c2) args(%arg7=%arg0, %arg8=%arg1, %arg9=%1) : memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> attributes {sym_name = "herd_0"} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c32 = arith.constant 32 : index
%2 = affine.apply #map()[%arg3]
%3 = affine.apply #map()[%arg4]
scf.for %arg10 = %c0 to %c64 step %c32 {
%4 = memref.alloc() : memref<32x32xi32, 2>
%5 = memref.alloc() : memref<32x32xi32, 2>
%6 = memref.alloc() : memref<32x32xi32, 2>
air.dma_memcpy_nd (%4[] [] [], %arg7[%2, %arg10] [%c32, %c32] [%c64, %c1]) {id = 1 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
air.dma_memcpy_nd (%5[] [] [], %arg8[%arg10, %3] [%c32, %c32] [%c64, %c1]) {id = 2 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
air.dma_memcpy_nd (%6[] [] [], %arg9[%2, %3] [%c32, %c32] [%c64, %c1]) {id = 3 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
affine.for %arg11 = 0 to 32 {
affine.for %arg12 = 0 to 32 {
affine.for %arg13 = 0 to 32 {
%7 = affine.load %4[%arg11, %arg13] : memref<32x32xi32, 2>
%8 = affine.load %5[%arg13, %arg12] : memref<32x32xi32, 2>
%9 = affine.load %6[%arg11, %arg12] : memref<32x32xi32, 2>
%10 = arith.muli %7, %8 : i32
%11 = arith.addi %9, %10 : i32
affine.store %11, %6[%arg11, %arg12] : memref<32x32xi32, 2>
}
}
#map = affine_map<()[s0] -> (s0 * 64)>
#map1 = affine_map<()[s0] -> (s0 * 32)>
module {
func.func @forward(%arg0: memref<64x64xi32>, %arg1: memref<64x64xi32>, %arg2: memref<64x64xi32>) {
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<64x64xi32>
air.launch (%arg3, %arg4) in (%arg5=%c1, %arg6=%c1) args(%arg7=%alloc, %arg8=%arg0, %arg9=%arg1) : memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> {
air.segment @segment_0 args(%arg10=%arg3, %arg11=%arg4, %arg12=%arg7, %arg13=%arg8, %arg14=%arg9) : index, index, memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> {
%c2 = arith.constant 2 : index
%0 = affine.apply #map()[%arg10]
%1 = affine.apply #map()[%arg11]
air.herd @herd_0 tile (%arg15, %arg16) in (%arg17=%c2, %arg18=%c2) args(%arg19=%0, %arg20=%1, %arg21=%arg12, %arg22=%arg13, %arg23=%arg14) : index, index, memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> {
%c1_0 = arith.constant 1 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c32 = arith.constant 32 : index
%2 = affine.apply #map1()[%arg15]
%3 = affine.apply #map1()[%arg16]
%4 = arith.addi %arg19, %2 : index
%5 = arith.addi %arg20, %3 : index
%alloc_1 = memref.alloc() : memref<32x32xi32, 2>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x32xi32, 2>)
scf.for %arg24 = %c0 to %c64 step %c32 {
%alloc_2 = memref.alloc() : memref<32x32xi32, 2>
%alloc_3 = memref.alloc() : memref<32x32xi32, 2>
air.dma_memcpy_nd (%alloc_2[] [] [], %arg22[%4, %arg24] [%c32, %c32] [%c64, %c1_0]) {id = 1 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
air.dma_memcpy_nd (%alloc_3[] [] [], %arg23[%arg24, %5] [%c32, %c32] [%c64, %c1_0]) {id = 2 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%alloc_2, %alloc_3 : memref<32x32xi32, 2>, memref<32x32xi32, 2>) outs(%alloc_1 : memref<32x32xi32, 2>)
memref.dealloc %alloc_2 : memref<32x32xi32, 2>
memref.dealloc %alloc_3 : memref<32x32xi32, 2>
}
air.dma_memcpy_nd (%arg9[%2, %3] [%c32, %c32] [%c64, %c1], %6[] [] []) {id = 4 : i32} : (memref<64x64xi32>, memref<32x32xi32, 2>)
memref.dealloc %4 : memref<32x32xi32, 2>
memref.dealloc %5 : memref<32x32xi32, 2>
memref.dealloc %6 : memref<32x32xi32, 2>
air.dma_memcpy_nd (%arg21[%4, %5] [%c32, %c32] [%c64, %c1_0], %alloc_1[] [] []) {id = 3 : i32} : (memref<64x64xi32>, memref<32x32xi32, 2>)
memref.dealloc %alloc_1 : memref<32x32xi32, 2>
air.herd_terminator
}
air.herd_terminator
air.segment_terminator
}
memref.copy %1, %arg2 : memref<64x64xi32> to memref<64x64xi32>
air.launch_terminator
}
memref.copy %alloc, %arg2 : memref<64x64xi32> to memref<64x64xi32>
return
}
}
Expand Down
164 changes: 96 additions & 68 deletions test/46_air_mmult_2x2_tokens/air.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -5,79 +5,107 @@
//
//===----------------------------------------------------------------------===//

#map = affine_map<()[s0] -> (s0 * 32)>
module attributes {torch.debug_module_name = "mmult"} {
#map = affine_map<()[s0] -> (s0 * 64)>
#map1 = affine_map<()[s0] -> (s0 * 32)>
#set = affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>
#set1 = affine_set<()[s0, s1] : (s0 - 1 == 0, s1 >= 0, -s1 + 1 >= 0)>
#set2 = affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>
#set3 = affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 - 1 == 0)>
module {
func.func @forward(%arg0: memref<64x64xi32>, %arg1: memref<64x64xi32>, %arg2: memref<64x64xi32>) {
%c2 = arith.constant 2 : index
%c0_i32 = arith.constant 0 : i32
%c1 = arith.constant 1 : index
%async_token, %results = air.execute -> (memref<64x64xi32>) {
%alloc = memref.alloc() {alignment = 128 : i64} : memref<64x64xi32>
%alloc = memref.alloc() : memref<64x64xi32>
air.execute_terminator %alloc : memref<64x64xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
linalg.fill ins(%c0_i32 : i32) outs(%results : memref<64x64xi32>)
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<64x64xi32>) {
%alloc = memref.alloc() {alignment = 128 : i64} : memref<64x64xi32>
air.execute_terminator %alloc : memref<64x64xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1, %async_token_0] {
memref.copy %results, %results_2 : memref<64x64xi32> to memref<64x64xi32>
} {id = 4 : i32}
%0 = air.herd @herd_0 async [%async_token_3] tile (%arg3, %arg4) in (%arg5=%c2, %arg6=%c2) args(%arg7=%arg0, %arg8=%arg1, %arg9=%results_2) : memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> attributes {id = 1 : i32} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c32 = arith.constant 32 : index
%async_token_5, %results_6 = air.execute -> (index) {
%3 = affine.apply #map()[%arg3]
air.execute_terminator %3 : index
} {id = 5 : i32}
%async_token_7, %results_8 = air.execute -> (index) {
%3 = affine.apply #map()[%arg4]
air.execute_terminator %3 : index
} {id = 6 : i32}
%1 = air.wait_all async [%async_token_5, %async_token_7] {id = 2 : i32}
%2 = scf.for %arg10 = %c0 to %c64 step %c32 iter_args(%arg11 = %1) -> (!air.async.token) {
%c32_9 = arith.constant 32 : index
%c64_10 = arith.constant 64 : index
%c1_11 = arith.constant 1 : index
%async_token_12, %results_13 = air.execute [%arg11] -> (memref<32x32xi32, 2>) {
%alloc = memref.alloc() : memref<32x32xi32, 2>
air.execute_terminator %alloc : memref<32x32xi32, 2>
} {id = 7 : i32}
%async_token_14, %results_15 = air.execute [%arg11] -> (memref<32x32xi32, 2>) {
%alloc = memref.alloc() : memref<32x32xi32, 2>
air.execute_terminator %alloc : memref<32x32xi32, 2>
} {id = 8 : i32}
%async_token_16, %results_17 = air.execute [%arg11] -> (memref<32x32xi32, 2>) {
%alloc = memref.alloc() : memref<32x32xi32, 2>
air.execute_terminator %alloc : memref<32x32xi32, 2>
} {id = 9 : i32}
%3 = air.dma_memcpy_nd async [%async_token_12, %arg11] (%results_13[] [] [], %arg7[%results_6, %arg10] [%c32_9, %c32_9] [%c64_10, %c1_11]) {id = 1 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
%4 = air.dma_memcpy_nd async [%async_token_14, %arg11] (%results_15[] [] [], %arg8[%arg10, %results_8] [%c32_9, %c32_9] [%c64_10, %c1_11]) {id = 2 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
%5 = air.dma_memcpy_nd async [%async_token_16, %arg11, %arg11] (%results_17[] [] [], %arg9[%results_6, %results_8] [%c32_9, %c32_9] [%c64_10, %c1_11]) {id = 3 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
%async_token_18 = air.execute [%4, %5, %3] {
linalg.matmul ins(%results_13, %results_15 : memref<32x32xi32, 2>, memref<32x32xi32, 2>) outs(%results_17 : memref<32x32xi32, 2>)
} {id = 10 : i32}
%6 = air.dma_memcpy_nd async [%async_token_18] (%arg9[%results_6, %results_8] [%c32_9, %c32_9] [%c64_10, %c1_11], %results_17[] [] []) {id = 4 : i32} : (memref<64x64xi32>, memref<32x32xi32, 2>)
%async_token_19 = air.execute [%async_token_18] {
memref.dealloc %results_13 : memref<32x32xi32, 2>
} {id = 11 : i32}
%async_token_20 = air.execute [%async_token_18] {
memref.dealloc %results_15 : memref<32x32xi32, 2>
} {id = 12 : i32}
%async_token_21 = air.execute [%6] {
memref.dealloc %results_17 : memref<32x32xi32, 2>
} {id = 13 : i32}
%7 = air.wait_all async [%async_token_19, %async_token_20, %async_token_21] {id = 1 : i32}
scf.yield %7 : !air.async.token
}
%0 = air.launch async [%async_token] (%arg3, %arg4) in (%arg5=%c1, %arg6=%c1) args(%arg7=%results, %arg8=%arg0, %arg9=%arg1) : memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> attributes {id = 1 : i32} {
%1 = air.segment @segment_0 async args(%arg10=%arg3, %arg11=%arg4, %arg12=%arg7, %arg13=%arg8, %arg14=%arg9) : index, index, memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> attributes {id = 2 : i32} {
%c2 = arith.constant 2 : index
%async_token_1, %results_2 = air.execute -> (index) {
%3 = affine.apply #map()[%arg10]
air.execute_terminator %3 : index
}
%async_token_3, %results_4 = air.execute -> (index) {
%3 = affine.apply #map()[%arg11]
air.execute_terminator %3 : index
}
%2 = air.herd @herd_0 async tile (%arg15, %arg16) in (%arg17=%c2, %arg18=%c2) args(%arg19=%results_2, %arg20=%results_4, %arg21=%arg12, %arg22=%arg13, %arg23=%arg14) : index, index, memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> attributes {id = 3 : i32} {
%c1_5 = arith.constant 1 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c32 = arith.constant 32 : index
%async_token_6, %results_7 = air.execute -> (index) {
%6 = affine.apply #map1()[%arg15]
air.execute_terminator %6 : index
}
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply #map1()[%arg16]
air.execute_terminator %6 : index
}
%async_token_10, %results_11 = air.execute [%async_token_6] -> (index) {
%6 = arith.addi %arg19, %results_7 : index
air.execute_terminator %6 : index
}
%async_token_12, %results_13 = air.execute [%async_token_8] -> (index) {
%6 = arith.addi %arg20, %results_9 : index
air.execute_terminator %6 : index
}
%async_token_14, %results_15 = air.execute -> (memref<32x32xi32, 2>) {
%alloc = memref.alloc() : memref<32x32xi32, 2>
air.execute_terminator %alloc : memref<32x32xi32, 2>
}
%async_token_16 = air.execute [%async_token_14] {
linalg.fill ins(%c0_i32 : i32) outs(%results_15 : memref<32x32xi32, 2>)
}
%3 = air.wait_all async [%async_token_10, %async_token_12, %async_token_16]
%4 = scf.for %arg24 = %c0 to %c64 step %c32 iter_args(%arg25 = %3) -> (!air.async.token) {
%async_token_18, %results_19 = air.execute -> (memref<32x32xi32, 2>) {
%alloc = memref.alloc() : memref<32x32xi32, 2>
air.execute_terminator %alloc : memref<32x32xi32, 2>
}
%async_token_20, %results_21 = air.execute -> (memref<32x32xi32, 2>) {
%alloc = memref.alloc() : memref<32x32xi32, 2>
air.execute_terminator %alloc : memref<32x32xi32, 2>
}
%6 = affine.if #set()[%arg15, %arg16] -> !air.async.token {
%8 = air.dma_memcpy_nd async [%arg25, %async_token_18] (%results_19[] [] [], %arg22[%results_11, %arg24] [%c32, %c32] [%c64, %c1_5]) {broadcast_set = #set, id = 1 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
affine.yield %8 : !air.async.token
} else {
%8 = air.dma_memcpy_nd async [%arg25, %async_token_18] (%results_19[] [] [], %arg22[%results_11, %arg24] [%c32, %c32] [%c64, %c1_5]) {broadcast_set = #set1, id = 2 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
affine.yield %8 : !air.async.token
}
%7 = affine.if #set2()[%arg15, %arg16] -> !air.async.token {
%8 = air.dma_memcpy_nd async [%arg25, %async_token_20] (%results_21[] [] [], %arg23[%arg24, %results_13] [%c32, %c32] [%c64, %c1_5]) {broadcast_set = #set2, id = 3 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
affine.yield %8 : !air.async.token
} else {
%8 = air.dma_memcpy_nd async [%arg25, %async_token_20] (%results_21[] [] [], %arg23[%arg24, %results_13] [%c32, %c32] [%c64, %c1_5]) {broadcast_set = #set3, id = 4 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
affine.yield %8 : !air.async.token
}
%async_token_22 = air.execute [%7, %6] {
linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%results_19, %results_21 : memref<32x32xi32, 2>, memref<32x32xi32, 2>) outs(%results_15 : memref<32x32xi32, 2>)
}
%async_token_23 = air.execute [%async_token_22] {
memref.dealloc %results_19 : memref<32x32xi32, 2>
}
%async_token_24 = air.execute [%async_token_22] {
memref.dealloc %results_21 : memref<32x32xi32, 2>
}
scf.yield %async_token_22 : !air.async.token
}
%5 = air.dma_memcpy_nd async [%4] (%arg21[%results_11, %results_13] [%c32, %c32] [%c64, %c1_5], %results_15[] [] []) {id = 5 : i32} : (memref<64x64xi32>, memref<32x32xi32, 2>)
%async_token_17 = air.execute [%5] {
memref.dealloc %results_15 : memref<32x32xi32, 2>
}
air.herd_terminator
}
air.segment_terminator
}
air.herd_terminator
air.launch_terminator
}
%async_token_0 = air.execute [%0] {
memref.copy %results, %arg2 : memref<64x64xi32> to memref<64x64xi32>
}
%async_token_4 = air.execute [%0] {
memref.copy %results_2, %arg2 : memref<64x64xi32> to memref<64x64xi32>
} {id = 14 : i32}
return
}
}
Expand Down

0 comments on commit 7caefd2

Please sign in to comment.