diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp index fef7bf0bda74..92ae22103d49 100644 --- a/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp +++ b/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp @@ -20,6 +20,7 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) { size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0); size_t NDRangeN = N / TN; size_t sg_size = get_sg_size>(q); + std::cout << "SG size: " << sg_size << " "; q.submit([&](handler &cgh) { cgh.parallel_for>( @@ -109,21 +110,31 @@ void test() { matrix_multiply_ref(A, B, D, MATRIX_M, MATRIX_N, MATRIX_K); // test data - if constexpr (A_layout == layout::row_major) { - if constexpr (B_layout == layout::row_major) { - matrix_multiply(C, A, B, q); - } else if constexpr (B_layout == layout::col_major) { - } else { - Tab *vnniB = malloc_shared(MATRIX_K * MATRIX_N, q); - matrix_vnni(MATRIX_K, MATRIX_N, B, vnniB, vnniFactor); - matrix_multiply(C, A, vnniB, q); - free(vnniB, q); - } - } else { + if constexpr (A_layout == layout::col_major) { + Tab *colA = malloc_shared(MATRIX_K * MATRIX_M, q); + matrix_transpose(MATRIX_M, MATRIX_K, colA, A); + Tab *tmp = A; + A = colA; + free(tmp, q); } + if constexpr (B_layout == layout::col_major) { + Tab *colB = malloc_shared(MATRIX_N * MATRIX_K, q); + matrix_transpose(MATRIX_K, MATRIX_N, colB, B); + Tab *tmp = B; + B = colB; + free(tmp, q); + } + + if constexpr (B_layout == layout::ext_intel_packed) { + Tab *vnniB = malloc_shared(MATRIX_K * MATRIX_N, q); + matrix_vnni(MATRIX_K, MATRIX_N, B, vnniB, vnniFactor); + Tab *tmp = B; + B = vnniB; + free(tmp, q); + } + + matrix_multiply(C, A, B, q); assert(matrix_compare(MATRIX_M, MATRIX_N, C, D)); std::cout << "passed" << std::endl; diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp index dbc33017b0a6..72f172f16be9 100644 --- 
a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp @@ -16,15 +16,18 @@ #include "joint_matrix_out_bounds_impl.hpp" int main() { - std::cout << "bf16:\n"; + std::cout << "bf16 A row major, B row major: "; test(); + std::cout << "bf16 A row major, B packed: "; test(); // unaligned k: + std::cout << "bf16 A row major, B row major: "; test(); + std::cout << "bf16 A row major, B packed: "; test(); } diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds_colmajor.cpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_colmajor.cpp new file mode 100644 index 000000000000..531a4d0bf160 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_colmajor.cpp @@ -0,0 +1,44 @@ +//==---- joint_matrix_out_bounds_colmajor.cpp - DPC++ joint_matrix ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix +// UNSUPPORTED: gpu-intel-dg2, cpu + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// RUN: %{build} -o %t32.out -DSG_SZ=32 +// RUN: %{run} %t32.out + +// XFAIL:gpu +// XFAIL-TRACKER: GSD-5768 + +#include "common.hpp" +#include "joint_matrix_out_bounds_impl.hpp" + +int main() { + std::cout << "bf16 A col major, B col major: "; + test(); + std::cout << "half A col major, B col major: "; + test(); + std::cout << "int8 A col major, B col major: "; + test(); + + // unaligned k: + std::cout << "bf16 A col major, B col major: "; + test(); + std::cout << "half A col major, B col major: "; + test(); + std::cout << "int8 A col major, B col major: "; + test(); +}