From cbad008c259fa43e98cc0900972bb930fa58cf6f Mon Sep 17 00:00:00 2001 From: Simon Frasch Date: Wed, 17 Apr 2024 18:15:12 +0200 Subject: [PATCH 1/2] remove multi-threading. Rely on host blas library for multi-threading --- CMakeLists.txt | 1 - include/spla/context.h | 17 --- include/spla/context.hpp | 17 +-- src/gemm/gemm_gpu.cpp | 2 +- src/gemm/gemm_host.cpp | 76 ++---------- src/gemm/gemm_host.hpp | 5 +- src/pgemm_sbs/pgemm_sbs_gpu.cpp | 1 - src/pgemm_sbs/pgemm_sbs_host.cpp | 14 +-- src/pgemm_sbs/ring_sbs_host.cpp | 17 ++- src/pgemm_sbs/ring_sbs_host.hpp | 7 +- src/pgemm_ssb/pgemm_ssb_gpu.cpp | 1 - src/pgemm_ssb/pgemm_ssb_host.cpp | 15 +-- src/pgemm_ssb/ring_ssb_host.cpp | 15 ++- src/pgemm_ssb/ring_ssb_host.hpp | 11 +- src/spla/context.cpp | 36 +----- src/spla/context_internal.hpp | 18 +-- src/spla/gemm.cpp | 11 +- src/util/blas_interface.cpp | 51 -------- src/util/blas_interface.hpp | 8 -- src/util/blas_threads_guard.hpp | 65 ---------- src/util/omp_definitions.hpp | 46 -------- tests/CMakeLists.txt | 4 +- tests/gtest_mpi.cpp | 157 ++++++++++++------------- tests/programs/benchmark.cpp | 3 - tests/programs/benchmark_scalapack.cpp | 3 - tests/programs/run_tests.cpp | 1 - tests/test_gemm.cpp | 31 ++--- tests/test_gemm_sbs.cpp | 37 +++--- tests/test_gemm_ssb.cpp | 37 +++--- 29 files changed, 182 insertions(+), 525 deletions(-) delete mode 100644 src/util/blas_threads_guard.hpp delete mode 100644 src/util/omp_definitions.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d4b62a3..a8a9f64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,7 +33,6 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/cmake/modules) # Options option(SPLA_STATIC "Compile as static library" OFF) -option(SPLA_OMP "Compile with OpenMP support" ON) option(SPLA_BUILD_TESTS "Build tests" OFF) option(SPLA_BUILD_EXAMPLES "Compile examples" OFF) option(SPLA_INSTALL "Enable CMake install commands" ON) diff --git a/include/spla/context.h b/include/spla/context.h index 
65b24aa..2029e9c 100644 --- a/include/spla/context.h +++ b/include/spla/context.h @@ -69,14 +69,6 @@ SPLA_EXPORT SplaError spla_ctx_destroy(SplaContext* ctx); */ SPLA_EXPORT SplaError spla_ctx_processing_unit(SplaContext ctx, SplaProcessingUnit* pu); -/** - * Access a Context parameter. - * @param[in] ctx Context handle. - * @param[out] numThreads Maximum number of threads used for computations. - * @return Error code or SPLA_SUCCESS. - */ -SPLA_EXPORT SPLA_DEPRECATED SplaError spla_ctx_num_threads(SplaContext ctx, int* numThreads); - /** * Access a Context parameter. * @param[in] ctx Context handle. @@ -147,15 +139,6 @@ SPLA_EXPORT SplaError spla_ctx_allocated_memory_pinned(SplaContext ctx, uint_lea */ SPLA_EXPORT SplaError spla_ctx_allocated_memory_gpu(SplaContext ctx, uint_least64_t* size) ; -/** - * Set the number of threads to be used. - * - * @param[in] ctx Context handle. - * @param[in] numThreads Number of threads. - * @return Error code or SPLA_SUCCESS. - */ -SPLA_EXPORT SPLA_DEPRECATED SplaError spla_ctx_set_num_threads(SplaContext ctx, int numThreads); - /** * Set the number of tiles. * diff --git a/include/spla/context.hpp b/include/spla/context.hpp index d0cc517..5927287 100644 --- a/include/spla/context.hpp +++ b/include/spla/context.hpp @@ -83,12 +83,6 @@ class SPLA_EXPORT Context { */ SplaProcessingUnit processing_unit() const; - /** - * Access a Context parameter. - * @return Maximum number of threads used for computations. - */ - SPLA_DEPRECATED int num_threads() const; - /** * Access a Context parameter. * @return Number of tiles used to overlap computation and communication. @@ -142,19 +136,12 @@ class SPLA_EXPORT Context { */ std::uint_least64_t allocated_memory_gpu() const; - /** - * Set the number of threads to be used. - * - * @param[in] numThreads Number of threads. - */ - SPLA_DEPRECATED void set_num_threads(int numThreads); - /** * Set the number of tiles. * - * @param[in] numTilesPerThread Number of tiles. 
+ * @param[in] numTiles Number of tiles. */ - void set_num_tiles(int numTilesPerThread); + void set_num_tiles(int numTiles); /** * Set the tile size used for computations on host and partitioning of communication. diff --git a/src/gemm/gemm_gpu.cpp b/src/gemm/gemm_gpu.cpp index 23e1c11..ca1d26d 100644 --- a/src/gemm/gemm_gpu.cpp +++ b/src/gemm/gemm_gpu.cpp @@ -95,7 +95,7 @@ void gemm_gpu(SplaOperation opA, SplaOperation opB, IntType m, IntType n, IntTyp k * n < ctx.op_threshold_gpu() / (2 * m)) { // m always != 0 here using hostType = typename ComplexTypeHost::type; return gemm_host( - ctx.num_threads(), opA, opB, m, n, k, *reinterpret_cast(&alpha), + opA, opB, m, n, k, *reinterpret_cast(&alpha), reinterpret_cast(A), lda, reinterpret_cast(B), ldb, *reinterpret_cast(&beta), reinterpret_cast(C), ldc); } diff --git a/src/gemm/gemm_host.cpp b/src/gemm/gemm_host.cpp index ce37ad4..85402ea 100644 --- a/src/gemm/gemm_host.cpp +++ b/src/gemm/gemm_host.cpp @@ -37,7 +37,6 @@ #include "spla/context_internal.hpp" #include "spla/types.h" #include "util/blas_interface.hpp" -#include "util/blas_threads_guard.hpp" #include "util/check_gemm_param.hpp" namespace spla { @@ -54,9 +53,8 @@ static auto map_op_to_host_blas(SplaOperation op) -> blas::Operation { } template -void gemm_host(IntType numThreads, SplaOperation opA, SplaOperation opB, IntType m, IntType n, - IntType k, T alpha, const T *A, IntType lda, const T *B, IntType ldb, T beta, T *C, - IntType ldc) { +void gemm_host(SplaOperation opA, SplaOperation opB, IntType m, IntType n, IntType k, T alpha, + const T *A, IntType lda, const T *B, IntType ldb, T beta, T *C, IntType ldc) { if (m == 0 || n == 0) { return; } @@ -70,76 +68,28 @@ void gemm_host(IntType numThreads, SplaOperation opA, SplaOperation opB, IntType if (ldb < 1) ldb = 1; if (ldc < 1) ldc = 1; -#ifdef SPLA_OMP - const bool useOMP = true; -#else - const bool useOMP = false; -#endif - - // if blas library is parallelized or not thread safe, call it directly - if 
(!useOMP || blas::is_parallel() || !blas::is_thread_safe()) { - BlasThreadsGuard threadGuard(numThreads); - blas::gemm(blas::Order::COL_MAJOR, opBlasA, opBlasB, m, n, k, alpha, A, lda, B, ldb, beta, C, - ldc); - return; - } - - // assume blas is not parallelized - HostArrayConstView2D viewA(A, opA == SplaOperation::SPLA_OP_NONE ? k : m, - opA == SplaOperation::SPLA_OP_NONE ? m : k, lda); - HostArrayConstView2D viewB(B, opB == SplaOperation::SPLA_OP_NONE ? n : k, - opB == SplaOperation::SPLA_OP_NONE ? k : n, ldb); - HostArrayView2D viewC(C, n, m, ldc); - - // If there are multiple threads, use 2 times as many tiles to take advantage of dynamic - // scheduling - const IntType numThreadCols = numThreads; - const IntType numThreadRows = numThreads > 1 ? 2 : 1; - - const IntType minBlockSize = 5; - - const IntType colBlockSize = - std::min((n + numThreadCols - 1) / numThreadCols, minBlockSize); - const IntType rowBlockSize = - std::min((m + numThreadRows - 1) / numThreadRows, minBlockSize); - - SPLA_OMP_PRAGMA("omp parallel for schedule(dynamic) collapse(2) num_threads(numThreads)") - for (IntType col = 0; col < n; col += colBlockSize) { - for (IntType row = 0; row < m; row += rowBlockSize) { - const IntType currentCols = std::min(viewC.dim_outer() - col, colBlockSize); - const IntType currentRows = std::min(viewC.dim_inner() - row, rowBlockSize); - const IntType rowA = opA == SplaOperation::SPLA_OP_NONE ? row : 0; - const IntType colA = opA == SplaOperation::SPLA_OP_NONE ? 0 : row; - const IntType rowB = opB == SplaOperation::SPLA_OP_NONE ? 0 : col; - const IntType colB = opB == SplaOperation::SPLA_OP_NONE ? col : 0; - blas::gemm(blas::Order::COL_MAJOR, opBlasA, opBlasB, currentRows, currentCols, k, alpha, - viewA.size() ? &viewA(colA, rowA) : nullptr, lda, - viewB.size() ? 
&viewB(colB, rowB) : nullptr, ldb, beta, &viewC(col, row), ldc); - } - } + blas::gemm(blas::Order::COL_MAJOR, opBlasA, opBlasB, m, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -template auto gemm_host(IntType numThreads, SplaOperation opA, SplaOperation opB, IntType m, - IntType n, IntType k, float alpha, const float *A, IntType lda, - const float *B, IntType ldb, float beta, float *C, IntType ldc) - -> void; +template auto gemm_host(SplaOperation opA, SplaOperation opB, IntType m, IntType n, + IntType k, float alpha, const float *A, IntType lda, const float *B, + IntType ldb, float beta, float *C, IntType ldc) -> void; -template auto gemm_host(IntType numThreads, SplaOperation opA, SplaOperation opB, IntType m, - IntType n, IntType k, double alpha, const double *A, IntType lda, +template auto gemm_host(SplaOperation opA, SplaOperation opB, IntType m, IntType n, + IntType k, double alpha, const double *A, IntType lda, const double *B, IntType ldb, double beta, double *C, IntType ldc) -> void; -template auto gemm_host>(IntType numThreads, SplaOperation opA, - SplaOperation opB, IntType m, IntType n, IntType k, - std::complex alpha, +template auto gemm_host>(SplaOperation opA, SplaOperation opB, IntType m, + IntType n, IntType k, std::complex alpha, const std::complex *A, IntType lda, const std::complex *B, IntType ldb, std::complex beta, std::complex *C, IntType ldc) -> void; -template auto gemm_host>(IntType numThreads, SplaOperation opA, - SplaOperation opB, IntType m, IntType n, IntType k, - std::complex alpha, +template auto gemm_host>(SplaOperation opA, SplaOperation opB, IntType m, + IntType n, IntType k, std::complex alpha, const std::complex *A, IntType lda, const std::complex *B, IntType ldb, std::complex beta, std::complex *C, diff --git a/src/gemm/gemm_host.hpp b/src/gemm/gemm_host.hpp index 9d7d5bc..363e25c 100644 --- a/src/gemm/gemm_host.hpp +++ b/src/gemm/gemm_host.hpp @@ -34,9 +34,8 @@ namespace spla { template -void gemm_host(IntType numThreads, 
SplaOperation opA, SplaOperation opB, IntType m, IntType n, - IntType k, T alpha, const T *A, IntType lda, const T *B, IntType ldb, T beta, T *C, - IntType ldc); +void gemm_host(SplaOperation opA, SplaOperation opB, IntType m, IntType n, IntType k, T alpha, + const T *A, IntType lda, const T *B, IntType ldb, T beta, T *C, IntType ldc); } // namespace spla #endif diff --git a/src/pgemm_sbs/pgemm_sbs_gpu.cpp b/src/pgemm_sbs/pgemm_sbs_gpu.cpp index 8ec5bb0..b92eb6e 100644 --- a/src/pgemm_sbs/pgemm_sbs_gpu.cpp +++ b/src/pgemm_sbs/pgemm_sbs_gpu.cpp @@ -52,7 +52,6 @@ #include "util/block_size_selection.hpp" #include "util/check_gemm_param.hpp" #include "util/common_types.hpp" -#include "util/omp_definitions.hpp" namespace spla { /* diff --git a/src/pgemm_sbs/pgemm_sbs_host.cpp b/src/pgemm_sbs/pgemm_sbs_host.cpp index e84c56f..93f5136 100644 --- a/src/pgemm_sbs/pgemm_sbs_host.cpp +++ b/src/pgemm_sbs/pgemm_sbs_host.cpp @@ -46,11 +46,9 @@ #include "spla/spla.hpp" #include "timing/timing.hpp" #include "util/blas_interface.hpp" -#include "util/blas_threads_guard.hpp" #include "util/block_size_selection.hpp" #include "util/check_gemm_param.hpp" #include "util/common_types.hpp" -#include "util/omp_definitions.hpp" namespace spla { @@ -101,12 +99,10 @@ void pgemm_sbs_host_internal(int mLocal, int n, int k, T alpha, const T *A, int auto &comms = descB.get_comms(numTiles); std::array, numTiles> tiles{ - RingSBSHost{ringThreshold, maxBlockSize, ctx.num_threads(), comms[0], - ctx.allocators().host(), gen, alpha, viewA, viewB, bRowOffset, - bColOffset, beta, viewC}, - RingSBSHost{ringThreshold, maxBlockSize, ctx.num_threads(), comms[1], - ctx.allocators().host(), gen, alpha, viewA, viewB, bRowOffset, - bColOffset, beta, viewC}}; + RingSBSHost{ringThreshold, maxBlockSize, comms[0], ctx.allocators().host(), gen, + alpha, viewA, viewB, bRowOffset, bColOffset, beta, viewC}, + RingSBSHost{ringThreshold, maxBlockSize, comms[1], ctx.allocators().host(), gen, + alpha, viewA, viewB, 
bRowOffset, bColOffset, beta, viewC}}; std::vector blocks; blocks.reserve(descB.comm().size()); @@ -182,7 +178,7 @@ void pgemm_sbs_host(int mLocal, int n, int k, T alpha, const T *A, int lda, cons // Check if local operations only if (descB.comm().size() == 1 || descB.type() == SplaDistributionType::SPLA_DIST_MIRROR) { - return gemm_host(ctx.num_threads(), SPLA_OP_NONE, SPLA_OP_NONE, mLocal, n, k, alpha, A, lda, + return gemm_host(SPLA_OP_NONE, SPLA_OP_NONE, mLocal, n, k, alpha, A, lda, B + bRowOffset + bColOffset * ldb, ldb, beta, C, ldc); } diff --git a/src/pgemm_sbs/ring_sbs_host.cpp b/src/pgemm_sbs/ring_sbs_host.cpp index 8a5e412..272ec81 100644 --- a/src/pgemm_sbs/ring_sbs_host.cpp +++ b/src/pgemm_sbs/ring_sbs_host.cpp @@ -51,7 +51,7 @@ static constexpr int ringTag = 2; template RingSBSHost::RingSBSHost( - double ringThreshold, IntType maxBlockSize, IntType numThreads, MPICommunicatorHandle comm, + double ringThreshold, IntType maxBlockSize, MPICommunicatorHandle comm, const std::shared_ptr> allocator, BLOCK_GEN baseMatGen, ValueType alpha, const HostArrayConstView2D &A, const HostArrayConstView2D &B, IntType bRowOffset, IntType bColOffset, ValueType beta, HostArrayView2D C) @@ -66,7 +66,6 @@ RingSBSHost::RingSBSHost( bColOffset_(bColOffset), alpha_(alpha), beta_(beta), - numThreads_(numThreads), maxBlockSize_(maxBlockSize), ringThreshold_(ringThreshold) { assert(A_.dim_inner() == C_.dim_inner()); @@ -194,10 +193,9 @@ auto RingSBSHost::process_step_ring(std::unordered_set &b betaColIndeces.emplace(block.col); beta = beta_; } - gemm_host(numThreads_, SplaOperation::SPLA_OP_NONE, SplaOperation::SPLA_OP_NONE, - A_.dim_inner(), block.numCols, block.numRows, alpha_, &A_(block.row, 0), - A_.ld_inner(), sendView_.data(), block.numRows, beta, &C_(block.col, 0), - C_.ld_inner()); + gemm_host(SplaOperation::SPLA_OP_NONE, SplaOperation::SPLA_OP_NONE, A_.dim_inner(), + block.numCols, block.numRows, alpha_, &A_(block.row, 0), A_.ld_inner(), + sendView_.data(), 
block.numRows, beta, &C_(block.col, 0), C_.ld_inner()); } } state_ = stepIdx_ >= comm_.size() - 1 ? TileState::Empty : TileState::PartiallyProcessed; @@ -224,10 +222,9 @@ auto RingSBSHost::process_step_broadcast(std::unordered_set(numThreads_, SplaOperation::SPLA_OP_NONE, SplaOperation::SPLA_OP_NONE, - A_.dim_inner(), block.numCols, block.numRows, alpha_, &A_(block.row, 0), - A_.ld_inner(), blockView.data(), block.numRows, beta, &C_(block.col, 0), - C_.ld_inner()); + gemm_host(SplaOperation::SPLA_OP_NONE, SplaOperation::SPLA_OP_NONE, A_.dim_inner(), + block.numCols, block.numRows, alpha_, &A_(block.row, 0), A_.ld_inner(), + blockView.data(), block.numRows, beta, &C_(block.col, 0), C_.ld_inner()); } } diff --git a/src/pgemm_sbs/ring_sbs_host.hpp b/src/pgemm_sbs/ring_sbs_host.hpp index b5c7656..ecca1be 100644 --- a/src/pgemm_sbs/ring_sbs_host.hpp +++ b/src/pgemm_sbs/ring_sbs_host.hpp @@ -55,9 +55,9 @@ class RingSBSHost { public: using ValueType = T; - RingSBSHost(double ringThreshold, IntType maxBlockSize, IntType numThreads, - MPICommunicatorHandle comm, const std::shared_ptr> allocator, - BLOCK_GEN baseMatGen, ValueType alpha, const HostArrayConstView2D& A, + RingSBSHost(double ringThreshold, IntType maxBlockSize, MPICommunicatorHandle comm, + const std::shared_ptr> allocator, BLOCK_GEN baseMatGen, + ValueType alpha, const HostArrayConstView2D& A, const HostArrayConstView2D& B, IntType bRowOffset, IntType bColOffset, ValueType beta, HostArrayView2D C); @@ -101,7 +101,6 @@ class RingSBSHost { HostArrayView2D C_; const IntType bRowOffset_, bColOffset_; const ValueType alpha_, beta_; - const IntType numThreads_; const IntType maxBlockSize_; const double ringThreshold_; }; diff --git a/src/pgemm_ssb/pgemm_ssb_gpu.cpp b/src/pgemm_ssb/pgemm_ssb_gpu.cpp index 980ceee..0bf29c1 100644 --- a/src/pgemm_ssb/pgemm_ssb_gpu.cpp +++ b/src/pgemm_ssb/pgemm_ssb_gpu.cpp @@ -53,7 +53,6 @@ #include "util/block_size_selection.hpp" #include "util/check_gemm_param.hpp" #include 
"util/common_types.hpp" -#include "util/omp_definitions.hpp" namespace spla { diff --git a/src/pgemm_ssb/pgemm_ssb_host.cpp b/src/pgemm_ssb/pgemm_ssb_host.cpp index e4f9e7f..45eea5f 100644 --- a/src/pgemm_ssb/pgemm_ssb_host.cpp +++ b/src/pgemm_ssb/pgemm_ssb_host.cpp @@ -44,11 +44,9 @@ #include "spla/types.h" #include "timing/timing.hpp" #include "util/blas_interface.hpp" -#include "util/blas_threads_guard.hpp" #include "util/block_size_selection.hpp" #include "util/check_gemm_param.hpp" #include "util/common_types.hpp" -#include "util/omp_definitions.hpp" namespace spla { @@ -85,11 +83,10 @@ void pgemm_ssb_host_internal(int m, int n, int kLocal, SplaOperation opA, T alph auto &comms = descC.get_comms(numTiles); std::array, numTiles> tiles{ - RingSSBHost{ringThreshold, maxBlockSize, ctx.num_threads(), comms[0], - ctx.allocators().host(), gen, opA, alpha, viewA, viewB, beta, viewC}, - RingSSBHost{ringThreshold, maxBlockSize, ctx.num_threads(), comms[1], - ctx.allocators().host(), gen, opA, alpha, viewA, viewB, beta, - viewC}}; + RingSSBHost{ringThreshold, maxBlockSize, comms[0], ctx.allocators().host(), gen, + opA, alpha, viewA, viewB, beta, viewC}, + RingSSBHost{ringThreshold, maxBlockSize, comms[1], ctx.allocators().host(), gen, + opA, alpha, viewA, viewB, beta, viewC}}; std::vector blocks; blocks.reserve(descC.comm().size()); @@ -169,8 +166,8 @@ void pgemm_ssb_host(int m, int n, int kLocal, SplaOperation opA, T alpha, const } if (descC.comm().size() == 1) { - return gemm_host(ctx.num_threads(), opA, SPLA_OP_NONE, m, n, kLocal, alpha, A, lda, B, ldb, - beta, C + cRowOffset + cColOffset * ldc, ldc); + return gemm_host(opA, SPLA_OP_NONE, m, n, kLocal, alpha, A, lda, B, ldb, beta, + C + cRowOffset + cColOffset * ldc, ldc); } if (descC.type() == SplaDistributionType::SPLA_DIST_BLACS_BLOCK_CYCLIC) { diff --git a/src/pgemm_ssb/ring_ssb_host.cpp b/src/pgemm_ssb/ring_ssb_host.cpp index 6cdb74d..d622d84 100644 --- a/src/pgemm_ssb/ring_ssb_host.cpp +++ 
b/src/pgemm_ssb/ring_ssb_host.cpp @@ -52,7 +52,7 @@ static constexpr int ringTag = 2; template RingSSBHost::RingSSBHost(double ringThreshold, IntType maxBlockSize, - IntType numThreads, MPICommunicatorHandle comm, + MPICommunicatorHandle comm, const std::shared_ptr> allocator, BLOCK_GEN baseMatGen, SplaOperation opA, ValueType alpha, const HostArrayConstView2D &A, @@ -69,7 +69,6 @@ RingSSBHost::RingSSBHost(double ringThreshold, IntType maxBlockSiz alpha_(alpha), beta_(beta), opA_(opA), - numThreads_(numThreads), maxBlockSize_(maxBlockSize), ringThreshold_(ringThreshold) { assert(A_.dim_inner() == B_.dim_inner()); @@ -162,9 +161,9 @@ auto RingSSBHost::process_step_ring() -> void { const auto &block = blocks_[blockIdx]; if (A_.dim_inner() != 0) { SCOPED_TIMING("gemm") - gemm_host(numThreads_, opA_, SplaOperation::SPLA_OP_NONE, block.numRows, block.numCols, - A_.dim_inner(), alpha_, &A_(block.row, 0), A_.ld_inner(), &B_(block.col, 0), - B_.ld_inner(), 1.0, sendView_.data(), block.numRows); + gemm_host(opA_, SplaOperation::SPLA_OP_NONE, block.numRows, block.numCols, A_.dim_inner(), + alpha_, &A_(block.row, 0), A_.ld_inner(), &B_(block.col, 0), B_.ld_inner(), 1.0, + sendView_.data(), block.numRows); } if (stepIdx_ < comm_.size() - 1) { // continue sending around in ring SCOPED_TIMING("send") @@ -216,9 +215,9 @@ auto RingSSBHost::process_step_reduction() -> void { std::memset(sendView_.data(), 0, sendView_.size() * sizeof(T)); } else { SCOPED_TIMING("gemm") - gemm_host(numThreads_, opA_, SplaOperation::SPLA_OP_NONE, block.numRows, block.numCols, - A_.dim_inner(), alpha_, &A_(block.row, 0), A_.ld_inner(), &B_(block.col, 0), - B_.ld_inner(), 0.0, sendView_.data(), block.numRows); + gemm_host(opA_, SplaOperation::SPLA_OP_NONE, block.numRows, block.numCols, A_.dim_inner(), + alpha_, &A_(block.row, 0), A_.ld_inner(), &B_(block.col, 0), B_.ld_inner(), 0.0, + sendView_.data(), block.numRows); } START_TIMING("iallreduce") diff --git a/src/pgemm_ssb/ring_ssb_host.hpp 
b/src/pgemm_ssb/ring_ssb_host.hpp index 4d2ebd5..8ed2ae5 100644 --- a/src/pgemm_ssb/ring_ssb_host.hpp +++ b/src/pgemm_ssb/ring_ssb_host.hpp @@ -56,11 +56,11 @@ class RingSSBHost { public: using ValueType = T; - RingSSBHost(double ringThreshold, IntType maxBlockSize, IntType numThreads, - MPICommunicatorHandle comm, const std::shared_ptr> allocator, - BLOCK_GEN baseMatGen, SplaOperation opA, ValueType alpha, - const HostArrayConstView2D &A, const HostArrayConstView2D &B, - ValueType beta, HostArrayView2D C); + RingSSBHost(double ringThreshold, IntType maxBlockSize, MPICommunicatorHandle comm, + const std::shared_ptr> allocator, BLOCK_GEN baseMatGen, + SplaOperation opA, ValueType alpha, const HostArrayConstView2D &A, + const HostArrayConstView2D &B, ValueType beta, + HostArrayView2D C); // Prepare to process input blocks auto prepare(std::vector::const_iterator begin, std::vector::const_iterator end) @@ -107,7 +107,6 @@ class RingSSBHost { HostArrayView2D C_; const ValueType alpha_, beta_; const SplaOperation opA_; - const IntType numThreads_; const IntType maxBlockSize_; const double ringThreshold_; }; diff --git a/src/spla/context.cpp b/src/spla/context.cpp index 7644505..86ee9a9 100644 --- a/src/spla/context.cpp +++ b/src/spla/context.cpp @@ -40,8 +40,6 @@ Context::Context(SplaProcessingUnit pu) : ctxInternal_(new ContextInternal(pu)) SplaProcessingUnit Context::processing_unit() const { return ctxInternal_->processing_unit(); } -int Context::num_threads() const { return ctxInternal_->num_threads(); } - int Context::num_tiles() const { return ctxInternal_->num_tiles(); } int Context::tile_size_host() const { return ctxInternal_->tile_size_host(); } @@ -72,10 +70,8 @@ std::uint_least64_t Context::allocated_memory_gpu() const { #endif } -void Context::set_num_threads(int numThreads) { ctxInternal_->set_num_threads(numThreads); } - -void Context::set_num_tiles(int numTilesPerThread) { - ctxInternal_->set_num_tiles(numTilesPerThread); +void Context::set_num_tiles(int 
numTiles) { + ctxInternal_->set_num_tiles(numTiles); } void Context::set_tile_size_host(int tileSizeHost) { @@ -149,20 +145,6 @@ SplaError spla_ctx_destroy(SplaContext* ctx) { return SplaError::SPLA_SUCCESS; } -SplaError spla_ctx_num_threads(SplaContext ctx, int* numThreads) { - if (!ctx) { - return SplaError::SPLA_INVALID_HANDLE_ERROR; - } - try { - *numThreads = reinterpret_cast(ctx)->num_threads(); - } catch (const spla::GenericError& e) { - return e.error_code(); - } catch (...) { - return SplaError::SPLA_UNKNOWN_ERROR; - } - return SplaError::SPLA_SUCCESS; -} - SplaError spla_ctx_num_tiles(SplaContext ctx, int* numTiles) { if (!ctx) { return SplaError::SPLA_INVALID_HANDLE_ERROR; @@ -275,20 +257,6 @@ SplaError spla_ctx_allocated_memory_gpu(SplaContext ctx, uint_least64_t* size) { return SplaError::SPLA_SUCCESS; } -SplaError spla_ctx_set_num_threads(SplaContext ctx, int numThreads) { - if (!ctx) { - return SplaError::SPLA_INVALID_HANDLE_ERROR; - } - try { - reinterpret_cast(ctx)->set_num_threads(numThreads); - } catch (const spla::GenericError& e) { - return e.error_code(); - } catch (...) { - return SplaError::SPLA_UNKNOWN_ERROR; - } - return SplaError::SPLA_SUCCESS; -} - SplaError spla_ctx_set_num_tiles(SplaContext ctx, int numTiles) { if (!ctx) { return SplaError::SPLA_INVALID_HANDLE_ERROR; diff --git a/src/spla/context_internal.hpp b/src/spla/context_internal.hpp index 8867358..280b717 100644 --- a/src/spla/context_internal.hpp +++ b/src/spla/context_internal.hpp @@ -44,7 +44,6 @@ #include "spla/context.hpp" #include "spla/exceptions.hpp" #include "util/common_types.hpp" -#include "util/omp_definitions.hpp" #if defined(SPLA_CUDA) || defined(SPLA_ROCM) #include "gpu_util/gpu_blas_handle.hpp" @@ -58,7 +57,6 @@ class ContextInternal { public: explicit ContextInternal(SplaProcessingUnit pu) : pu_(pu), - numThreads_(omp_get_max_threads()), numTiles_(4), tileSizeHost_(pu == SplaProcessingUnit::SPLA_PU_HOST ? 
500 : 1500), tileSizeGPU_(2048), @@ -105,8 +103,6 @@ class ContextInternal { inline auto processing_unit() const -> SplaProcessingUnit { return pu_; } - inline auto num_threads() const -> IntType { return numThreads_; } - inline auto num_tiles() const -> IntType { return numTiles_; } inline auto tile_size_host() const -> IntType { return tileSizeHost_; } @@ -123,16 +119,9 @@ class ContextInternal { // Set methods - inline auto set_num_threads(IntType numThreads) -> void { - if (numThreads > 0) - numThreads_ = numThreads; - else - numThreads_ = omp_get_max_threads(); - } - - inline auto set_num_tiles(IntType numTilesPerThread) -> void { - if (numTilesPerThread < 1) throw InvalidParameterError(); - numTiles_ = numTilesPerThread; + inline auto set_num_tiles(IntType numTiles) -> void { + if (numTiles < 1) throw InvalidParameterError(); + numTiles_ = numTiles; } inline auto set_tile_size_host(IntType tileSizeHost) -> void { @@ -152,7 +141,6 @@ class ContextInternal { private: SplaProcessingUnit pu_; - IntType numThreads_; IntType numTiles_; IntType tileSizeHost_; IntType tileSizeGPU_; diff --git a/src/spla/gemm.cpp b/src/spla/gemm.cpp index 9f8d65e..dbb1b1c 100644 --- a/src/spla/gemm.cpp +++ b/src/spla/gemm.cpp @@ -41,8 +41,7 @@ namespace spla { void gemm(SplaOperation opA, SplaOperation opB, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, Context &ctx) { if (ctx.processing_unit() == SplaProcessingUnit::SPLA_PU_HOST) { - gemm_host(ctx.ctxInternal_->num_threads(), opA, opB, m, n, k, alpha, A, lda, B, ldb, - beta, C, ldc); + gemm_host(opA, opB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } else { #if defined(SPLA_CUDA) || defined(SPLA_ROCM) gemm_gpu(opA, opB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, *(ctx.ctxInternal_)); @@ -55,7 +54,7 @@ void gemm(SplaOperation opA, SplaOperation opB, int m, int n, int k, float alpha void gemm(SplaOperation opA, SplaOperation opB, int m, int n, int k, double 
alpha, const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc, Context &ctx) { if (ctx.processing_unit() == SplaProcessingUnit::SPLA_PU_HOST) { - gemm_host(ctx.ctxInternal_->num_threads(), opA, opB, m, n, k, alpha, A, lda, B, ldb, + gemm_host( opA, opB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } else { #if defined(SPLA_CUDA) || defined(SPLA_ROCM) @@ -70,8 +69,7 @@ void gemm(SplaOperation opA, SplaOperation opB, int m, int n, int k, std::comple const std::complex *A, int lda, const std::complex *B, int ldb, std::complex beta, std::complex *C, int ldc, Context &ctx) { if (ctx.processing_unit() == SplaProcessingUnit::SPLA_PU_HOST) { - gemm_host>(ctx.ctxInternal_->num_threads(), opA, opB, m, n, k, alpha, A, - lda, B, ldb, beta, C, ldc); + gemm_host>(opA, opB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } else { #if defined(SPLA_CUDA) || defined(SPLA_ROCM) gemm_gpu( @@ -90,8 +88,7 @@ void gemm(SplaOperation opA, SplaOperation opB, int m, int n, int k, std::comple const std::complex *A, int lda, const std::complex *B, int ldb, std::complex beta, std::complex *C, int ldc, Context &ctx) { if (ctx.processing_unit() == SplaProcessingUnit::SPLA_PU_HOST) { - gemm_host>(ctx.ctxInternal_->num_threads(), opA, opB, m, n, k, alpha, A, - lda, B, ldb, beta, C, ldc); + gemm_host>(opA, opB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } else { #if defined(SPLA_CUDA) || defined(SPLA_ROCM) gemm_gpu( diff --git a/src/util/blas_interface.cpp b/src/util/blas_interface.cpp index 7ef7288..a88c567 100644 --- a/src/util/blas_interface.cpp +++ b/src/util/blas_interface.cpp @@ -151,56 +151,5 @@ auto gemm(Order order, Operation transA, Operation transB, IntType M, IntType N, static_cast(ldc)); } -auto get_num_threads() -> IntType { -#if defined(SPLA_BLAS_OPENBLAS) && defined(SPLA_BLAS_HEADER_NAME) - return openblas_get_num_threads(); -#elif defined(SPLA_BLAS_MKL) && defined(SPLA_BLAS_HEADER_NAME) - return mkl_get_max_threads(); -#elif 
defined(SPLA_BLAS_ARMPL) && defined(SPLA_BLAS_HEADER_NAME) - return armpl_get_num_threads(); -#elif defined(SPLA_BLAS_BLIS) && defined(SPLA_BLAS_HEADER_NAME) - return bli_thread_get_num_threads(); -#else - return 1; -#endif -} - -auto set_num_threads(IntType numThreads) -> void { -#if defined(SPLA_BLAS_OPENBLAS) && defined(SPLA_BLAS_HEADER_NAME) - openblas_set_num_threads(numThreads); -#elif defined(SPLA_BLAS_MKL) && defined(SPLA_BLAS_HEADER_NAME) - mkl_set_num_threads(numThreads); -#elif defined(SPLA_BLAS_ARMPL) && defined(SPLA_BLAS_HEADER_NAME) - armpl_set_num_threads(numThreads); -#elif defined(SPLA_BLAS_BLIS) && defined(SPLA_BLAS_HEADER_NAME) - bli_thread_set_num_threads(numThreads); -#endif -} - -auto is_parallel() -> bool { -#if defined(SPLA_BLAS_OPENBLAS) && defined(SPLA_BLAS_HEADER_NAME) - return openblas_get_parallel(); -#elif defined(SPLA_BLAS_MKL) && defined(SPLA_BLAS_HEADER_NAME) - return mkl_get_max_threads() != 1; -#elif defined(SPLA_BLAS_ARMPL) && defined(SPLA_BLAS_HEADER_NAME) - return armpl_get_max_threads() != 1; -#elif defined(SPLA_BLAS_BLIS) && defined(SPLA_BLAS_HEADER_NAME) - return bli_info_get_enable_threading(); -#elif defined(SPLA_BLAS_SCI) - return true; -#else - return false; -#endif -} - -auto is_thread_safe() -> bool { -#if defined(SPLA_BLAS_OPENBLAS) || defined(SPLA_BLAS_UNKNOWN) - // OpenBLAS is not thread-safe and unknown blas library may not be either - return false; -#else - return true; -#endif -} - } // namespace blas } // namespace spla diff --git a/src/util/blas_interface.hpp b/src/util/blas_interface.hpp index 60f12c3..d3cfb32 100644 --- a/src/util/blas_interface.hpp +++ b/src/util/blas_interface.hpp @@ -39,14 +39,6 @@ namespace blas { enum class Order { ROW_MAJOR = 101, COL_MAJOR = 102 }; enum class Operation { NONE = 111, TRANS = 112, CONJ_TRANS = 113 }; -auto is_parallel() -> bool; - -auto is_thread_safe() -> bool; - -auto get_num_threads() -> IntType; - -auto set_num_threads(IntType numThreads) -> void; - auto gemm(Order 
order, Operation transA, Operation transB, IntType M, IntType N, IntType K, float alpha, const float *A, IntType lda, const float *B, IntType ldb, float beta, float *C, IntType ldc) -> void; diff --git a/src/util/blas_threads_guard.hpp b/src/util/blas_threads_guard.hpp deleted file mode 100644 index 525be8c..0000000 --- a/src/util/blas_threads_guard.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2020 ETH Zurich, Simon Frasch - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ -#ifndef SPLA_BLAS_THREADS_GUARD_HPP -#define SPLA_BLAS_THREADS_GUARD_HPP - -#include "util/blas_interface.hpp" -#include "util/common_types.hpp" - -namespace spla { -class BlasThreadsGuard { -public: - explicit BlasThreadsGuard(IntType numThreadsTarget) - : orignalNumThreads_(blas::get_num_threads()), numThreadsSet_(false) { - if (orignalNumThreads_ != numThreadsTarget) { - blas::set_num_threads(numThreadsTarget); - numThreadsSet_ = true; - } - } - - BlasThreadsGuard() = delete; - - BlasThreadsGuard(const BlasThreadsGuard&) = delete; - - BlasThreadsGuard(BlasThreadsGuard&&) = delete; - - auto operator=(const BlasThreadsGuard&) -> BlasThreadsGuard& = delete; - - auto operator=(BlasThreadsGuard &&) -> BlasThreadsGuard& = delete; - - ~BlasThreadsGuard() { - if (numThreadsSet_) blas::set_num_threads(orignalNumThreads_); - } - -private: - IntType orignalNumThreads_; - bool numThreadsSet_; -}; -} // namespace spla - -#endif // SPLA_BLAS_THREADS_GUARD_HPP diff --git a/src/util/omp_definitions.hpp b/src/util/omp_definitions.hpp deleted file mode 100644 index 9719355..0000000 --- a/src/util/omp_definitions.hpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2020 ETH Zurich, Simon Frasch - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef SPLA_OMP_DEFINITIONS_HPP -#define SPLA_OMP_DEFINITIONS_HPP - -#include "spla/config.h" -#ifdef SPLA_OMP -#include -#define SPLA_OMP_PRAGMA(content) _Pragma(content) -#else -#define SPLA_OMP_PRAGMA(content) -namespace spla { -inline int omp_get_num_threads() { return 1; } -inline int omp_get_thread_num() { return 0; } -inline int omp_get_max_threads() { return 1; } -inline int omp_in_parallel() { return 0; } -inline int omp_get_num_procs() { return 1; } -} // namespace spla -#endif - -#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ff8e5e6..7649385 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -20,9 +20,9 @@ if(SPLA_BUNDLED_GOOGLETEST) ) FetchContent_MakeAvailable(googletest) else() - find_package(googletest CONFIG REQUIRED) + find_package(GTest CONFIG REQUIRED) endif() -list(APPEND SPLA_TEST_LIBRARIES gtest_main) +list(APPEND SPLA_TEST_LIBRARIES GTest::gtest) # add command line parser if(SPLA_BUNDLED_CLI11) diff --git a/tests/gtest_mpi.cpp b/tests/gtest_mpi.cpp index ea82184..5fdefed 100644 --- a/tests/gtest_mpi.cpp +++ b/tests/gtest_mpi.cpp @@ -1,8 +1,13 @@ +#include "gtest_mpi.hpp" + #include -#include #include + +#include 
+#include +#include +#include #include -#include "gtest_mpi.hpp" namespace gtest_mpi { @@ -17,109 +22,97 @@ class MPIListener : public testing::EmptyTestEventListener { using TestSuite = testing::TestSuite; MPIListener(testing::TestEventListener *listener) - : listener_(listener), comm_(MPI_COMM_WORLD), forward_calls_(false) { + : listener_(listener), comm_(MPI_COMM_WORLD), gather_called_(false) { MPI_Comm_dup(MPI_COMM_WORLD, &comm_); int rank; MPI_Comm_rank(comm_, &rank); - if (rank != 0) - listener_.reset(); + if (rank != 0) listener_.reset(); } void OnTestProgramStart(const UnitTest &u) override { - if (listener_) - listener_->OnTestProgramStart(u); + if (listener_) listener_->OnTestProgramStart(u); } void OnTestProgramEnd(const UnitTest &u) override { - if (listener_) - listener_->OnTestProgramEnd(u); + if (listener_) listener_->OnTestProgramEnd(u); } void OnTestStart(const TestInfo &test_info) override { - if (listener_) - listener_->OnTestStart(test_info); + gather_called_ = false; + if (listener_) listener_->OnTestStart(test_info); } void OnTestPartResult(const TestPartResult &test_part_result) override { - if (listener_){ + if (listener_) { listener_->OnTestPartResult(test_part_result); } else if (test_part_result.type() == TestPartResult::Type::kFatalFailure || - test_part_result.type() == - TestPartResult::Type::kNonFatalFailure) { - std::size_t fileIndex = strings_.size(); - strings_ += test_part_result.file_name() ; + test_part_result.type() == TestPartResult::Type::kNonFatalFailure) { + std::size_t file_index = strings_.size(); + strings_ += test_part_result.file_name(); strings_ += '\0'; - std::size_t messageIndex = strings_.size(); + std::size_t message_index = strings_.size(); strings_ += test_part_result.message(); strings_ += '\0'; - infos_.emplace_back(ResultInfo{test_part_result.type(), fileIndex, - test_part_result.line_number(), - messageIndex}); + infos_.emplace_back(ResultInfo{test_part_result.type(), file_index, + 
test_part_result.line_number(), message_index}); } } void OnTestEnd(const TestInfo &test_info) override { - if (listener_) - listener_->OnTestEnd(test_info); - } + if (!gather_called_) { + std::cerr << "Missing GTEST_MPI_GUARD in test case!" << std::endl; + throw std::runtime_error("Missing GTEST_MPI_GUARD in test case!"); + } - void OnTestIterationStart(const UnitTest & u, - int it) override { - if (listener_) - listener_->OnTestIterationStart(u, it); + if (listener_) listener_->OnTestEnd(test_info); } - void OnEnvironmentsSetUpStart(const UnitTest & u) override { - if (listener_) - listener_->OnEnvironmentsSetUpStart(u); + void OnTestIterationStart(const UnitTest &u, int it) override { + if (listener_) listener_->OnTestIterationStart(u, it); } - void OnEnvironmentsSetUpEnd(const UnitTest & u) override { - if (listener_) - listener_->OnEnvironmentsSetUpEnd(u); + void OnEnvironmentsSetUpStart(const UnitTest &u) override { + if (listener_) listener_->OnEnvironmentsSetUpStart(u); } - void OnTestSuiteStart(const TestSuite & t) override { - if (listener_) - listener_->OnTestSuiteStart(t); + void OnEnvironmentsSetUpEnd(const UnitTest &u) override { + if (listener_) listener_->OnEnvironmentsSetUpEnd(u); } - void OnTestDisabled(const TestInfo & t) override { - if (listener_) - listener_->OnTestDisabled(t); + void OnTestSuiteStart(const TestSuite &t) override { + if (listener_) listener_->OnTestSuiteStart(t); + } + void OnTestDisabled(const TestInfo &t) override { + if (listener_) listener_->OnTestDisabled(t); } - void OnTestSuiteEnd(const TestSuite & t) override { - if (listener_) - listener_->OnTestSuiteEnd(t); + void OnTestSuiteEnd(const TestSuite &t) override { + if (listener_) listener_->OnTestSuiteEnd(t); } - void OnEnvironmentsTearDownStart(const UnitTest & u) override { - if (listener_) - listener_->OnEnvironmentsTearDownStart(u); + void OnEnvironmentsTearDownStart(const UnitTest &u) override { + if (listener_) listener_->OnEnvironmentsTearDownStart(u); } - void 
OnEnvironmentsTearDownEnd(const UnitTest & u) override { - if (listener_) - listener_->OnEnvironmentsTearDownEnd(u); + void OnEnvironmentsTearDownEnd(const UnitTest &u) override { + if (listener_) listener_->OnEnvironmentsTearDownEnd(u); } - void OnTestIterationEnd(const UnitTest & u, - int it) override { - if (listener_) - listener_->OnTestIterationEnd(u, it); + void OnTestIterationEnd(const UnitTest &u, int it) override { + if (listener_) listener_->OnTestIterationEnd(u, it); } void GatherPartResults() { + gather_called_ = true; int rank, n_proc; MPI_Comm_rank(comm_, &rank); MPI_Comm_size(comm_, &n_proc); if (rank == 0) { - decltype(infos_) remoteInfos; - decltype(strings_) remoteStrings; + decltype(infos_) remote_infos; + decltype(strings_) remote_strings; for (int r = 1; r < n_proc; ++r) { MPI_Status status; int count; @@ -127,42 +120,38 @@ class MPIListener : public testing::EmptyTestEventListener { // Result infos MPI_Probe(r, 0, comm_, &status); MPI_Get_count(&status, MPI_CHAR, &count); - auto numResults = static_cast(count) / sizeof(decltype(remoteInfos)::value_type); - remoteInfos.resize(numResults); - MPI_Recv(remoteInfos.data(), count, MPI_BYTE, r, 0, comm_, - MPI_STATUS_IGNORE); + auto num_results = + static_cast(count) / sizeof(decltype(remote_infos)::value_type); + remote_infos.resize(num_results); + MPI_Recv(remote_infos.data(), count, MPI_BYTE, r, 0, comm_, MPI_STATUS_IGNORE); // Only continue if any results - if (numResults) { + if (num_results) { // Get strings MPI_Probe(r, 0, comm_, &status); MPI_Get_count(&status, MPI_CHAR, &count); - auto stringSize = static_cast(count) / - sizeof(decltype(remoteStrings)::value_type); - remoteStrings.resize(stringSize); - MPI_Recv(&remoteStrings[0], count, MPI_BYTE, r, 0, comm_, - MPI_STATUS_IGNORE); + auto string_size = + static_cast(count) / sizeof(decltype(remote_strings)::value_type); + remote_strings.resize(string_size); + MPI_Recv(&remote_strings[0], count, MPI_BYTE, r, 0, comm_, MPI_STATUS_IGNORE); // 
Create error for every remote fail - for (const auto &info : remoteInfos) { + for (const auto &info : remote_infos) { if (info.type == TestPartResult::Type::kFatalFailure || info.type == TestPartResult::Type::kNonFatalFailure) { - ADD_FAILURE_AT(&remoteStrings[info.fileIndex], info.lineNumber) - << "Rank " << r << ": " << &remoteStrings[info.messageIndex]; + ADD_FAILURE_AT(&remote_strings[info.file_index], info.line_number) + << "Rank " << r << ": " << &remote_strings[info.message_index]; } } - } } } else { - MPI_Send(infos_.data(), - infos_.size() * sizeof(decltype(infos_)::value_type), MPI_BYTE, - 0, 0, comm_); + MPI_Send(infos_.data(), infos_.size() * sizeof(decltype(infos_)::value_type), MPI_BYTE, 0, 0, + comm_); // Only send string if results exist - if(infos_.size()) { - MPI_Send(strings_.data(), - strings_.size() * sizeof(decltype(strings_)::value_type), + if (infos_.size()) { + MPI_Send(strings_.data(), strings_.size() * sizeof(decltype(strings_)::value_type), MPI_BYTE, 0, 0, comm_); } } @@ -174,14 +163,14 @@ class MPIListener : public testing::EmptyTestEventListener { private: struct ResultInfo { TestPartResult::Type type; - std::size_t fileIndex; - int lineNumber; - std::size_t messageIndex; + std::size_t file_index; + int line_number; + std::size_t message_index; }; std::unique_ptr listener_; MPI_Comm comm_; - bool forward_calls_; + bool gather_called_; std::vector infos_; std::string strings_; @@ -189,23 +178,21 @@ class MPIListener : public testing::EmptyTestEventListener { MPIListener *globalMPIListener = nullptr; -} // namespace +} // namespace void InitGoogleTestMPI(int *argc, char **argv) { - ::testing::InitGoogleTest(argc, argv); auto &test_listeners = ::testing::UnitTest::GetInstance()->listeners(); - globalMPIListener = new MPIListener( - test_listeners.Release(test_listeners.default_result_printer())); + globalMPIListener = + new MPIListener(test_listeners.Release(test_listeners.default_result_printer())); 
test_listeners.Append(globalMPIListener); } -TestGuard CreateTestGuard() -{ +TestGuard CreateTestGuard() { return TestGuard{[]() { globalMPIListener->GatherPartResults(); }}; } -} // namespace gtest_mpi +} // namespace gtest_mpi diff --git a/tests/programs/benchmark.cpp b/tests/programs/benchmark.cpp index 2e6b2f1..9c55bef 100644 --- a/tests/programs/benchmark.cpp +++ b/tests/programs/benchmark.cpp @@ -134,7 +134,6 @@ int main(int argc, char** argv) { app.add_option("-m", m, "Number of rows in C")->required(); app.add_option("-k", k, "Number of rows in A and B")->required(); app.add_option("-o", outputFileName, "Output file name")->default_val("timers.json"); - app.add_option("-t,--threads", numThreads, "Number of threads")->default_val("-1"); app.add_option("--type", typeName, "Data type") ->check(CLI::IsMember({"scalar", "complex"})) ->default_val("complex"); @@ -156,7 +155,6 @@ int main(int argc, char** argv) { procName == "cpu" ? SplaProcessingUnit::SPLA_PU_HOST : SplaProcessingUnit::SPLA_PU_GPU; spla::Context ctx(pu); ctx.set_tile_size_host(lengthTarget); - ctx.set_num_threads(numThreads); ctx.set_tile_size_gpu(4096); if (worldRank == 0) { @@ -169,7 +167,6 @@ int main(int argc, char** argv) { std::cout << "repeats = " << repeats << std::endl; std::cout << "proc = " << procName << std::endl; std::cout << "type = " << typeName << std::endl; - std::cout << "threads = " << ctx.num_threads() << std::endl; } spla::AllocatorCollection allocators; diff --git a/tests/programs/benchmark_scalapack.cpp b/tests/programs/benchmark_scalapack.cpp index 228a7d1..94ef035 100644 --- a/tests/programs/benchmark_scalapack.cpp +++ b/tests/programs/benchmark_scalapack.cpp @@ -102,7 +102,6 @@ void run_gemm(const std::shared_ptr>& alloca auto arrayDesc = spla::MatrixDistribution::create_blacs_block_cyclic( MPI_COMM_WORLD, 'R', worldSize, 1, blacsBlockSize, blacsBlockSize); - ctx.set_num_threads(numThreads); // run once to warm up spla::pgemm_ssb(colsA, colsB, localNumRows, 
SPLA_OP_CONJ_TRANSPOSE, 1.0, A.data(), localNumRows, @@ -165,7 +164,6 @@ int main(int argc, char** argv) { app.add_option("-n", colsB, "Number of columns in C")->required(); app.add_option("-m", colsA, "Number of rows in C")->required(); app.add_option("-k", rows, "Number of rows in A and B")->required(); - app.add_option("-t,--threads", numThreads, "Number of threads")->required(); app.add_option("-b,--blocksize", blacsBlockSize, "ScaLAPACK block size of C")->required(); app.add_option("-p", procName, "Processing unit") ->check(CLI::IsMember({"cpu", "gpu", "gpu-gpu"})) @@ -184,7 +182,6 @@ int main(int argc, char** argv) { procName == "cpu" ? SplaProcessingUnit::SPLA_PU_HOST : SplaProcessingUnit::SPLA_PU_GPU; spla::Context ctx(pu); ctx.set_tile_size_host(lengthTarget); - ctx.set_num_threads(numThreads); ctx.set_tile_size_gpu(4096); spla::AllocatorCollection allocators; diff --git a/tests/programs/run_tests.cpp b/tests/programs/run_tests.cpp index e565a3e..d2d5410 100644 --- a/tests/programs/run_tests.cpp +++ b/tests/programs/run_tests.cpp @@ -1,5 +1,4 @@ #include -#include // for MPI debugging #include "gtest/gtest.h" #include "gtest_mpi.hpp" diff --git a/tests/test_gemm.cpp b/tests/test_gemm.cpp index 0dc7a13..389fb05 100644 --- a/tests/test_gemm.cpp +++ b/tests/test_gemm.cpp @@ -157,6 +157,7 @@ typedef GemmTest GemmScalar; typedef GemmTest> GemmComplex; TEST_P(GemmScalar, Host) { + GTEST_MPI_GUARD try { this->mulitply_host(); } catch (const std::exception& e) { @@ -236,18 +237,18 @@ static auto param_type_names( return stream.str(); } -INSTANTIATE_TEST_CASE_P(FullGemmTest, GemmScalar, - ::testing::Combine(::testing::Values(1, 13, 32, 263), - ::testing::Values(1, 13, 32, 263), - ::testing::Values(1, 13, 32, 263), - ::testing::Values(SPLA_OP_NONE, SPLA_OP_CONJ_TRANSPOSE), - ::testing::Values(SPLA_OP_NONE, SPLA_OP_CONJ_TRANSPOSE)), - param_type_names); - -INSTANTIATE_TEST_CASE_P(FullGemmTest, GemmComplex, - ::testing::Combine(::testing::Values(1, 13, 32, 263), - 
::testing::Values(1, 13, 32, 263), - ::testing::Values(1, 13, 32, 263), - ::testing::Values(SPLA_OP_NONE, SPLA_OP_CONJ_TRANSPOSE), - ::testing::Values(SPLA_OP_NONE, SPLA_OP_CONJ_TRANSPOSE)), - param_type_names); +INSTANTIATE_TEST_SUITE_P( + FullGemmTest, GemmScalar, + ::testing::Combine(::testing::Values(1, 13, 32, 263), ::testing::Values(1, 13, 32, 263), + ::testing::Values(1, 13, 32, 263), + ::testing::Values(SPLA_OP_NONE, SPLA_OP_CONJ_TRANSPOSE), + ::testing::Values(SPLA_OP_NONE, SPLA_OP_CONJ_TRANSPOSE)), + param_type_names); + +INSTANTIATE_TEST_SUITE_P( + FullGemmTest, GemmComplex, + ::testing::Combine(::testing::Values(1, 13, 32, 263), ::testing::Values(1, 13, 32, 263), + ::testing::Values(1, 13, 32, 263), + ::testing::Values(SPLA_OP_NONE, SPLA_OP_CONJ_TRANSPOSE), + ::testing::Values(SPLA_OP_NONE, SPLA_OP_CONJ_TRANSPOSE)), + param_type_names); diff --git a/tests/test_gemm_sbs.cpp b/tests/test_gemm_sbs.cpp index a5684f6..60799e6 100644 --- a/tests/test_gemm_sbs.cpp +++ b/tests/test_gemm_sbs.cpp @@ -159,18 +159,17 @@ static auto find_rectangle(int n) -> std::pair { template class GemmSBSTest : public ::testing::TestWithParam< - std::tuple, int, int>> { + std::tuple, int, int>> { protected: GemmSBSTest() - : rowBlockSize_(std::get<2>(GetParam())), - colBlockSize_(std::get<3>(GetParam())), + : rowBlockSize_(std::get<1>(GetParam())), + colBlockSize_(std::get<2>(GetParam())), m_(0), - n_(std::get<5>(GetParam())), - k_(std::get<6>(GetParam())), + n_(std::get<4>(GetParam())), + k_(std::get<5>(GetParam())), ctx_(std::get<0>(GetParam())) { - ctx_.set_num_threads(std::get<1>(GetParam())); - const std::pair mRange = std::get<4>(GetParam()); + const std::pair mRange = std::get<3>(GetParam()); std::uniform_int_distribution mLocalDistribution(mRange.first, mRange.second); // generate local m size within range @@ -461,7 +460,7 @@ TEST_P(GemmSBSComplex, MirrorOffset) { static auto param_type_names( const ::testing::TestParamInfo< - std::tuple, int, int>>& info) + std::tuple, 
int, int>>& info) -> std::string { std::stringstream stream; if (std::get<0>(info.param) == SplaProcessingUnit::SPLA_PU_HOST) { @@ -469,25 +468,23 @@ static auto param_type_names( } else { stream << "GPU_"; } - stream << "t_" << std::to_string(std::get<1>(info.param)) << "_"; - stream << "mb_" << std::to_string(std::get<2>(info.param)) << "_"; - stream << "nb_" << std::get<3>(info.param) << "_"; - stream << "mMin_" << std::get<4>(info.param).first << "_"; - stream << "mMax_" << std::get<4>(info.param).second << "_"; - stream << "n_" << std::get<5>(info.param) << "_"; - stream << "k_" << std::get<6>(info.param); + stream << "mb_" << std::to_string(std::get<1>(info.param)) << "_"; + stream << "nb_" << std::get<2>(info.param) << "_"; + stream << "mMin_" << std::get<3>(info.param).first << "_"; + stream << "mMax_" << std::get<3>(info.param).second << "_"; + stream << "n_" << std::get<4>(info.param) << "_"; + stream << "k_" << std::get<5>(info.param); return stream.str(); } -INSTANTIATE_TEST_CASE_P(GemmSBS, GemmSBSScalar, +INSTANTIATE_TEST_SUITE_P(GemmSBS, GemmSBSScalar, ::testing::Combine(::testing::Values(SplaProcessingUnit::SPLA_PU_HOST #if defined(SPLA_CUDA) || defined(SPLA_ROCM) , SplaProcessingUnit::SPLA_PU_GPU #endif ), - ::testing::Values(1, 4), // number of threads ::testing::Values(1, 64), // row block size ::testing::Values(1, 64), // coloumn block size ::testing::Values(std::pair(0, 1), @@ -497,14 +494,13 @@ INSTANTIATE_TEST_CASE_P(GemmSBS, GemmSBSScalar, ::testing::Values(1, 13, 32, 263)), // k param_type_names); -INSTANTIATE_TEST_CASE_P(GemmSBS, GemmSBSComplex, +INSTANTIATE_TEST_SUITE_P(GemmSBS, GemmSBSComplex, ::testing::Combine(::testing::Values(SplaProcessingUnit::SPLA_PU_HOST #if defined(SPLA_CUDA) || defined(SPLA_ROCM) , SplaProcessingUnit::SPLA_PU_GPU #endif ), - ::testing::Values(1, 4), // number of threads ::testing::Values(1, 64), // row block size ::testing::Values(1, 64), // coloumn block size ::testing::Values(std::pair(0, 1), @@ -514,14 
+510,13 @@ INSTANTIATE_TEST_CASE_P(GemmSBS, GemmSBSComplex, ::testing::Values(1, 13, 32, 263)), // k param_type_names); -INSTANTIATE_TEST_CASE_P(LargeGemmSBS, GemmSBSScalar, +INSTANTIATE_TEST_SUITE_P(LargeGemmSBS, GemmSBSScalar, ::testing::Combine(::testing::Values(SplaProcessingUnit::SPLA_PU_HOST #if defined(SPLA_CUDA) || defined(SPLA_ROCM) , SplaProcessingUnit::SPLA_PU_GPU #endif ), - ::testing::Values(2), // number of threads ::testing::Values(128), // row block size ::testing::Values(128), // coloumn block size ::testing::Values(std::pair(50, diff --git a/tests/test_gemm_ssb.cpp b/tests/test_gemm_ssb.cpp index f553201..8bfadaf 100644 --- a/tests/test_gemm_ssb.cpp +++ b/tests/test_gemm_ssb.cpp @@ -155,18 +155,17 @@ static auto find_rectangle(int n) -> std::pair { template class GemmSSBTest : public ::testing::TestWithParam< - std::tuple>> { + std::tuple>> { protected: GemmSSBTest() - : rowBlockSize_(std::get<2>(GetParam())), - colBlockSize_(std::get<3>(GetParam())), - m_(std::get<4>(GetParam())), - n_(std::get<5>(GetParam())), + : rowBlockSize_(std::get<1>(GetParam())), + colBlockSize_(std::get<2>(GetParam())), + m_(std::get<3>(GetParam())), + n_(std::get<4>(GetParam())), k_(0), ctx_(std::get<0>(GetParam())) { - ctx_.set_num_threads(std::get<1>(GetParam())); - const std::pair kRange = std::get<6>(GetParam()); + const std::pair kRange = std::get<5>(GetParam()); std::uniform_int_distribution kLocalDistribution(kRange.first, kRange.second); // generate local k size within range @@ -458,7 +457,7 @@ TEST_P(GemmSSBComplex, MirrorOffset) { static auto param_type_names( const ::testing::TestParamInfo< - std::tuple>>& info) + std::tuple>>& info) -> std::string { std::stringstream stream; if (std::get<0>(info.param) == SplaProcessingUnit::SPLA_PU_HOST) { @@ -466,25 +465,23 @@ static auto param_type_names( } else { stream << "GPU_"; } - stream << "t_" << std::to_string(std::get<1>(info.param)) << "_"; - stream << "mb_" << std::to_string(std::get<2>(info.param)) << "_"; - 
stream << "nb_" << std::get<3>(info.param) << "_"; - stream << "m_" << std::get<4>(info.param) << "_"; - stream << "n_" << std::get<5>(info.param) << "_"; - stream << "kMin_" << std::get<6>(info.param).first << "_"; - stream << "kMax_" << std::get<6>(info.param).second; + stream << "mb_" << std::to_string(std::get<1>(info.param)) << "_"; + stream << "nb_" << std::get<2>(info.param) << "_"; + stream << "m_" << std::get<3>(info.param) << "_"; + stream << "n_" << std::get<4>(info.param) << "_"; + stream << "kMin_" << std::get<5>(info.param).first << "_"; + stream << "kMax_" << std::get<5>(info.param).second; return stream.str(); } -INSTANTIATE_TEST_CASE_P(GemmSSB, GemmSSBScalar, +INSTANTIATE_TEST_SUITE_P(GemmSSB, GemmSSBScalar, ::testing::Combine(::testing::Values(SplaProcessingUnit::SPLA_PU_HOST #if defined(SPLA_CUDA) || defined(SPLA_ROCM) , SplaProcessingUnit::SPLA_PU_GPU #endif ), - ::testing::Values(1, 4), // number of threads ::testing::Values(1, 64), // coloumn block size ::testing::Values(1, 64), // row block size ::testing::Values(1, 13, 32, 263), // m @@ -494,14 +491,13 @@ INSTANTIATE_TEST_CASE_P(GemmSSB, GemmSSBScalar, 400))), // k range param_type_names); -INSTANTIATE_TEST_CASE_P(GemmSSB, GemmSSBComplex, +INSTANTIATE_TEST_SUITE_P(GemmSSB, GemmSSBComplex, ::testing::Combine(::testing::Values(SplaProcessingUnit::SPLA_PU_HOST #if defined(SPLA_CUDA) || defined(SPLA_ROCM) , SplaProcessingUnit::SPLA_PU_GPU #endif ), - ::testing::Values(1, 4), // number of threads ::testing::Values(1, 64), // coloumn block size ::testing::Values(1, 64), // row block size ::testing::Values(1, 13, 32, 263), // m @@ -511,7 +507,7 @@ INSTANTIATE_TEST_CASE_P(GemmSSB, GemmSSBComplex, 400))), // k range param_type_names); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( LargeGemmSSB, GemmSSBScalar, ::testing::Combine(::testing::Values(SplaProcessingUnit::SPLA_PU_HOST #if defined(SPLA_CUDA) || defined(SPLA_ROCM) @@ -519,7 +515,6 @@ INSTANTIATE_TEST_CASE_P( 
SplaProcessingUnit::SPLA_PU_GPU #endif ), - ::testing::Values(2), // number of threads ::testing::Values(64), // coloumn block size ::testing::Values(128), // row block size ::testing::Values(3000), // m From 8e6b57e52fd8a849ddc314a1ad7481d0d7f7af67 Mon Sep 17 00:00:00 2001 From: Simon Frasch Date: Wed, 17 Apr 2024 18:59:26 +0200 Subject: [PATCH 2/2] use FindBLAS --- .github/workflows/ci.yml | 6 +- CMakeLists.txt | 321 +++++++------------------ README.md | 20 +- cmake/SPLAConfigVersion.cmake | 5 +- cmake/SPLASharedConfig.cmake | 43 ++-- cmake/SPLAStaticConfig.cmake | 147 ++++++----- cmake/SPLATargets.cmake | 5 +- cmake/modules/FindARMPL.cmake | 93 ------- cmake/modules/FindATLAS.cmake | 80 ------ cmake/modules/FindBLIS.cmake | 80 ------ cmake/modules/FindGenericBLAS.cmake | 86 ------- cmake/modules/FindMKL.cmake | 310 ------------------------ cmake/modules/FindOPENBLAS.cmake | 80 ------ cmake/modules/FindSCI.cmake | 3 +- cmake/util/blas_dgemm_symbol.h | 12 - include/spla/config.h.in | 10 +- include/spla/spla.f90 | 12 - src/CMakeLists.txt | 202 ++++++++-------- src/gemm/gemm_host.cpp | 3 +- src/util/blas_interface.cpp | 144 +++++++---- src/util/blas_interface.hpp | 17 +- tests/CMakeLists.txt | 57 ++--- tests/programs/benchmark.cpp | 1 - tests/programs/benchmark_scalapack.cpp | 19 +- tests/test_gemm.cpp | 12 +- tests/test_gemm_sbs.cpp | 1 - tests/test_gemm_ssb.cpp | 1 - 27 files changed, 436 insertions(+), 1334 deletions(-) delete mode 100644 cmake/modules/FindARMPL.cmake delete mode 100644 cmake/modules/FindATLAS.cmake delete mode 100644 cmake/modules/FindBLIS.cmake delete mode 100644 cmake/modules/FindGenericBLAS.cmake delete mode 100644 cmake/modules/FindMKL.cmake delete mode 100644 cmake/modules/FindOPENBLAS.cmake delete mode 100644 cmake/util/blas_dgemm_symbol.h diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fd5234c..aa543ac 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -87,15 +87,15 @@ jobs: run: | apt-get 
update DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y make g++ libopenblas-dev libscalapack-mpi-dev wget git make - cd ${HOME} && wget https://github.com/Kitware/CMake/releases/download/v3.14.1/cmake-3.14.1-Linux-x86_64.tar.gz && tar -xzvf cmake-3.14.1-Linux-x86_64.tar.gz + cd ${HOME} && wget https://github.com/Kitware/CMake/releases/download/v3.18.0/cmake-3.18.0-Linux-x86_64.tar.gz && tar -xzvf cmake-3.18.0-Linux-x86_64.tar.gz - name: Build run: | cd ${GITHUB_WORKSPACE} mkdir -p build cd build - ${HOME}/cmake-3.14.1-Linux-x86_64/bin/cmake .. -DSPLA_BUILD_TESTS=ON -DSPLA_GPU_BACKEND=CUDA - make -j2 + ${HOME}/cmake-3.18.0-Linux-x86_64/bin/cmake .. -DSPLA_BUILD_TESTS=ON -DSPLA_GPU_BACKEND=CUDA + make -j2 VERBOSE=1 ################# # Build with ROCm diff --git a/CMakeLists.txt b/CMakeLists.txt index a8a9f64..6ec0694 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,24 +1,24 @@ -cmake_minimum_required(VERSION 3.10 FATAL_ERROR) +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) project(spla LANGUAGES CXX VERSION 1.5.5) set(SPLA_SO_VERSION 1) set(SPLA_VERSION ${PROJECT_VERSION}) # allow {module}_ROOT variables to be set if(POLICY CMP0074) - cmake_policy(SET CMP0074 NEW) + cmake_policy(SET CMP0074 NEW) endif() # use INTERFACE_LINK_LIBRARIES property if available if(POLICY CMP0022) - cmake_policy(SET CMP0022 NEW) + cmake_policy(SET CMP0022 NEW) endif() # set default build type to RELEASE if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type" FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS - "Debug" "Release" "MinSizeRel" "RelWithDebInfo" - ) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS + "Debug" "Release" "MinSizeRel" "RelWithDebInfo" + ) endif() # set language and standard @@ -44,48 +44,36 @@ cmake_dependent_option(SPLA_BUNDLED_CLI11 "Use bundled cli11 lib" ON "SPLA_BUNDL set(SPLA_GPU_BACKEND "OFF" 
CACHE STRING "GPU backend") set_property(CACHE SPLA_GPU_BACKEND PROPERTY STRINGS - "OFF" "CUDA" "ROCM" - ) -set(_SPLA_HOST_BLAS_LIST "AUTO" "MKL" "ARMPL" "OPENBLAS" "BLIS" "CRAY_LIBSCI" "GENERIC") -set(SPLA_HOST_BLAS "AUTO" CACHE STRING "Blas library for computations on host") -set_property(CACHE SPLA_HOST_BLAS PROPERTY STRINGS ${_SPLA_HOST_BLAS_LIST}) - + "OFF" "CUDA" "ROCM" + ) # Options combination check set(SPLA_CUDA OFF) set(SPLA_ROCM OFF) if(SPLA_GPU_BACKEND) - if(SPLA_GPU_BACKEND STREQUAL "CUDA") - set(SPLA_CUDA ON) - elseif(SPLA_GPU_BACKEND STREQUAL "ROCM") - set(SPLA_ROCM ON) - else() - message(FATAL_ERROR "Invalid GPU backend") - endif() + if(SPLA_GPU_BACKEND STREQUAL "CUDA") + set(SPLA_CUDA ON) + elseif(SPLA_GPU_BACKEND STREQUAL "ROCM") + set(SPLA_ROCM ON) + else() + message(FATAL_ERROR "Invalid GPU backend") + endif() endif() -if(NOT ${SPLA_HOST_BLAS} IN_LIST _SPLA_HOST_BLAS_LIST) - message(FATAL_ERROR "Invalid Host BLAS backend") +if(SPLA_HOST_BLAS) + message(WARNING "SPLA_HOST_BLAS is no longer in use. Use BLA_VENDOR instead. 
Check CMake documentation for FindBLAS.") endif() -set(SPLA_BLAS_MKL OFF) -set(SPLA_BLAS_BLIS OFF) -set(SPLA_BLAS_OPENBLAS OFF) -set(SPLA_BLAS_SCI OFF) -set(SPLA_BLAS_ATLAS OFF) -set(SPLA_BLAS_GENERIC OFF) -set(SPLA_BLAS_UNKNOWN OFF) - # Fortran if(SPLA_FORTRAN) - enable_language(Fortran) + enable_language(Fortran) endif() # set preferred library type if (SPLA_STATIC) - set(SPLA_LIBRARY_TYPE STATIC) + set(SPLA_LIBRARY_TYPE STATIC) else() - set(SPLA_LIBRARY_TYPE SHARED) + set(SPLA_LIBRARY_TYPE SHARED) endif() set(SPLA_DEFINITIONS) @@ -94,47 +82,41 @@ set(SPLA_EXTERNAL_LIBS) set(SPLA_INCLUDE_DIRS) set(SPLA_EXTERNAL_INCLUDE_DIRS) set(SPLA_EXTERNAL_PKG_PACKAGES) -set(SPLA_BLAS_OPENBLAS OFF) -set(SPLA_BLAS_MKL OFF) -set(SPLA_BLAS_ARMPL OFF) -set(SPLA_BLAS_BLIS OFF) -set(SPLA_BLAS_SCI OFF) -set(SPLA_BLAS_ATLAS OFF) set(SPLA_RESTRICT_ATTR " ") # check if restrict attribute is available include(CheckCXXSourceCompiles) check_cxx_source_compiles( - " - int f(double *__restrict__ x); - int main(void) {return 0;} - " - HAVE___RESTRICT__ - ) + " + int f(double *__restrict__ x); + int main(void) {return 0;} + " + HAVE___RESTRICT__ + ) if(${HAVE___RESTRICT__}) - set(SPLA_RESTRICT_ATTR "__restrict__") + set(SPLA_RESTRICT_ATTR "__restrict__") else() - check_cxx_source_compiles( - " - int f(double *__restrict x); - int main(void) {return 0;} - " - HAVE___RESTRICT - ) - if(HAVE___RESTRICT) - set(SPLA_RESTRICT_ATTR "__restrict") - else() - check_cxx_source_compiles( - " - int f(double *restrict x); - int main(void) {return 0;} - " - HAVE_RESTRICT - ) - if(${HAVE_RESTRICT}) - set(SPLA_RESTRICT_ATTR "restrict") - endif() - endif() + check_cxx_source_compiles( + " + int f(double *__restrict x); + int main(void) {return 0;} + " + HAVE___RESTRICT + ) + if(HAVE___RESTRICT) + set(SPLA_RESTRICT_ATTR "__restrict") + else() + check_cxx_source_compiles( + " + int f(double *restrict x); + int main(void) {return 0;} + " + HAVE_RESTRICT + ) + if(${HAVE_RESTRICT}) + set(SPLA_RESTRICT_ATTR 
"restrict") + endif() + endif() endif() @@ -142,186 +124,57 @@ endif() find_package(MPI COMPONENTS CXX REQUIRED) list(APPEND SPLA_EXTERNAL_LIBS MPI::MPI_CXX) - -if(SPLA_OMP) - find_package(OpenMP REQUIRED COMPONENTS CXX) - list(APPEND SPLA_EXTERNAL_LIBS OpenMP::OpenMP_CXX) -endif() - # CUDA if(SPLA_CUDA) - if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.17.0") - find_package(CUDAToolkit REQUIRED) - else() - enable_language(CUDA) - find_library(CUDA_CUDART_LIBRARY cudart PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - if(NOT TARGET CUDA::cudart) - add_library(CUDA::cudart INTERFACE IMPORTED) - endif() - set_property(TARGET CUDA::cudart PROPERTY INTERFACE_LINK_LIBRARIES ${CUDA_CUDART_LIBRARY}) - set_property(TARGET CUDA::cudart PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - - find_library(CUDA_CUBLAS_LIBRARY cublas PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - if(NOT TARGET CUDA::cublas) - add_library(CUDA::cublas INTERFACE IMPORTED) - endif() - set_property(TARGET CUDA::cublas PROPERTY INTERFACE_LINK_LIBRARIES ${CUDA_CUBLAS_LIBRARY}) - set_property(TARGET CUDA::cublas PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - endif() - - list(APPEND SPLA_EXTERNAL_LIBS CUDA::cudart CUDA::cublas) + find_package(CUDAToolkit REQUIRED) + list(APPEND SPLA_EXTERNAL_LIBS CUDA::cudart CUDA::cublas) endif() # ROCm if(SPLA_ROCM) - find_package(hip CONFIG REQUIRED) - find_package(rocblas CONFIG REQUIRED) + find_package(hip CONFIG REQUIRED) + find_package(rocblas CONFIG REQUIRED) list(APPEND SPLA_EXTERNAL_LIBS hip::host roc::rocblas) endif() -# find BLAS backend for Host computation. 
-set(_SPLA_BLAS_FOUND FALSE) - -if(${SPLA_HOST_BLAS} STREQUAL "AUTO" OR ${SPLA_HOST_BLAS} STREQUAL "MKL") - find_package(MKL QUIET) - if(SPLA_OMP) - if(TARGET mkl::intel_32bit_omp_dyn) - set(SPLA_MKL_TARGET intel_32bit_omp_dyn) - elseif(TARGET mkl::intel_32bit_omp_st) - set(SPLA_MKL_TARGET intel_32bit_omp_st) - endif() - else() - if(TARGET mkl::intel_32bit_seq_dyn) - set(SPLA_MKL_TARGET intel_32bit_seq_dyn) - elseif(TARGET mkl::intel_32bit_seq_st) - set(SPLA_MKL_TARGET intel_32bit_seq_st) - endif() - endif() - if(TARGET mkl::${SPLA_MKL_TARGET}) - message(STATUS "Host BLAS Backend: MKL") - list(APPEND SPLA_EXTERNAL_LIBS mkl::${SPLA_MKL_TARGET}) - set(SPLA_BLAS_MKL ON) - set(SPLA_BLAS_HEADER_NAME mkl.h) - set(_SPLA_BLAS_FOUND TRUE) - endif() -endif() - -if(NOT ${_SPLA_BLAS_FOUND} AND (${SPLA_HOST_BLAS} STREQUAL "AUTO" OR ${SPLA_HOST_BLAS} STREQUAL "ARMPL")) - find_package(ARMPL) - if(TARGET ARM::pl) - message(STATUS "Host BLAS Backend: ARMPL") - list(APPEND SPLA_EXTERNAL_LIBS ARM::pl) - set(SPLA_BLAS_ARMPL ON) - set(SPLA_BLAS_HEADER_NAME armpl.h) - set(_SPLA_BLAS_FOUND TRUE) - endif() -endif() - -if(NOT ${_SPLA_BLAS_FOUND} AND (${SPLA_HOST_BLAS} STREQUAL "AUTO" OR ${SPLA_HOST_BLAS} STREQUAL "BLIS")) - find_package(BLIS) - if(TARGET BLIS::blis) - message(STATUS "Host BLAS Backend: BLIS") - list(APPEND SPLA_EXTERNAL_LIBS BLIS::blis) - set(SPLA_BLAS_BLIS ON) - set(SPLA_BLAS_HEADER_NAME blis.h) - set(_SPLA_BLAS_FOUND TRUE) - endif() -endif() - -if(NOT ${_SPLA_BLAS_FOUND} AND (${SPLA_HOST_BLAS} STREQUAL "AUTO" OR ${SPLA_HOST_BLAS} STREQUAL "OPENBLAS")) - find_package(OPENBLAS) - if(TARGET OPENBLAS::openblas) - message(STATUS "Host BLAS Backend: OPENBLAS") - list(APPEND SPLA_EXTERNAL_LIBS OPENBLAS::openblas) - set(SPLA_BLAS_OPENBLAS ON) - set(_SPLA_BLAS_FOUND TRUE) - # try to find openblas header file - find_file(_BLAS_HEADER NAMES cblas_openblas.h cblas-openblas.h cblas.h HINTS ${OPENBLAS_INCLUDE_DIRS}) - if(_BLAS_HEADER) - get_filename_component(SPLA_BLAS_HEADER_NAME 
${_BLAS_HEADER} NAME) - endif() - endif() -endif() - -if(NOT ${_SPLA_BLAS_FOUND} AND (${SPLA_HOST_BLAS} STREQUAL "AUTO" OR ${SPLA_HOST_BLAS} STREQUAL "CRAY_LIBSCI")) - find_package(SCI) - if(TARGET SCI::sci) - message(STATUS "Host BLAS Backend: CRAY_LIBSCI") - list(APPEND SPLA_EXTERNAL_LIBS SCI::sci) - set(SPLA_BLAS_SCI ON) - set(SPLA_BLAS_HEADER_NAME cblas.h) - set(_SPLA_BLAS_FOUND TRUE) - endif() -endif() -if(NOT ${_SPLA_BLAS_FOUND} AND (${SPLA_HOST_BLAS} STREQUAL "AUTO" OR ${SPLA_HOST_BLAS} STREQUAL "ATLAS")) - find_package(ATLAS) - if(TARGET ATLAS::atlas) - message(STATUS "Host BLAS Backend: ATLAS") - list(APPEND SPLA_EXTERNAL_LIBS ATLAS::atlas) - set(SPLA_BLAS_ATLAS ON) - set(_SPLA_BLAS_FOUND TRUE) - # try to find header file - find_file(_BLAS_HEADER NAMES cblas_atlas.h cblas-atlas.h cblas.h HINTS ${ATLAS_INCLUDE_DIRS}) - if(_BLAS_HEADER) - get_filename_component(SPLA_BLAS_HEADER_NAME ${_BLAS_HEADER} NAME) - endif() - endif() -endif() - -if(NOT ${_SPLA_BLAS_FOUND} AND (${SPLA_HOST_BLAS} STREQUAL "AUTO" OR ${SPLA_HOST_BLAS} STREQUAL "GENERIC")) - find_package(GenericBLAS) - if(TARGET GenericBLAS::blas) - message(STATUS "Host BLAS Backend: GENERIC") - message(STATUS "Host BLAS libs: ${GenericBLAS_LIBRARIES}") - list(APPEND SPLA_EXTERNAL_LIBS GenericBLAS::blas) - set(SPLA_BLAS_GENERIC ON) - set(_SPLA_BLAS_FOUND TRUE) - # try to find header file - find_file(_BLAS_HEADER NAMES cblas.h HINTS ${GenericBLAS_INCLUDE_DIRS}) - if(_BLAS_HEADER) - get_filename_component(SPLA_BLAS_HEADER_NAME ${_BLAS_HEADER} NAME) - endif() - endif() -endif() - -if(NOT _SPLA_BLAS_FOUND AND NOT ${SPLA_HOST_BLAS} STREQUAL "AUTO") - message(FATAL_ERROR - "Could not find selected host blas backend \"${SPLA_HOST_BLAS}\". 
Set root path or CMAKE_PREFIX_PATH correctly or use \"AUTO\" blas backend for fall back mode.") +# BLAS +set(BLA_SIZEOF_INTEGER 4) +if(BLA_VENDOR AND "${BLA_VENDOR}" STREQUAL "CRAY_LIBSCI") + find_package(SCI MODULE REQUIRED) + list(APPEND SPLA_EXTERNAL_LIBS SCI::sci) +elseif(NOT BLA_VENDOR AND NOT BLAS_LIBRARIES) + # search in custom order first + set(_BLAS_VENDOR_LIST Intel10_64lp AOCL_mt Arm_mp OpenBLAS FLAME) + foreach(BLA_VENDOR IN LISTS _BLAS_VENDOR_LIST) + find_package(BLAS MODULE QUIET) + if(BLAS_LIBRARIES) + message(STATUS "Found BLAS library ${BLA_VENDOR}: ${BLAS_LIBRARIES}") + break() + endif() + message(STATUS "Could NOT find BLAS library ${BLA_VENDOR}") + endforeach() + # if not found, search for any BLAS library + if(NOT BLAS_LIBRARIES) + unset(BLA_VENDOR) + find_package(BLAS MODULE REQUIRED) + endif() + list(APPEND SPLA_EXTERNAL_LIBS BLAS::BLAS) +else() + find_package(BLAS MODULE REQUIRED) + list(APPEND SPLA_EXTERNAL_LIBS BLAS::BLAS) endif() -# Fall back to CMake provided FindBLAS as last resort -if(NOT _SPLA_BLAS_FOUND) - find_package(BLAS REQUIRED) - message(STATUS "Host BLAS Backend: ${BLAS_LIBRARIES}") - find_file(_BLAS_HEADER NAMES cblas.h) - if(_BLAS_HEADER) - get_filename_component(SPLA_BLAS_HEADER_NAME ${_BLAS_HEADER} NAME) - endif() - set(_SPLA_BLAS_FOUND TRUE) - if(NOT TARGET BLAS::BLAS) - # target is only available with CMake 3.18.0 and later - add_library(BLAS::BLAS INTERFACE IMPORTED) - set_property(TARGET BLAS::BLAS PROPERTY INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES} ${BLAS_LINKER_FLAGS}) - endif() - list(APPEND SPLA_EXTERNAL_LIBS BLAS::BLAS) - set(SPLA_BLAS_UNKNOWN ON) +if(TARGET BLAS::BLAS) + # some CMAKE versions (3.18-3.19) don't include libraries in target + target_link_libraries(BLAS::BLAS INTERFACE ${BLAS_LIBRARIES} ${BLAS_LINKER_FLAGS}) endif() - -# make sure cblas symbols exist in blas library -include(CheckCXXSymbolExists) +# check if cblas available set(CMAKE_REQUIRED_LIBRARIES ${SPLA_EXTERNAL_LIBS}) -unset(_SPLA_CBLAS_FOUND
CACHE) # Result is cached, so change of library will not lead to a new check automatically -if(SPLA_BLAS_HEADER_NAME) - check_cxx_symbol_exists(cblas_dgemm ${SPLA_BLAS_HEADER_NAME} _SPLA_CBLAS_FOUND) -else() - set(CMAKE_REQUIRED_INCLUDES ${PROJECT_SOURCE_DIR}/cmake/util) - check_cxx_symbol_exists(cblas_dgemm blas_dgemm_symbol.h _SPLA_CBLAS_FOUND) -endif() -if(NOT _SPLA_CBLAS_FOUND) - message(FATAL_ERROR "CBlas symbols are required but not found in blas library!") -endif() +include(CheckFunctionExists) +unset(SPLA_CBLAS CACHE) # Result is cached, so change of library will not lead to a new check automatically +CHECK_FUNCTION_EXISTS(cblas_zgemm SPLA_CBLAS) # generate config.h configure_file(include/spla/config.h.in ${PROJECT_BINARY_DIR}/spla/config.h) @@ -337,10 +190,10 @@ list(APPEND SPLA_EXTERNAL_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/ext) add_subdirectory(src) if(SPLA_BUILD_EXAMPLES) - add_subdirectory(examples) + add_subdirectory(examples) endif() # add tests for developement if(SPLA_BUILD_TESTS) - add_subdirectory(tests) + add_subdirectory(tests) endif() diff --git a/README.md b/README.md index 9876689..cd33201 100644 --- a/README.md +++ b/README.md @@ -57,16 +57,16 @@ make -j8 install ``` ### CMake options -| Option | Values | Default | Description | -|------------------------|----------------------------------------------------------|---------|--------------------------------------------------| -| SPLA_OMP | ON, OFF | ON | Enable multi-threading with OpenMP | -| SPLA_HOST_BLAS | AUTO, MKL, OPENBLAS, BLIS, CRAY_LIBSCI, ATLAS, GENERIC | AUTO | BLAS library for computations on host | -| SPLA_GPU_BACKEND | OFF, CUDA, ROCM | OFF | Select GPU backend | -| SPLA_BUILD_TESTS | ON, OFF | OFF | Build test executables | -| SPLA_BUNDLED_TEST_LIBS | ON, OFF | ON | Download libraries required for tests | -| SPLA_BUILD_EXAMPLES | ON, OFF | OFF | Build examples | -| SPLA_INSTALL | ON, OFF | ON | Add library to install target | -| SPLA_FORTRAN | ON, OFF | OFF | Build Fortan module 
| +| Option | Values | Default | Description | +|------------------------|----------------------------------------------------------|---------|-----------------------------------------------------------------------| +| SPLA_OMP | ON, OFF | ON | Enable multi-threading with OpenMP | +| SPLA_GPU_BACKEND | OFF, CUDA, ROCM | OFF | Select GPU backend | +| SPLA_BUILD_TESTS | ON, OFF | OFF | Build test executables | +| SPLA_BUNDLED_TEST_LIBS | ON, OFF | ON | Download libraries required for tests | +| SPLA_BUILD_EXAMPLES | ON, OFF | OFF | Build examples | +| SPLA_INSTALL | ON, OFF | ON | Add library to install target | +| SPLA_FORTRAN | ON, OFF | OFF | Build Fortran module | +| BLA_VENDOR | Check CMake FindBLAS documentation. For Cray Libsci, set to "CRAY_LIBSCI". | | BLAS library for computations on host | ## Implementation Details The implementation is based on a ring communication pattern as described in the paper [Accelerating large-scale excited-state GW calculations on leadership HPC systems](https://dl.acm.org/doi/10.5555/3433701.3433706) by Mauro Del Ben Et Al. For distributed matrix-matrix multiplications with distributions as used in the `pgemm_ssb` function, each process contributes to the result of every element. Therefore, some form of reduction operation is required. Compared to other reduction schemes, a ring requires more communication volume. However, by splitting up the result and computing multiple reductions concurrently, all processes share the work load at every step and more opportunities for communication - computation overlap arise.
diff --git a/cmake/SPLAConfigVersion.cmake b/cmake/SPLAConfigVersion.cmake index ce0a44b..7752429 100644 --- a/cmake/SPLAConfigVersion.cmake +++ b/cmake/SPLAConfigVersion.cmake @@ -1,7 +1,6 @@ - # Prefer shared library if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/SPLASharedConfigVersion.cmake") - include("${CMAKE_CURRENT_LIST_DIR}/SPLASharedConfigVersion.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/SPLASharedConfigVersion.cmake") else() - include("${CMAKE_CURRENT_LIST_DIR}/SPLAStaticConfigVersion.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/SPLAStaticConfigVersion.cmake") endif() diff --git a/cmake/SPLASharedConfig.cmake b/cmake/SPLASharedConfig.cmake index 67f58be..e1e1e4c 100644 --- a/cmake/SPLASharedConfig.cmake +++ b/cmake/SPLASharedConfig.cmake @@ -1,14 +1,13 @@ include(CMakeFindDependencyMacro) macro(find_dependency_components) - if(${ARGV0}_FOUND AND ${CMAKE_VERSION} VERSION_LESS "3.15.0") - # find_dependency does not handle new components correctly before 3.15.0 - set(${ARGV0}_FOUND FALSE) - endif() - find_dependency(${ARGV}) + if(${ARGV0}_FOUND AND ${CMAKE_VERSION} VERSION_LESS "3.15.0") + # find_dependency does not handle new components correctly before 3.15.0 + set(${ARGV0}_FOUND FALSE) + endif() + find_dependency(${ARGV}) endmacro() # options used for building library -set(SPLA_OMP @SPLA_OMP@) set(SPLA_STATIC @SPLA_STATIC@) set(SPLA_GPU_BACKEND @SPLA_GPU_BACKEND@) set(SPLA_BUILD_TESTS @SPLA_BUILD_TESTS@) @@ -20,32 +19,32 @@ set(SPLA_FORTRAN @SPLA_FORTRAN@) # Only look for MPI if header matching language is possibly used get_property(_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) if("CXX" IN_LIST _LANGUAGES AND NOT TARGET MPI::MPI_CXX) - find_dependency_components(MPI COMPONENTS CXX) + find_dependency_components(MPI COMPONENTS CXX) endif() if("C" IN_LIST _LANGUAGES AND NOT TARGET MPI::MPI_C) - find_dependency_components(MPI COMPONENTS C) + find_dependency_components(MPI COMPONENTS C) endif() if("Fortran" IN_LIST _LANGUAGES AND NOT TARGET MPI::MPI_Fortran) - 
find_dependency_components(MPI COMPONENTS Fortran) + find_dependency_components(MPI COMPONENTS Fortran) endif() # find_dependency may set SPLA_FOUND to false, so only add spla if everything required was found if(NOT DEFINED SPLA_FOUND OR SPLA_FOUND) - # add version of package - include("${CMAKE_CURRENT_LIST_DIR}/SPLASharedConfigVersion.cmake") + # add version of package + include("${CMAKE_CURRENT_LIST_DIR}/SPLASharedConfigVersion.cmake") - # add library target - include("${CMAKE_CURRENT_LIST_DIR}/SPLASharedTargets.cmake") + # add library target + include("${CMAKE_CURRENT_LIST_DIR}/SPLASharedTargets.cmake") - if(TARGET MPI::MPI_CXX) - target_link_libraries(SPLA::spla INTERFACE MPI::MPI_CXX) - endif() - if(TARGET MPI::MPI_C) - target_link_libraries(SPLA::spla INTERFACE MPI::MPI_C) - endif() - if(TARGET MPI::MPI_Fortran) - target_link_libraries(SPLA::spla INTERFACE MPI::MPI_Fortran) - endif() + if(TARGET MPI::MPI_CXX) + target_link_libraries(SPLA::spla INTERFACE MPI::MPI_CXX) + endif() + if(TARGET MPI::MPI_C) + target_link_libraries(SPLA::spla INTERFACE MPI::MPI_C) + endif() + if(TARGET MPI::MPI_Fortran) + target_link_libraries(SPLA::spla INTERFACE MPI::MPI_Fortran) + endif() endif() diff --git a/cmake/SPLAStaticConfig.cmake b/cmake/SPLAStaticConfig.cmake index d8ff8ff..9f9f872 100644 --- a/cmake/SPLAStaticConfig.cmake +++ b/cmake/SPLAStaticConfig.cmake @@ -1,10 +1,10 @@ include(CMakeFindDependencyMacro) macro(find_dependency_components) - if(${ARGV0}_FOUND AND ${CMAKE_VERSION} VERSION_LESS "3.15.0") - # find_dependency does not handle new components correctly before 3.15.0 - set(${ARGV0}_FOUND FALSE) - endif() - find_dependency(${ARGV}) + if(${ARGV0}_FOUND AND ${CMAKE_VERSION} VERSION_LESS "3.15.0") + # find_dependency does not handle new components correctly before 3.15.0 + set(${ARGV0}_FOUND FALSE) + endif() + find_dependency(${ARGV}) endmacro() # Only look for modules we installed and save value @@ -12,112 +12,109 @@ set(_CMAKE_MODULE_PATH_SAVE 
${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules") # options used for building library -set(SPLA_OMP @SPLA_OMP@) set(SPLA_STATIC @SPLA_STATIC@) set(SPLA_GPU_BACKEND @SPLA_GPU_BACKEND@) set(SPLA_BUILD_TESTS @SPLA_BUILD_TESTS@) set(SPLA_TIMING @SPLA_TIMING@) set(SPLA_FORTRAN @SPLA_FORTRAN@) - -# internal variables, determining blas library -set(SPLA_BLAS_MKL @SPLA_BLAS_MKL@) -set(SPLA_BLAS_ARMPL @SPLA_BLAS_ARMPL@) -set(SPLA_BLAS_BLIS @SPLA_BLAS_BLIS@) -set(SPLA_BLAS_OPENBLAS @SPLA_BLAS_OPENBLAS@) -set(SPLA_BLAS_SCI @SPLA_BLAS_SCI@) -set(SPLA_BLAS_ATLAS @SPLA_BLAS_ATLAS@) -set(SPLA_BLAS_GENERIC @SPLA_BLAS_GENERIC@) -set(SPLA_BLAS_UNKNOWN @SPLA_BLAS_UNKNOWN@) - +set(SPLA_BLA_VENDOR @BLA_VENDOR@) # make sure CXX is enabled get_property(_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) if(SPLA_FIND_REQUIRED AND NOT "CXX" IN_LIST _LANGUAGES) - message(FATAL_ERROR "SPLA requires CXX language to be enabled for static linking.") + message(FATAL_ERROR "SPLA requires CXX language to be enabled for static linking.") endif() # find required targets if(NOT TARGET MPI::MPI_CXX) - find_dependency_components(MPI COMPONENTS CXX) + find_dependency_components(MPI COMPONENTS CXX) endif() if("C" IN_LIST _LANGUAGES AND NOT TARGET MPI::MPI_C) - find_dependency_components(MPI COMPONENTS C) + find_dependency_components(MPI COMPONENTS C) endif() if("Fortran" IN_LIST _LANGUAGES AND NOT TARGET MPI::MPI_Fortran) - find_dependency_components(MPI COMPONENTS Fortran) + find_dependency_components(MPI COMPONENTS Fortran) endif() if(SPLA_OMP) - if(NOT TARGET OpenMP::OpenMP_CXX) - find_dependency_components(OpenMP COMPONENTS CXX) - endif() + if(NOT TARGET OpenMP::OpenMP_CXX) + find_dependency_components(OpenMP COMPONENTS CXX) + endif() endif() if(SPLA_ROCM) - find_dependency(hip CONFIG) - find_dependency(rocblas CONFIG) + find_dependency(hip CONFIG) + find_dependency(rocblas CONFIG) endif() if(SPLA_CUDA) - if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.17.0") - 
find_dependency(CUDAToolkit) - else() - enable_language(CUDA) - find_library(CUDA_CUDART_LIBRARY cudart PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - if(NOT TARGET CUDA::cudart) - add_library(CUDA::cudart INTERFACE IMPORTED) - endif() - set_property(TARGET CUDA::cudart PROPERTY INTERFACE_LINK_LIBRARIES ${CUDA_CUDART_LIBRARY}) - set_property(TARGET CUDA::cudart PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - - find_library(CUDA_CUBLAS_LIBRARY cublas PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - if(NOT TARGET CUDA::cublas) - add_library(CUDA::cublas INTERFACE IMPORTED) - endif() - set_property(TARGET CUDA::cublas PROPERTY INTERFACE_LINK_LIBRARIES ${CUDA_CUBLAS_LIBRARY}) - set_property(TARGET CUDA::cublas PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - endif() + if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.17.0") + find_dependency(CUDAToolkit) + else() + enable_language(CUDA) + find_library(CUDA_CUDART_LIBRARY cudart PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) + if(NOT TARGET CUDA::cudart) + add_library(CUDA::cudart INTERFACE IMPORTED) + endif() + set_property(TARGET CUDA::cudart PROPERTY INTERFACE_LINK_LIBRARIES ${CUDA_CUDART_LIBRARY}) + set_property(TARGET CUDA::cudart PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + + find_library(CUDA_CUBLAS_LIBRARY cublas PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) + if(NOT TARGET CUDA::cublas) + add_library(CUDA::cublas INTERFACE IMPORTED) + endif() + set_property(TARGET CUDA::cublas PROPERTY INTERFACE_LINK_LIBRARIES ${CUDA_CUBLAS_LIBRARY}) + set_property(TARGET CUDA::cublas PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + endif() +endif() + +if(NOT BLA_VENDOR AND SPLA_BLA_VENDOR) + set(BLA_VENDOR ${SPLA_BLA_VENDOR}) endif() -if(SPLA_BLAS_MKL) - find_dependency(MKL) -elseif(SPLA_BLAS_BLIS) - find_dependency(BLIS) -elseif(SPLA_BLAS_OPENBLAS) - find_dependency(OPENBLAS) 
-elseif(SPLA_BLAS_SCI) - find_dependency(SCI) -elseif(SPLA_BLAS_ATLAS) - find_dependency(ATLAS) -elseif(SPLA_BLAS_GENERIC) - find_dependency(GenericBLAS) +set(BLA_SIZEOF_INTEGER 4) +if(BLA_VENDOR AND "${BLA_VENDOR}" STREQUAL "CRAY_LIBSCI") + find_dependency(SCI MODULE) +elseif(NOT BLA_VENDOR AND NOT BLAS_LIBRARIES) + # search in custom order first + set(_BLAS_VENDOR_LIST Intel10_64lp AOCL_mt Arm_mp OpenBLAS FLAME) + foreach(BLA_VENDOR IN LISTS _BLAS_VENDOR_LIST) + if(NOT BLAS_LIBRARIES) + find_package(BLAS MODULE QUIET) + endif() + endforeach() + # if not found, search for any BLAS library + if(NOT BLAS_LIBRARIES) + unset(BLA_VENDOR) + find_dependency(BLAS MODULE) + endif() else() - find_dependency(BLAS) - if(NOT TARGET BLAS::blas) - # target is only available with CMake 3.18.0 and later - add_library(BLAS::blas INTERFACE IMPORTED) - set_property(TARGET BLAS::blas PROPERTY INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES} ${BLAS_LINKER_FLAGS}) - endif() + find_dependency(BLAS MODULE) +endif() + +if(TARGET BLAS::BLAS) + # some CMAKE versions (3.18-3.19) don't include libraries in target + target_link_libraries(BLAS::BLAS INTERFACE ${BLAS_LIBRARIES} ${BLAS_LINKER_FLAGS}) endif() set(CMAKE_MODULE_PATH ${_CMAKE_MODULE_PATH_SAVE}) # restore module path # find_dependency may set SPLA_FOUND to false, so only add spla if everything required was found if(NOT DEFINED SPLA_FOUND OR SPLA_FOUND) - # add version of package - include("${CMAKE_CURRENT_LIST_DIR}/SPLAStaticConfigVersion.cmake") - - # add library target - include("${CMAKE_CURRENT_LIST_DIR}/SPLAStaticTargets.cmake") - - target_link_libraries(SPLA::spla INTERFACE MPI::MPI_CXX) - if(TARGET MPI::MPI_C) - target_link_libraries(SPLA::spla INTERFACE MPI::MPI_C) - endif() - if(TARGET MPI::MPI_Fortran) - target_link_libraries(SPLA::spla INTERFACE MPI::MPI_Fortran) - endif() + # add version of package + include("${CMAKE_CURRENT_LIST_DIR}/SPLAStaticConfigVersion.cmake") + + # add library target +
include("${CMAKE_CURRENT_LIST_DIR}/SPLAStaticTargets.cmake") + + target_link_libraries(SPLA::spla INTERFACE MPI::MPI_CXX) + if(TARGET MPI::MPI_C) + target_link_libraries(SPLA::spla INTERFACE MPI::MPI_C) + endif() + if(TARGET MPI::MPI_Fortran) + target_link_libraries(SPLA::spla INTERFACE MPI::MPI_Fortran) + endif() endif() diff --git a/cmake/SPLATargets.cmake b/cmake/SPLATargets.cmake index 6aeb7c3..03774f1 100644 --- a/cmake/SPLATargets.cmake +++ b/cmake/SPLATargets.cmake @@ -1,7 +1,6 @@ - # Prefer shared library if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/SPLASharedTargets.cmake") - include("${CMAKE_CURRENT_LIST_DIR}/SPLASharedTargets.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/SPLASharedTargets.cmake") else() - include("${CMAKE_CURRENT_LIST_DIR}/SPLAStaticTargets.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/SPLAStaticTargets.cmake") endif() diff --git a/cmake/modules/FindARMPL.cmake b/cmake/modules/FindARMPL.cmake deleted file mode 100644 index 499ef9c..0000000 --- a/cmake/modules/FindARMPL.cmake +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2019 ETH Zurich, Simon Frasch -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# 3. Neither the name of the copyright holder nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - - -#.rst: -# FindARMPL -# ----------- -# -# This module searches for the 32-bit integer ARM library. -# -# -# The following variables are set -# -# :: -# -# ARMPL_FOUND - True if double precision fftw library is found -# ARMPL_LIBRARIES - The required libraries -# ARMPL_INCLUDE_DIRS - The required include directory -# -# The following import target is created -# -# :: -# -# ARM::pl - -# set paths to look for ARM -set(_ARMPL_PATHS ${ARMPL_ROOT} $ENV{ARMPL_ROOT} $ENV{ARMPL_DIR}) - -set(_ARMPL_DEFAULT_PATH_SWITCH) - -if(_ARMPL_PATHS) - # do not look at any default paths if a custom path was set - set(_ARMPL_DEFAULT_PATH_SWITCH NO_DEFAULT_PATH) -else() - set(_ARMPL_PATHS /opt/arm) -endif() - - -# find all ARM libraries / include directories -find_library( - ARMPL_LIBRARIES - NAMES "armpl_lp64_mp" "armpl_mp" "armpl_lp64" "armpl" - HINTS ${_ARMPL_PATHS} - PATH_SUFFIXES "lib" "lib64" - ${_ARMPL_DEFAULT_PATH_SWITCH} -) -find_path(ARMPL_INCLUDE_DIRS - NAMES "armpl.h" - HINTS ${_ARMPL_PATHS} - PATH_SUFFIXES "include_lp64_mp" "include_lp64" "include" - ${_ARMPL_DEFAULT_PATH_SWITCH} -) - -# check if found -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(ARMPL REQUIRED_VARS 
ARMPL_LIBRARIES ARMPL_INCLUDE_DIRS) - -# add target to link against -if(ARMPL_FOUND) - # create interface target - if(NOT TARGET ARM::pl) - add_library(ARM::pl INTERFACE IMPORTED) - endif() - set_property(TARGET ARM::pl PROPERTY INTERFACE_LINK_LIBRARIES ${ARMPL_LIBRARIES}) - set_property(TARGET ARM::pl PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${ARMPL_INCLUDE_DIRS}) -endif() - -# prevent clutter in gui -MARK_AS_ADVANCED(ARMPL_LIBRARIES ARMPL_INCLUDE_DIRS) - diff --git a/cmake/modules/FindATLAS.cmake b/cmake/modules/FindATLAS.cmake deleted file mode 100644 index c91cc0b..0000000 --- a/cmake/modules/FindATLAS.cmake +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2019 ETH Zurich -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# 3. Neither the name of the copyright holder nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - - -#.rst: -# FindATLAS -# ----------- -# -# This module tries to find the ATLAS library. -# -# The following variables are set -# -# :: -# -# ATLAS_FOUND - True if atlas is found -# ATLAS_LIBRARIES - The required libraries -# ATLAS_INCLUDE_DIRS - The required include directory -# -# The following import target is created -# -# :: -# -# ATLAS::atlas - -#set paths to look for library from ROOT variables.If new policy is set, find_library() automatically uses them. 
-if(NOT POLICY CMP0074) - set(_ATLAS_PATHS ${ATLAS_ROOT} $ENV{ATLAS_ROOT}) -endif() - -find_library( - ATLAS_LIBRARIES - NAMES "atlas" - HINTS ${_ATLAS_PATHS} - PATH_SUFFIXES "atlas/lib" "atlas/lib64" "atlas" -) -find_path( - ATLAS_INCLUDE_DIRS - NAMES "cblas-atlas.h" "cblas_atlas.h" "cblas.h" - HINTS ${_ATLAS_PATHS} - PATH_SUFFIXES "atlas" "atlas/include" "include/atlas" -) - -# check if found -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(ATLAS REQUIRED_VARS ATLAS_INCLUDE_DIRS ATLAS_LIBRARIES) - -# add target to link against -if(ATLAS_FOUND) - if(NOT TARGET ATLAS::atlas) - add_library(ATLAS::atlas INTERFACE IMPORTED) - endif() - set_property(TARGET ATLAS::atlas PROPERTY INTERFACE_LINK_LIBRARIES ${ATLAS_LIBRARIES}) - set_property(TARGET ATLAS::atlas PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${ATLAS_INCLUDE_DIRS}) -endif() - -# prevent clutter in cache -MARK_AS_ADVANCED(ATLAS_FOUND ATLAS_LIBRARIES ATLAS_INCLUDE_DIRS) diff --git a/cmake/modules/FindBLIS.cmake b/cmake/modules/FindBLIS.cmake deleted file mode 100644 index e8a3dfb..0000000 --- a/cmake/modules/FindBLIS.cmake +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2019 ETH Zurich -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# 3. Neither the name of the copyright holder nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - - -#.rst: -# FindBLIS -# ----------- -# -# This module tries to find the BLIS library. -# -# The following variables are set -# -# :: -# -# BLIS_FOUND - True if blis is found -# BLIS_LIBRARIES - The required libraries -# BLIS_INCLUDE_DIRS - The required include directory -# -# The following import target is created -# -# :: -# -# BLIS::blis - -#set paths to look for library from ROOT variables.If new policy is set, find_library() automatically uses them. 
-if(NOT POLICY CMP0074) - set(_BLIS_PATHS ${BLIS_ROOT} $ENV{BLIS_ROOT}) -endif() - -find_library( - BLIS_LIBRARIES - NAMES "blis-mt" "blis" - HINTS ${_BLIS_PATHS} - PATH_SUFFIXES "blis/lib" "blis/lib64" "blis" -) -find_path( - BLIS_INCLUDE_DIRS - NAMES "blis.h" - HINTS ${_BLIS_PATHS} - PATH_SUFFIXES "blis" "blis/include" "include/blis" -) - -# check if found -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(BLIS REQUIRED_VARS BLIS_INCLUDE_DIRS BLIS_LIBRARIES) - -# add target to link against -if(BLIS_FOUND) - if(NOT TARGET BLIS::blis) - add_library(BLIS::blis INTERFACE IMPORTED) - endif() - set_property(TARGET BLIS::blis PROPERTY INTERFACE_LINK_LIBRARIES ${BLIS_LIBRARIES}) - set_property(TARGET BLIS::blis PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${BLIS_INCLUDE_DIRS}) -endif() - -# prevent clutter in cache -MARK_AS_ADVANCED(BLIS_FOUND BLIS_LIBRARIES BLIS_INCLUDE_DIRS) diff --git a/cmake/modules/FindGenericBLAS.cmake b/cmake/modules/FindGenericBLAS.cmake deleted file mode 100644 index d542e58..0000000 --- a/cmake/modules/FindGenericBLAS.cmake +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2019 ETH Zurich -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# 3. Neither the name of the copyright holder nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - - -#.rst: -# FindGenericBLAS -# ----------- -# -# This module tries to find the GenericBLAS library. -# -# The following variables are set -# -# :: -# -# GenericBLAS_FOUND - True if blas is found -# GenericBLAS_LIBRARIES - The required libraries -# GenericBLAS_INCLUDE_DIRS - The required include directory -# -# The following import target is created -# -# :: -# -# GenericBLAS::blas - -#set paths to look for library from ROOT variables.If new policy is set, find_library() automatically uses them. 
-if(NOT POLICY CMP0074) - set(_GenericBLAS_PATHS ${GenericBLAS_ROOT} $ENV{GenericBLAS_ROOT}) -endif() - -find_library( - GenericBLAS_LIBRARIES - NAMES "blas" - HINTS ${_GenericBLAS_PATHS} -) -find_library( # optinally look for cblas library - not required - GenericBLAS_CBLAS_LIBRARIES - NAMES "cblas" - HINTS ${_GenericBLAS_PATHS} -) -find_path( - GenericBLAS_INCLUDE_DIRS - NAMES "cblas.h" - HINTS ${_GenericBLAS_PATHS} -) - -# check if found -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(GenericBLAS REQUIRED_VARS GenericBLAS_INCLUDE_DIRS GenericBLAS_LIBRARIES) -if(GenericBLAS_CBLAS_LIBRARIES) - list(APPEND GenericBLAS_LIBRARIES ${GenericBLAS_CBLAS_LIBRARIES}) -endif() - -# add target to link against -if(GenericBLAS_FOUND) - if(NOT TARGET GenericBLAS::blas) - add_library(GenericBLAS::blas INTERFACE IMPORTED) - endif() - set_property(TARGET GenericBLAS::blas PROPERTY INTERFACE_LINK_LIBRARIES ${GenericBLAS_LIBRARIES}) - set_property(TARGET GenericBLAS::blas PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${GenericBLAS_INCLUDE_DIRS}) -endif() - -# prevent clutter in cache -MARK_AS_ADVANCED(GenericBLAS_FOUND GenericBLAS_LIBRARIES GenericBLAS_INCLUDE_DIRS GenericBLAS_CBLAS_LIBRARIES) diff --git a/cmake/modules/FindMKL.cmake b/cmake/modules/FindMKL.cmake deleted file mode 100644 index 583b6e5..0000000 --- a/cmake/modules/FindMKL.cmake +++ /dev/null @@ -1,310 +0,0 @@ -# -# CMake recipes -# https://github.com/eth-cscs/cmake-recipes -# -# Copyright (c) 2018-2019, ETH Zurich -# BSD 3-Clause License. All rights reserved. 
-# -# Author: Teodor Nikolov (teodor.nikolov22@gmail.com) -# -#[=======================================================================[.rst: -FindMKL -------- - -The following conventions are used: - -intel / INTEL - Bindings for everything except GNU Fortran -gf / GF - GNU Fortran bindings -seq / SEQ - sequential MKL -omp / OMP - threaded MKL with OpenMP back end -tbb / TBB - threaded MKL with TBB back end -32bit / 32BIT - MKL 32 bit integer interface (used most often) -64bit / 64BIT - MKL 64 bit integer interface -mpich / MPICH - MPICH / IntelMPI BLACS back end -ompi / OMPI - OpenMPI BLACS back end -st / ST - static libraries -dyn / DYN - dynamic libraries - -The module attempts to define a target for each MKL configuration. The -configuration will not be available if there are missing library files or a -missing dependency. - -MKL Link line advisor: - https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor - -Note: Mixing GCC and Intel OpenMP backends is a bad idea. - -Search variables -^^^^^^^^^^^^^^^^ - -``MKLROOT`` - Environment variable set to MKL's root directory - -``MKL_ROOT`` - CMake variable set to MKL's root directory - -Example usage -^^^^^^^^^^^^^ - -To Find MKL: - - find_package(MKL REQUIRED) - -To check if target is available: - - if (TARGET mkl::scalapack_mpich_intel_32bit_omp_dyn) - ... - endif() - -To link to an available target (see list below): - - target_link_libraries(... mkl::scalapack_mpich_intel_32bit_omp_dyn) - -Note: dependencies are handled for you (MPI, OpenMP, ...) - -Imported targets -^^^^^^^^^^^^^^^^ - -MKL (BLAS, LAPACK, FFT) tarets: - - mkl::[gf|intel]_[32bit|64bit]_[seq|omp|tbb]_[st|dyn] e.g. - - mkl::mkl_intel_32bit_omp_dyn - -BLACS targets: - - mkl::blacs_[mpich|ompi]_[gf|intel]_[32bit|64bit]_[seq|omp|tbb]_[st|dyn] e.g. - - mkl::blacs_intel_mpich_32bit_seq_st - -ScaLAPACK targets: - - mkl::scalapack_[mpich|ompi]_[gf|intel]_[32bit|64bit]_[seq|omp|tbb]_[st|dyn] e.g. 
- - mkl::scalapack_mpich_intel_64bit_omp_dyn - -Result variables -^^^^^^^^^^^^^^^^ - -MKL_FOUND - -Not supported -^^^^^^^^^^^^^ - -- F95 interfaces - -#]=======================================================================] - -cmake_minimum_required(VERSION 3.10) - -# Modules -# -include(FindPackageHandleStandardArgs) - -if(NOT (CMAKE_C_COMPILER_LOADED OR - CMAKE_CXX_COMPILER_LOADED OR - CMAKE_Fortran_COMPILER_LOADED)) - message(FATAL_ERROR "FindMKL requires Fortran, C, or C++ to be enabled.") -endif() - -# Dependencies -# -find_package(Threads) -find_package(MPI COMPONENTS CXX) -find_package(OpenMP COMPONENTS CXX) - -# If MKL_ROOT is not set, set it via the env variable MKLROOT. -# -if(NOT DEFINED MKL_ROOT) - set(MKL_ROOT $ENV{MKLROOT} CACHE PATH "MKL's root directory.") -endif() - -# Determine MKL's library folder -# -set(_mkl_libpath_suffix "intel64" "lib/intel64" "lib" "lib64") -if(CMAKE_SIZEOF_VOID_P EQUAL 4) # 32 bit - set(_mkl_libpath_suffix "lib/ia32") -endif() - -if(WIN32) - list(APPEND _mkl_libpath_suffix "intel64_win" "lib/intel64_win") - set(_mkl_libname_prefix "") - set(_mkl_shared_lib "_dll.lib") - set(_mkl_static_lib ".lib") -elseif(APPLE) - list(APPEND _mkl_libpath_suffix "intel64_mac" "lib/intel64_mac") - set(_mkl_libname_prefix "lib") - set(_mkl_shared_lib ".dylib") - set(_mkl_static_lib ".a") -else() # LINUX - list(APPEND _mkl_libpath_suffix "intel64_lin" "lib/intel64_lin") - set(_mkl_libname_prefix "lib") - set(_mkl_shared_lib ".so") - set(_mkl_static_lib ".a") -endif() -set(_mkl_search_paths "${MKL_ROOT}" - "${MKL_ROOT}/lib" - "${MKL_ROOT}/mkl/lib" - "${MKL_ROOT}/compiler/lib") - -# Functions: finds both static and shared MKL libraries -# -function(__mkl_find_library _varname _libname) - find_library(${_varname}_DYN - NAMES ${_mkl_libname_prefix}${_libname}${_mkl_shared_lib} - HINTS ${_mkl_search_paths} - PATH_SUFFIXES ${_mkl_libpath_suffix}) - mark_as_advanced(${_varname}_DYN) - - find_library(${_varname}_ST - NAMES 
${_mkl_libname_prefix}${_libname}${_mkl_static_lib} - HINTS ${_mkl_search_paths} - PATH_SUFFIXES ${_mkl_libpath_suffix}) - mark_as_advanced(${_varname}_ST) -endfunction() - -# Find MKL headers -# -find_path(MKL_INCLUDE_DIR mkl.h - HINTS ${MKL_ROOT}/include - ${MKL_ROOT}/mkl/include) -mark_as_advanced(MKL_INCLUDE_DIR) - -# Group flags for static libraries on Linux (GNU, PGI, ICC -> same linker) -# -if(UNIX AND NOT APPLE) - set(_mkl_linker_pre_flags_ST "-Wl,--start-group") - set(_mkl_linker_post_flags_ST "-Wl,--end-group") -endif() - -# Core MKL -# -__mkl_find_library(MKL_CORE_LIB mkl_core) - -# Interface -# -__mkl_find_library(MKL_INTERFACE_INTEL_32BIT_LIB mkl_intel_lp64) -__mkl_find_library(MKL_INTERFACE_INTEL_64BIT_LIB mkl_intel_ilp64) -if(NOT APPLE AND CMAKE_Fortran_COMPILER_LOADED - AND CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") - __mkl_find_library(MKL_INTERFACE_GF_32BIT_LIB mkl_gf_lp64) - __mkl_find_library(MKL_INTERFACE_GF_64BIT_LIB mkl_gf_ilp64) -endif() - -# Threading -# -__mkl_find_library(MKL_SEQ_LIB mkl_sequential) -if(NOT APPLE AND (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR - CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR - CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")) - __mkl_find_library(MKL_OMP_LIB mkl_gnu_thread) -else() - __mkl_find_library(MKL_OMP_LIB mkl_intel_thread) -endif() -__mkl_find_library(MKL_TBB_LIB mkl_tbb_thread) - -# BLACS -# -if(APPLE) - __mkl_find_library(MKL_BLACS_MPICH_32BIT_LIB mkl_blacs_mpich_lp64) - __mkl_find_library(MKL_BLACS_MPICH_64BIT_LIB mkl_blacs_mpich_ilp64) -else() - __mkl_find_library(MKL_BLACS_MPICH_32BIT_LIB mkl_blacs_intelmpi_lp64) - __mkl_find_library(MKL_BLACS_MPICH_64BIT_LIB mkl_blacs_intelmpi_ilp64) -endif() -__mkl_find_library(MKL_BLACS_OMPI_32BIT_LIB mkl_blacs_openmpi_lp64) -__mkl_find_library(MKL_BLACS_OMPI_64BIT_LIB mkl_blacs_openmpi_ilp64) - -# ScaLAPACK -# -__mkl_find_library(MKL_SCALAPACK_32BIT_LIB mkl_scalapack_lp64) -__mkl_find_library(MKL_SCALAPACK_64BIT_LIB mkl_scalapack_ilp64) - -# Check if core libs were found -# 
-find_package_handle_standard_args(MKL REQUIRED_VARS MKL_INCLUDE_DIR - Threads_FOUND) - -# Sequential has no threading dependency. There is currently no TBB module -# shipped with CMake. The dependency is not accounted for. (FIXME) -# -set(_mkl_dep_found_SEQ TRUE) -set(_mkl_dep_found_TBB TRUE) -if (TARGET OpenMP::OpenMP_CXX) - set(_mkl_dep_OMP ${OpenMP_CXX_LIBRARIES}) - set(_mkl_dep_found_OMP TRUE) -endif() - -# Define all blas, blacs and scalapack -# -foreach(_libtype "ST" "DYN") - set(_mkl_core_lib ${MKL_CORE_LIB_${_libtype}}) - foreach(_bits "32BIT" "64BIT") - set(_mkl_scalapack_lib ${MKL_SCALAPACK_${_bits}_LIB_${_libtype}}) - foreach(_iface "INTEL" "GF") - set(_mkl_interface_lib ${MKL_INTERFACE_${_iface}_${_bits}_LIB_${_libtype}}) - foreach(_threading "SEQ" "OMP" "TBB") - set(_mkl_threading_lib ${MKL_${_threading}_LIB_${_libtype}}) - - string(TOLOWER "${_iface}_${_bits}_${_threading}_${_libtype}" _tgt_config) - set(_mkl_tgt mkl::${_tgt_config}) - - if(MKL_FOUND - AND _mkl_interface_lib - AND _mkl_threading_lib - AND _mkl_core_lib - AND _mkl_dep_found_${_threading} - AND NOT TARGET ${_mkl_tgt}) - set(_mkl_libs "${_mkl_linker_pre_flags_${_threading}}" - "${_mkl_interface_lib}" - "${_mkl_threading_lib}" - "${_mkl_core_lib}" - "${_mkl_linker_post_flags_${_threading}}" - "${_mkl_dep_${_threading}}" - "Threads::Threads") - add_library(${_mkl_tgt} INTERFACE IMPORTED) - set_target_properties(${_mkl_tgt} PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${MKL_INCLUDE_DIR}" - INTERFACE_LINK_LIBRARIES "${_mkl_libs}") - endif() - - foreach(_mpi_impl "MPICH" "OMPI") - set(_mkl_blacs_lib ${MKL_BLACS_${_mpi_impl}_${_bits}_LIB_${_libtype}}) - - string(TOLOWER "${_mpi_impl}_${_iface}_${_bits}_${_threading}_${_libtype}" _tgt_config) - set(_blacs_tgt mkl::blacs_${_tgt_config}) - set(_scalapack_tgt mkl::scalapack_${_tgt_config}) - - if(_mkl_blacs_lib - AND TARGET ${_mkl_tgt} - AND TARGET MPI::MPI_CXX - AND NOT TARGET ${_blacs_tgt}) - set(_blacs_libs 
"${_mkl_linker_pre_flags_${_libtype}}" - "${_mkl_interface_lib}" - "${_mkl_threading_lib}" - "${_mkl_core_lib}" - "${_mkl_blacs_lib}" - "${_mkl_linker_post_flags_${_libtype}}" - "MPI::MPI_CXX" - "${_mkl_dep_${_threading}}" - "Threads::Threads") - add_library(${_blacs_tgt} INTERFACE IMPORTED) - set_target_properties(${_blacs_tgt} PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${MKL_INCLUDE_DIR}" - INTERFACE_LINK_LIBRARIES "${_blacs_libs}") - endif() - - if(_mkl_scalapack_lib - AND TARGET ${_blacs_tgt} - AND NOT TARGET ${_scalapack_tgt}) - set(_scalapack_libs "${_mkl_scalapack_lib}" - "${_blacs_tgt}") - add_library(${_scalapack_tgt} INTERFACE IMPORTED) - set_target_properties(${_scalapack_tgt} PROPERTIES - INTERFACE_LINK_LIBRARIES "${_scalapack_libs}") - endif() - endforeach() - endforeach() - endforeach() - endforeach() -endforeach() diff --git a/cmake/modules/FindOPENBLAS.cmake b/cmake/modules/FindOPENBLAS.cmake deleted file mode 100644 index 16ab239..0000000 --- a/cmake/modules/FindOPENBLAS.cmake +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2019 ETH Zurich -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# 3. Neither the name of the copyright holder nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - - -#.rst: -# FindOPENBLAS -# ----------- -# -# This module tries to find the OPENBLAS library. -# -# The following variables are set -# -# :: -# -# OPENBLAS_FOUND - True if openblas is found -# OPENBLAS_LIBRARIES - The required libraries -# OPENBLAS_INCLUDE_DIRS - The required include directory -# -# The following import target is created -# -# :: -# -# OPENBLAS::openblas - -#set paths to look for library from ROOT variables.If new policy is set, find_library() automatically uses them. 
-if(NOT POLICY CMP0074) - set(_OPENBLAS_PATHS ${OPENBLAS_ROOT} $ENV{OPENBLAS_ROOT}) -endif() - -find_library( - OPENBLAS_LIBRARIES - NAMES "openblas" - HINTS ${_OPENBLAS_PATHS} - PATH_SUFFIXES "openblas/lib" "openblas/lib64" "openblas" -) -find_path( - OPENBLAS_INCLUDE_DIRS - NAMES "cblas-openblas.h" "cblas_openblas.h" "cblas.h" - HINTS ${_OPENBLAS_PATHS} - PATH_SUFFIXES "openblas" "openblas/include" "include/openblas" -) - -# check if found -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(OPENBLAS REQUIRED_VARS OPENBLAS_INCLUDE_DIRS OPENBLAS_LIBRARIES) - -# add target to link against -if(OPENBLAS_FOUND) - if(NOT TARGET OPENBLAS::openblas) - add_library(OPENBLAS::openblas INTERFACE IMPORTED) - endif() - set_property(TARGET OPENBLAS::openblas PROPERTY INTERFACE_LINK_LIBRARIES ${OPENBLAS_LIBRARIES}) - set_property(TARGET OPENBLAS::openblas PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${OPENBLAS_INCLUDE_DIRS}) -endif() - -# prevent clutter in cache -MARK_AS_ADVANCED(OPENBLAS_FOUND OPENBLAS_LIBRARIES OPENBLAS_INCLUDE_DIRS) diff --git a/cmake/modules/FindSCI.cmake b/cmake/modules/FindSCI.cmake index 031e195..260f07a 100644 --- a/cmake/modules/FindSCI.cmake +++ b/cmake/modules/FindSCI.cmake @@ -124,6 +124,5 @@ if(SCI_FOUND) endif() endif() - # prevent clutter in cache -MARK_AS_ADVANCED(SCI_FOUND SCI_LIBRARIES SCI_INCLUDE_DIRS) +MARK_AS_ADVANCED(SCI_FOUND SCI_LIBRARIES SCI_MPI_LIBRARIES SCI_INCLUDE_DIRS) diff --git a/cmake/util/blas_dgemm_symbol.h b/cmake/util/blas_dgemm_symbol.h deleted file mode 100644 index 3b20ebb..0000000 --- a/cmake/util/blas_dgemm_symbol.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef SPLA_BLAS_GEMM_SYMBOL_H -extern "C" { - -enum CBLAS_ORDER { CblasRowMajor = 101, CblasColMajor = 102 }; -enum CBLAS_TRANSPOSE { CblasNoTrans = 111, CblasTrans = 112, CblasConjTrans = 113 }; - -void cblas_dgemm(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE transA, enum CBLAS_TRANSPOSE transB, - int M, int N, int K, double alpha, const double *A, int lda, 
const double *B, - int ldb, double beta, double *C, int ldc); -} - -#endif diff --git a/include/spla/config.h.in b/include/spla/config.h.in index bee896a..44f8f71 100644 --- a/include/spla/config.h.in +++ b/include/spla/config.h.in @@ -36,15 +36,7 @@ #cmakedefine SPLA_CUDA #cmakedefine SPLA_ROCM #cmakedefine SPLA_OMP -#cmakedefine SPLA_BLAS_MKL -#cmakedefine SPLA_BLAS_ARMPL -#cmakedefine SPLA_BLAS_OPENBLAS -#cmakedefine SPLA_BLAS_ATLAS -#cmakedefine SPLA_BLAS_SCI -#cmakedefine SPLA_BLAS_BLIS -#cmakedefine SPLA_BLAS_GENERIC -#cmakedefine SPLA_BLAS_UNKNOWN -#cmakedefine SPLA_BLAS_HEADER_NAME <@SPLA_BLAS_HEADER_NAME@> +#cmakedefine SPLA_CBLAS #cmakedefine SPLA_RESTRICT_ATTR @SPLA_RESTRICT_ATTR@ #include "spla/spla_export.h" diff --git a/include/spla/spla.f90 b/include/spla/spla.f90 index fe32448..98b2f6d 100644 --- a/include/spla/spla.f90 +++ b/include/spla/spla.f90 @@ -87,12 +87,6 @@ integer(c_int) function spla_ctx_processing_unit(ctx, processingUnit) bind(C) integer(c_int), intent(out) :: processingUnit end function - integer(c_int) function spla_ctx_num_threads(ctx, numThreads) bind(C) - use iso_c_binding - type(c_ptr), value :: ctx - integer(c_int), intent(out) :: numThreads - end function - integer(c_int) function spla_ctx_num_tiles(ctx, numTiles) bind(C) use iso_c_binding type(c_ptr), value :: ctx @@ -141,12 +135,6 @@ integer(c_int) function spla_ctx_allocated_memory_gpu(ctx, memSize) bind(C) integer(c_int_least64_t), intent(out) :: memSize end function - integer(c_int) function spla_ctx_set_num_threads(ctx, numThreads) bind(C) - use iso_c_binding - type(c_ptr), value :: ctx - integer(c_int), value :: numThreads - end function - integer(c_int) function spla_ctx_set_num_tiles(ctx, numTiles) bind(C) use iso_c_binding type(c_ptr), value :: ctx diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e1841e9..fadba90 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,69 +1,69 @@ set(SPLA_SOURCE_FILES - block_generation/block_cyclic_generator.cpp - 
util/blas_interface.cpp - timing/rt_graph.cpp - timing/timing.cpp - pgemm_ssb/pgemm_ssb_host.cpp - pgemm_ssb/ring_ssb_host.cpp - pgemm_ssb/add_kernel.cpp - pgemm_sbs/pgemm_sbs_host.cpp - pgemm_sbs/ring_sbs_host.cpp - gemm/gemm_host.cpp - spla/matrix_distribution.cpp - spla/matrix_distribution_internal.cpp - spla/pgemm_ssb.cpp - spla/pgemm_ssbtr.cpp - spla/pgemm_sbs.cpp - spla/context.cpp - spla/gemm.cpp - ) + block_generation/block_cyclic_generator.cpp + util/blas_interface.cpp + timing/rt_graph.cpp + timing/timing.cpp + pgemm_ssb/pgemm_ssb_host.cpp + pgemm_ssb/ring_ssb_host.cpp + pgemm_ssb/add_kernel.cpp + pgemm_sbs/pgemm_sbs_host.cpp + pgemm_sbs/ring_sbs_host.cpp + gemm/gemm_host.cpp + spla/matrix_distribution.cpp + spla/matrix_distribution_internal.cpp + spla/pgemm_ssb.cpp + spla/pgemm_ssbtr.cpp + spla/pgemm_sbs.cpp + spla/context.cpp + spla/gemm.cpp + ) if(SPLA_CUDA OR SPLA_ROCM) - list(APPEND SPLA_SOURCE_FILES - pgemm_ssb/ring_ssb_gpu.cpp - pgemm_ssb/pgemm_ssb_gpu.cpp - pgemm_sbs/pgemm_sbs_gpu.cpp - pgemm_sbs/ring_sbs_gpu.cpp - gemm/gemm_gpu.cpp - gpu_util/multiply_gpu.cpp - ) + list(APPEND SPLA_SOURCE_FILES + pgemm_ssb/ring_ssb_gpu.cpp + pgemm_ssb/pgemm_ssb_gpu.cpp + pgemm_sbs/pgemm_sbs_gpu.cpp + pgemm_sbs/ring_sbs_gpu.cpp + gemm/gemm_gpu.cpp + gpu_util/multiply_gpu.cpp + ) endif() # Creates spla library with given name. All common target modifications should be done here. macro(spla_create_library _TARGET_NAME) - add_library(${_TARGET_NAME} ${SPLA_LIBRARY_TYPE} - ${SPLA_SOURCE_FILES} - ) + add_library(${_TARGET_NAME} ${SPLA_LIBRARY_TYPE} + ${SPLA_SOURCE_FILES} + ) - set_property(TARGET ${_TARGET_NAME} PROPERTY VERSION ${SPLA_VERSION}) - set_property(TARGET ${_TARGET_NAME} PROPERTY SOVERSION ${SPLA_SO_VERSION}) + set_property(TARGET ${_TARGET_NAME} PROPERTY VERSION ${SPLA_VERSION}) + set_property(TARGET ${_TARGET_NAME} PROPERTY SOVERSION ${SPLA_SO_VERSION}) - # Don't export any symbols of external static libaries. Only works on linux. 
- if(UNIX AND NOT APPLE) - if(${CMAKE_VERSION} VERSION_LESS "3.13.5") - target_link_libraries(${_TARGET_NAME} PRIVATE "-Wl,--exclude-libs,ALL") - else() - target_link_options(${_TARGET_NAME} PRIVATE "-Wl,--exclude-libs,ALL") - endif() - endif() + # Don't export any symbols of external static libraries. Only works on linux. + if(UNIX AND NOT APPLE) + if(${CMAKE_VERSION} VERSION_LESS "3.13.5") + target_link_libraries(${_TARGET_NAME} PRIVATE "-Wl,--exclude-libs,ALL") + else() + target_link_options(${_TARGET_NAME} PRIVATE "-Wl,--exclude-libs,ALL") + endif() + endif() - target_compile_options(${_TARGET_NAME} PRIVATE ${SPLA_DEFINITIONS} ${SPLA_EXTERNAL_COMPILE_OPTIONS}) - target_include_directories(${_TARGET_NAME} PRIVATE ${SPLA_EXTERNAL_INCLUDE_DIRS} ${SPLA_INCLUDE_DIRS}) + target_compile_options(${_TARGET_NAME} PRIVATE ${SPLA_DEFINITIONS} ${SPLA_EXTERNAL_COMPILE_OPTIONS}) + target_include_directories(${_TARGET_NAME} PRIVATE ${SPLA_EXTERNAL_INCLUDE_DIRS} ${SPLA_INCLUDE_DIRS}) - target_link_libraries(${_TARGET_NAME} PRIVATE ${SPLA_EXTERNAL_LIBS}) - # SPLA inteface needs MPI to be compiled. Avoid requiring CXX language for installed cmake config files, so only make public for build. - target_link_libraries(${_TARGET_NAME} PUBLIC $) + target_link_libraries(${_TARGET_NAME} PRIVATE ${SPLA_EXTERNAL_LIBS}) + # SPLA interface needs MPI to be compiled. Avoid requiring CXX language for installed cmake config files, so only make public for build. + target_link_libraries(${_TARGET_NAME} PUBLIC $) - target_include_directories(${_TARGET_NAME} INTERFACE ${SPLA_INTERFACE_INCLUDE_DIRS}) - target_include_directories(${_TARGET_NAME} INTERFACE $) # for install(EXPORT ...) - target_include_directories(${_TARGET_NAME} INTERFACE $ $) # for export(...) 
- if(${SPLA_FORTRAN}) # Add include directory for fortran module - target_include_directories(${_TARGET_NAME} INTERFACE $) - target_include_directories(${_TARGET_NAME} INTERFACE $) - endif() + target_include_directories(${_TARGET_NAME} INTERFACE ${SPLA_INTERFACE_INCLUDE_DIRS}) + target_include_directories(${_TARGET_NAME} INTERFACE $) # for install(EXPORT ...) + target_include_directories(${_TARGET_NAME} INTERFACE $ $) # for export(...) + if(${SPLA_FORTRAN}) # Add include directory for fortran module + target_include_directories(${_TARGET_NAME} INTERFACE $) + target_include_directories(${_TARGET_NAME} INTERFACE $) + endif() endmacro() @@ -71,94 +71,94 @@ spla_create_library(spla) set_target_properties(spla PROPERTIES VISIBILITY_INLINES_HIDDEN TRUE CXX_VISIBILITY_PRESET hidden) if(SPLA_BUILD_TESTS) - # create library with default visibility if tests are build, to allow linking to internal symbols - spla_create_library(spla_test) - set_target_properties(spla_test PROPERTIES VISIBILITY_INLINES_HIDDEN FALSE CXX_VISIBILITY_PRESET default) - target_compile_options(spla_test PUBLIC -DSPLA_STATIC_DEFINE) - # enable internal timings - target_compile_options(spla_test PUBLIC -DSPLA_TIMING) + # create library with default visibility if tests are built, to allow linking to internal symbols + spla_create_library(spla_test) + set_target_properties(spla_test PROPERTIES VISIBILITY_INLINES_HIDDEN FALSE CXX_VISIBILITY_PRESET default) + target_compile_options(spla_test PUBLIC -DSPLA_STATIC_DEFINE) + # enable internal timings + target_compile_options(spla_test PUBLIC -DSPLA_TIMING) endif() # generate export header to control symbol visibility include(GenerateExportHeader) generate_export_header(spla) configure_file("${CMAKE_CURRENT_BINARY_DIR}/spla_export.h" - "${PROJECT_BINARY_DIR}/spla/spla_export.h" - COPYONLY + "${PROJECT_BINARY_DIR}/spla/spla_export.h" + COPYONLY ) # build fortran module if(SPLA_FORTRAN) - add_library(spla_fortran OBJECT 
${PROJECT_SOURCE_DIR}/include/spla/spla.f90) + add_library(spla_fortran OBJECT ${PROJECT_SOURCE_DIR}/include/spla/spla.f90) endif() # set packge config names get_target_property(_LIB_TYPE spla TYPE) if(_LIB_TYPE STREQUAL "STATIC_LIBRARY") - set(SPLA_VERSION_FILE "SPLAStaticConfigVersion.cmake") - set(SPLA_CONFIG_FILE "SPLAStaticConfig.cmake") - set(SPLA_TARGETS_FILE "SPLAStaticTargets.cmake") + set(SPLA_VERSION_FILE "SPLAStaticConfigVersion.cmake") + set(SPLA_CONFIG_FILE "SPLAStaticConfig.cmake") + set(SPLA_TARGETS_FILE "SPLAStaticTargets.cmake") else() - set(SPLA_VERSION_FILE "SPLASharedConfigVersion.cmake") - set(SPLA_CONFIG_FILE "SPLASharedConfig.cmake") - set(SPLA_TARGETS_FILE "SPLASharedTargets.cmake") + set(SPLA_VERSION_FILE "SPLASharedConfigVersion.cmake") + set(SPLA_CONFIG_FILE "SPLASharedConfig.cmake") + set(SPLA_TARGETS_FILE "SPLASharedTargets.cmake") endif() # generate cmake package include(CMakePackageConfigHelpers) write_basic_package_version_file( - "${PROJECT_BINARY_DIR}/${SPLA_VERSION_FILE}" - VERSION ${Upstream_VERSION} - COMPATIBILITY AnyNewerVersion + "${PROJECT_BINARY_DIR}/${SPLA_VERSION_FILE}" + VERSION ${Upstream_VERSION} + COMPATIBILITY AnyNewerVersion ) export(TARGETS spla NAMESPACE SPLA:: FILE ${PROJECT_BINARY_DIR}/${SPLA_TARGETS_FILE}) configure_file(${PROJECT_SOURCE_DIR}/cmake/${SPLA_CONFIG_FILE} - "${PROJECT_BINARY_DIR}/${SPLA_CONFIG_FILE}" - @ONLY + "${PROJECT_BINARY_DIR}/${SPLA_CONFIG_FILE}" + @ONLY ) configure_file(${PROJECT_SOURCE_DIR}/cmake/SPLAConfig.cmake - "${PROJECT_BINARY_DIR}/SPLAConfig.cmake" - COPYONLY + "${PROJECT_BINARY_DIR}/SPLAConfig.cmake" + COPYONLY ) configure_file(${PROJECT_SOURCE_DIR}/cmake/SPLAConfigVersion.cmake - "${PROJECT_BINARY_DIR}/SPLAConfigVersion.cmake" - COPYONLY + "${PROJECT_BINARY_DIR}/SPLAConfigVersion.cmake" + COPYONLY ) configure_file(${PROJECT_SOURCE_DIR}/cmake/SPLATargets.cmake - "${PROJECT_BINARY_DIR}/SPLATargets.cmake" - COPYONLY + "${PROJECT_BINARY_DIR}/SPLATargets.cmake" + COPYONLY ) 
configure_file(${PROJECT_SOURCE_DIR}/cmake/SPLA.pc.in - "${PROJECT_BINARY_DIR}/SPLA.pc" - @ONLY + "${PROJECT_BINARY_DIR}/SPLA.pc" + @ONLY ) # installation commands if(SPLA_INSTALL) - install(TARGETS spla DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT SPLATargets) - install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/spla DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" PATTERN "*.f90") - install(FILES ${PROJECT_BINARY_DIR}/spla/config.h "${PROJECT_BINARY_DIR}/spla/spla_export.h" DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/spla) - install(EXPORT SPLATargets NAMESPACE SPLA:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/SPLA FILE ${SPLA_TARGETS_FILE}) - install( - FILES - "${PROJECT_BINARY_DIR}/SPLAConfig.cmake" - "${PROJECT_BINARY_DIR}/SPLATargets.cmake" - "${PROJECT_BINARY_DIR}/SPLAConfigVersion.cmake" - "${PROJECT_BINARY_DIR}/${SPLA_CONFIG_FILE}" - "${PROJECT_BINARY_DIR}/${SPLA_VERSION_FILE}" - DESTINATION - ${CMAKE_INSTALL_LIBDIR}/cmake/SPLA - ) - - install(FILES ${PROJECT_BINARY_DIR}/SPLA.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) - - install(DIRECTORY "${PROJECT_SOURCE_DIR}/cmake/modules" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/SPLA" + install(TARGETS spla DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT SPLATargets) + install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/spla DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" PATTERN "*.f90") + install(FILES ${PROJECT_BINARY_DIR}/spla/config.h "${PROJECT_BINARY_DIR}/spla/spla_export.h" DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/spla) + install(EXPORT SPLATargets NAMESPACE SPLA:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/SPLA FILE ${SPLA_TARGETS_FILE}) + install( + FILES + "${PROJECT_BINARY_DIR}/SPLAConfig.cmake" + "${PROJECT_BINARY_DIR}/SPLATargets.cmake" + "${PROJECT_BINARY_DIR}/SPLAConfigVersion.cmake" + "${PROJECT_BINARY_DIR}/${SPLA_CONFIG_FILE}" + "${PROJECT_BINARY_DIR}/${SPLA_VERSION_FILE}" + DESTINATION + ${CMAKE_INSTALL_LIBDIR}/cmake/SPLA + ) + + 
install(FILES ${PROJECT_BINARY_DIR}/SPLA.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) + + install(DIRECTORY "${PROJECT_SOURCE_DIR}/cmake/modules" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/SPLA" FILES_MATCHING PATTERN "*.cmake") - if(SPLA_FORTRAN) - install(FILES ${PROJECT_BINARY_DIR}/src/spla.mod DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/spla) - endif() + if(SPLA_FORTRAN) + install(FILES ${PROJECT_BINARY_DIR}/src/spla.mod DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/spla) + endif() endif() diff --git a/src/gemm/gemm_host.cpp b/src/gemm/gemm_host.cpp index 85402ea..6e5cbea 100644 --- a/src/gemm/gemm_host.cpp +++ b/src/gemm/gemm_host.cpp @@ -68,8 +68,7 @@ void gemm_host(SplaOperation opA, SplaOperation opB, IntType m, IntType n, IntTy if (ldb < 1) ldb = 1; if (ldc < 1) ldc = 1; - blas::gemm(blas::Order::COL_MAJOR, opBlasA, opBlasB, m, n, k, alpha, A, lda, B, ldb, beta, C, - ldc); + blas::gemm(opBlasA, opBlasB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } template auto gemm_host(SplaOperation opA, SplaOperation opB, IntType m, IntType n, diff --git a/src/util/blas_interface.cpp b/src/util/blas_interface.cpp index a88c567..3fd9f95 100644 --- a/src/util/blas_interface.cpp +++ b/src/util/blas_interface.cpp @@ -31,37 +31,9 @@ #include "spla/config.h" -namespace spla { -namespace blas { -// OpenBlas uses different types -#if defined(SPLA_BLAS_OPENBLAS) - -using FloatComplex = float; -using DoubleComplex = double; - -#elif defined(SPLA_BLAS_ARMPL) - -using FloatComplex = armpl_singlecomplex_t; -using DoubleComplex = armpl_doublecomplex_t; - -#else - -using FloatComplex = void; -using DoubleComplex = void; - -#endif -} // namespace blas -} // namespace spla - -// use blas header if found -#if defined(SPLA_BLAS_HEADER_NAME) - -#include SPLA_BLAS_HEADER_NAME - -#else - extern "C" { +#ifdef SPLA_CBLAS enum CBLAS_ORDER { CblasRowMajor = 101, CblasColMajor = 102 }; enum CBLAS_TRANSPOSE { CblasNoTrans = 111, CblasTrans = 112, CblasConjTrans = 113 }; @@ -80,13 +52,31 @@ void 
cblas_cgemm(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE transA, enum CBLAS void cblas_zgemm(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE transA, enum CBLAS_TRANSPOSE transB, int M, int N, int K, const void *alpha, const void *A, int lda, const void *B, int ldb, const void *beta, void *C, int ldc); -} +#else + +void sgemm_(const char* TRANSA, const char* TRANSB, const int* M, const int* N, const int* K, + const void* ALPHA, const void* A, const int* LDA, const void* B, const int* LDB, + const void* BETA, void* C, const int* LDC, int TRANSA_len, int TRANSB_len); + +void dgemm_(const char* TRANSA, const char* TRANSB, const int* M, const int* N, const int* K, + const void* ALPHA, const void* A, const int* LDA, const void* B, const int* LDB, + const void* BETA, void* C, const int* LDC, int TRANSA_len, int TRANSB_len); + +void cgemm_(const char* TRANSA, const char* TRANSB, const int* M, const int* N, const int* K, + const void* ALPHA, const void* A, const int* LDA, const void* B, const int* LDB, + const void* BETA, void* C, const int* LDC, int TRANSA_len, int TRANSB_len); + +void zgemm_(const char* TRANSA, const char* TRANSB, const int* M, const int* N, const int* K, + const void* ALPHA, const void* A, const int* LDA, const void* B, const int* LDB, + const void* BETA, void* C, const int* LDC, int TRANSA_len, int TRANSB_len); #endif +} namespace spla { namespace blas { +#ifdef SPLA_CBLAS static auto convert_operation(const Operation &op) -> CBLAS_TRANSPOSE { switch (op) { case Operation::TRANS: @@ -97,58 +87,108 @@ static auto convert_operation(const Operation &op) -> CBLAS_TRANSPOSE { return CblasNoTrans; } } +#else +static auto convert_operation(const Operation& op) -> const char* { + switch (op) { + case Operation::TRANS: + return "T"; + case Operation::CONJ_TRANS: + return "C"; + default: + return "N"; + } +} +#endif -auto gemm(Order order, Operation transA, Operation transB, IntType M, IntType N, IntType K, - float alpha, const float *A, IntType lda, const float *B, 
IntType ldb, float beta, - float *C, IntType ldc) -> void { - CBLAS_ORDER cblasOrder = order == Order::COL_MAJOR ? CblasColMajor : CblasRowMajor; +auto gemm(Operation transA, Operation transB, IntType M, IntType N, IntType K, float alpha, + const float *A, IntType lda, const float *B, IntType ldb, float beta, float *C, + IntType ldc) -> void { +#ifdef SPLA_CBLAS + CBLAS_ORDER cblasOrder = CblasColMajor; CBLAS_TRANSPOSE cblasTransA = convert_operation(transA); CBLAS_TRANSPOSE cblasTransB = convert_operation(transB); cblas_sgemm(cblasOrder, cblasTransA, cblasTransB, static_cast(M), static_cast(N), static_cast(K), alpha, A, static_cast(lda), B, static_cast(ldb), beta, C, static_cast(ldc)); +#else + auto intM = static_cast(M); + auto intN = static_cast(N); + auto intK = static_cast(K); + auto intLda = static_cast(lda); + auto intLdb = static_cast(ldb); + auto intLdc = static_cast(ldc); + sgemm_(convert_operation(transA), convert_operation(transB), &intM, &intN, &intK, &alpha, A, + &intLda, B, &intLdb, &beta, C, &intLdc, 1, 1); +#endif } -auto gemm(Order order, Operation transA, Operation transB, IntType M, IntType N, IntType K, - double alpha, const double *A, IntType lda, const double *B, IntType ldb, double beta, - double *C, IntType ldc) -> void { - CBLAS_ORDER cblasOrder = order == Order::COL_MAJOR ? 
CblasColMajor : CblasRowMajor; +auto gemm(Operation transA, Operation transB, IntType M, IntType N, IntType K, double alpha, + const double *A, IntType lda, const double *B, IntType ldb, double beta, double *C, + IntType ldc) -> void { +#ifdef SPLA_CBLAS + CBLAS_ORDER cblasOrder = CblasColMajor; CBLAS_TRANSPOSE cblasTransA = convert_operation(transA); CBLAS_TRANSPOSE cblasTransB = convert_operation(transB); cblas_dgemm(cblasOrder, cblasTransA, cblasTransB, static_cast(M), static_cast(N), static_cast(K), alpha, A, static_cast(lda), B, static_cast(ldb), beta, C, static_cast(ldc)); +#else + auto intM = static_cast(M); + auto intN = static_cast(N); + auto intK = static_cast(K); + auto intLda = static_cast(lda); + auto intLdb = static_cast(ldb); + auto intLdc = static_cast(ldc); + dgemm_(convert_operation(transA), convert_operation(transB), &intM, &intN, &intK, &alpha, A, + &intLda, B, &intLdb, &beta, C, &intLdc, 1, 1); +#endif } -auto gemm(Order order, Operation transA, Operation transB, IntType M, IntType N, IntType K, +auto gemm(Operation transA, Operation transB, IntType M, IntType N, IntType K, std::complex alpha, const std::complex *A, IntType lda, const std::complex *B, IntType ldb, std::complex beta, std::complex *C, IntType ldc) -> void { - CBLAS_ORDER cblasOrder = order == Order::COL_MAJOR ? 
CblasColMajor : CblasRowMajor; +#ifdef SPLA_CBLAS + CBLAS_ORDER cblasOrder = CblasColMajor; CBLAS_TRANSPOSE cblasTransA = convert_operation(transA); CBLAS_TRANSPOSE cblasTransB = convert_operation(transB); cblas_cgemm(cblasOrder, cblasTransA, cblasTransB, static_cast(M), static_cast(N), - static_cast(K), reinterpret_cast(&alpha), - reinterpret_cast(A), static_cast(lda), - reinterpret_cast(B), static_cast(ldb), - reinterpret_cast(&beta), reinterpret_cast(C), - static_cast(ldc)); + static_cast(K), &alpha, A, static_cast(lda), B, static_cast(ldb), + &beta, C, static_cast(ldc)); +#else + auto intM = static_cast(M); + auto intN = static_cast(N); + auto intK = static_cast(K); + auto intLda = static_cast(lda); + auto intLdb = static_cast(ldb); + auto intLdc = static_cast(ldc); + cgemm_(convert_operation(transA), convert_operation(transB), &intM, &intN, &intK, &alpha, A, + &intLda, B, &intLdb, &beta, C, &intLdc, 1, 1); +#endif } -auto gemm(Order order, Operation transA, Operation transB, IntType M, IntType N, IntType K, +auto gemm(Operation transA, Operation transB, IntType M, IntType N, IntType K, std::complex alpha, const std::complex *A, IntType lda, const std::complex *B, IntType ldb, std::complex beta, std::complex *C, IntType ldc) -> void { - CBLAS_ORDER cblasOrder = order == Order::COL_MAJOR ? 
CblasColMajor : CblasRowMajor; +#ifdef SPLA_CBLAS + CBLAS_ORDER cblasOrder = CblasColMajor; CBLAS_TRANSPOSE cblasTransA = convert_operation(transA); CBLAS_TRANSPOSE cblasTransB = convert_operation(transB); cblas_zgemm(cblasOrder, cblasTransA, cblasTransB, static_cast(M), static_cast(N), - static_cast(K), reinterpret_cast(&alpha), - reinterpret_cast(A), static_cast(lda), - reinterpret_cast(B), static_cast(ldb), - reinterpret_cast(&beta), reinterpret_cast(C), - static_cast(ldc)); + static_cast(K), &alpha, A, static_cast(lda), B, static_cast(ldb), + &beta, C, static_cast(ldc)); +#else + auto intM = static_cast(M); + auto intN = static_cast(N); + auto intK = static_cast(K); + auto intLda = static_cast(lda); + auto intLdb = static_cast(ldb); + auto intLdc = static_cast(ldc); + zgemm_(convert_operation(transA), convert_operation(transB), &intM, &intN, &intK, &alpha, A, + &intLda, B, &intLdb, &beta, C, &intLdc, 1, 1); +#endif } } // namespace blas diff --git a/src/util/blas_interface.hpp b/src/util/blas_interface.hpp index d3cfb32..ae5c3ec 100644 --- a/src/util/blas_interface.hpp +++ b/src/util/blas_interface.hpp @@ -36,23 +36,22 @@ namespace spla { namespace blas { -enum class Order { ROW_MAJOR = 101, COL_MAJOR = 102 }; enum class Operation { NONE = 111, TRANS = 112, CONJ_TRANS = 113 }; -auto gemm(Order order, Operation transA, Operation transB, IntType M, IntType N, IntType K, - float alpha, const float *A, IntType lda, const float *B, IntType ldb, float beta, - float *C, IntType ldc) -> void; +auto gemm(Operation transA, Operation transB, IntType M, IntType N, IntType K, float alpha, + const float *A, IntType lda, const float *B, IntType ldb, float beta, float *C, + IntType ldc) -> void; -auto gemm(Order order, Operation transA, Operation transB, IntType M, IntType N, IntType K, - double alpha, const double *A, IntType lda, const double *B, IntType ldb, double beta, - double *C, IntType ldc) -> void; +auto gemm(Operation transA, Operation transB, IntType M, IntType N, 
IntType K, double alpha, + const double *A, IntType lda, const double *B, IntType ldb, double beta, double *C, + IntType ldc) -> void; -auto gemm(Order order, Operation transA, Operation transB, IntType M, IntType N, IntType K, +auto gemm(Operation transA, Operation transB, IntType M, IntType N, IntType K, std::complex alpha, const std::complex *A, IntType lda, const std::complex *B, IntType ldb, std::complex beta, std::complex *C, IntType ldc) -> void; -auto gemm(Order order, Operation transA, Operation transB, IntType M, IntType N, IntType K, +auto gemm(Operation transA, Operation transB, IntType M, IntType N, IntType K, std::complex alpha, const std::complex *A, IntType lda, const std::complex *B, IntType ldb, std::complex beta, std::complex *C, IntType ldc) -> void; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7649385..d9be89e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -3,7 +3,7 @@ set(SPLA_TEST_LIBRARIES) # update time stamps when using FetchContent if(POLICY CMP0135) - cmake_policy(SET CMP0135 NEW) + cmake_policy(SET CMP0135 NEW) endif() set(BUILD_GMOCK OFF CACHE BOOL "") @@ -13,12 +13,12 @@ include(FetchContent) # add googletest if(SPLA_BUNDLED_GOOGLETEST) - FetchContent_Declare( - googletest - URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz - URL_MD5 95b29f0038ec84a611df951d74d99897 - ) - FetchContent_MakeAvailable(googletest) + FetchContent_Declare( + googletest + URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz + URL_MD5 95b29f0038ec84a611df951d74d99897 + ) + FetchContent_MakeAvailable(googletest) else() find_package(GTest CONFIG REQUIRED) endif() @@ -26,41 +26,26 @@ list(APPEND SPLA_TEST_LIBRARIES GTest::gtest) # add command line parser if(SPLA_BUNDLED_CLI11) - FetchContent_Declare( - cli11 - URL https://github.com/CLIUtils/CLI11/archive/refs/tags/v2.3.2.tar.gz - URL_MD5 b80cb645dee25982110b068b426363ff - ) - FetchContent_MakeAvailable(cli11) + FetchContent_Declare( 
+ cli11 + URL https://github.com/CLIUtils/CLI11/archive/refs/tags/v2.3.2.tar.gz + URL_MD5 b80cb645dee25982110b068b426363ff + ) + FetchContent_MakeAvailable(cli11) else() - find_package(CLI11 CONFIG REQUIRED) + find_package(CLI11 CONFIG REQUIRED) endif() list(APPEND SPLA_TEST_LIBRARIES CLI11::CLI11) - -if(SPLA_BLAS_MKL) - set(_MKL_MPI mpich) - if(MPIEXEC_EXECUTABLE) - execute_process(COMMAND ${MPIEXEC_EXECUTABLE} --version OUTPUT_VARIABLE _MPIRUN_OUTPUT) - string(FIND "${_MPIRUN_OUTPUT}" "Open MPI" _OMPI_POS) - if(NOT _OMPI_POS STREQUAL "-1") - set(_MKL_MPI ompi) - endif() - endif() - list(APPEND SPLA_TEST_LIBRARIES mkl::scalapack_${_MKL_MPI}_${SPLA_MKL_TARGET}) -elseif(SPLA_BLAS_SCI) - list(APPEND SPLA_TEST_LIBRARIES SCI::sci_mpi) -else() - find_package(SCALAPACK REQUIRED) - list(APPEND SPLA_TEST_LIBRARIES SCALAPACK::SCALAPACK) -endif() +find_package(SCALAPACK MODULE REQUIRED) +list(APPEND SPLA_TEST_LIBRARIES SCALAPACK::SCALAPACK) if(UNIX AND NOT APPLE) - # on Daint, dl library appears to be required - find_library(SPLA_DL_LIBRARY dl) - if(SPLA_DL_LIBRARY) - list(APPEND SPLA_TEST_LIBRARIES ${SPLA_DL_LIBRARY}) - endif() + # on Daint, dl library appears to be required + find_library(SPLA_DL_LIBRARY dl) + if(SPLA_DL_LIBRARY) + list(APPEND SPLA_TEST_LIBRARIES ${SPLA_DL_LIBRARY}) + endif() endif() add_executable(run_tests programs/run_tests.cpp test_pool_allocator.cpp test_gemm.cpp test_gemm_ssb.cpp test_gemm_sbs.cpp gtest_mpi.cpp) diff --git a/tests/programs/benchmark.cpp b/tests/programs/benchmark.cpp index 9c55bef..d0f89c6 100644 --- a/tests/programs/benchmark.cpp +++ b/tests/programs/benchmark.cpp @@ -119,7 +119,6 @@ int main(int argc, char** argv) { int m = 5; int n = 5; int k = 5; - int numThreads = 6; int blacsBlockSize = 64; int lengthTarget = 256; std::string procName; diff --git a/tests/programs/benchmark_scalapack.cpp b/tests/programs/benchmark_scalapack.cpp index 94ef035..aa07852 100644 --- a/tests/programs/benchmark_scalapack.cpp +++ 
b/tests/programs/benchmark_scalapack.cpp @@ -83,8 +83,8 @@ static void call_pdgemr2d(int m, int n, double* a, int ia, int ja, int* desca, d template void run_gemm(const std::shared_ptr>& allocator, - spla::Context& ctx, int globalRows, int colsA, int colsB, int numThreads, - int blacsBlockSize, int numRepeats) { + spla::Context& ctx, int globalRows, int colsA, int colsB, int blacsBlockSize, + int numRepeats) { int worldRank, worldSize; MPI_Comm_rank(MPI_COMM_WORLD, &worldRank); MPI_Comm_size(MPI_COMM_WORLD, &worldSize); @@ -151,7 +151,6 @@ int main(int argc, char** argv) { int colsA = 5; int colsB = 5; int rows = 5; - int numThreads = 6; int blacsBlockSize = 256; std::string procName; std::string typeName; @@ -189,21 +188,19 @@ int main(int argc, char** argv) { if (ctx.processing_unit() == SPLA_PU_GPU) { #if defined(SPLA_CUDA) || defined(SPLA_ROCM) if (typeName == "scalar") - run_gemm(allocators.pinned(), ctx, rows, colsA, colsB, numThreads, blacsBlockSize, - repeats); + run_gemm(allocators.pinned(), ctx, rows, colsA, colsB, blacsBlockSize, repeats); else - run_gemm>(allocators.pinned(), ctx, rows, colsA, colsB, numThreads, - blacsBlockSize, repeats); + run_gemm>(allocators.pinned(), ctx, rows, colsA, colsB, blacsBlockSize, + repeats); #else throw spla::GPUSupportError(); #endif } else { if (typeName == "scalar") - run_gemm(allocators.host(), ctx, rows, colsA, colsB, numThreads, blacsBlockSize, - repeats); + run_gemm(allocators.host(), ctx, rows, colsA, colsB, blacsBlockSize, repeats); else - run_gemm>(allocators.host(), ctx, rows, colsA, colsB, numThreads, - blacsBlockSize, repeats); + run_gemm>(allocators.host(), ctx, rows, colsA, colsB, blacsBlockSize, + repeats); } int worldRank; diff --git a/tests/test_gemm.cpp b/tests/test_gemm.cpp index 389fb05..11899ac 100644 --- a/tests/test_gemm.cpp +++ b/tests/test_gemm.cpp @@ -69,8 +69,8 @@ class GemmTest Context ctx(SPLA_PU_HOST); // compute reference by calling blas library directly - 
::spla::blas::gemm(::spla::blas::Order::COL_MAJOR, convert_op(opA_), convert_op(opB_), m_, n_, - k_, 2.0, vecA_.data(), lda_, vecB_.data(), ldb_, 3.0, vecCRef_.data(), ldc_); + ::spla::blas::gemm(convert_op(opA_), convert_op(opB_), m_, n_, k_, 2.0, vecA_.data(), lda_, + vecB_.data(), ldb_, 3.0, vecCRef_.data(), ldc_); // compute with public gemm interface gemm(opA_, opB_, m_, n_, k_, 2.0, vecA_.data(), lda_, vecB_.data(), ldb_, 3.0, vecC_.data(), @@ -87,8 +87,8 @@ class GemmTest Context ctx(SPLA_PU_GPU); // compute reference by calling blas library directly - ::spla::blas::gemm(::spla::blas::Order::COL_MAJOR, convert_op(opA_), convert_op(opB_), m_, n_, - k_, 2.0, vecA_.data(), lda_, vecB_.data(), ldb_, 3.0, vecCRef_.data(), ldc_); + ::spla::blas::gemm(convert_op(opA_), convert_op(opB_), m_, n_, k_, 2.0, vecA_.data(), lda_, + vecB_.data(), ldb_, 3.0, vecCRef_.data(), ldc_); // compute with public gemm interface gemm(opA_, opB_, m_, n_, k_, 2.0, vecA_.data(), lda_, vecB_.data(), ldb_, 3.0, vecC_.data(), @@ -104,8 +104,8 @@ class GemmTest Context ctx(SPLA_PU_GPU); // compute reference by calling blas library directly - ::spla::blas::gemm(::spla::blas::Order::COL_MAJOR, convert_op(opA_), convert_op(opB_), m_, n_, - k_, 2.0, vecA_.data(), lda_, vecB_.data(), ldb_, 3.0, vecCRef_.data(), ldc_); + ::spla::blas::gemm(convert_op(opA_), convert_op(opB_), m_, n_, k_, 2.0, vecA_.data(), lda_, + vecB_.data(), ldb_, 3.0, vecCRef_.data(), ldc_); Buffer gpuBufferA(allocators_.gpu(), vecA_.size()); Buffer gpuBufferB(allocators_.gpu(), vecB_.size()); diff --git a/tests/test_gemm_sbs.cpp b/tests/test_gemm_sbs.cpp index 60799e6..31c0564 100644 --- a/tests/test_gemm_sbs.cpp +++ b/tests/test_gemm_sbs.cpp @@ -155,7 +155,6 @@ static auto find_rectangle(int n) -> std::pair { return {n, 1}; } -// numThreads, rowBlockSize, colBlockSize, colsA, colsB, numLocalRows template class GemmSBSTest : public ::testing::TestWithParam< diff --git a/tests/test_gemm_ssb.cpp b/tests/test_gemm_ssb.cpp index 
8bfadaf..287c91f 100644 --- a/tests/test_gemm_ssb.cpp +++ b/tests/test_gemm_ssb.cpp @@ -151,7 +151,6 @@ static auto find_rectangle(int n) -> std::pair { return {n, 1}; } -// numThreads, rowBlockSize, colBlockSize, colsA, colsB, numLocalRows template class GemmSSBTest : public ::testing::TestWithParam<