From 0276fda69e4c9092e4bee9d280a7e803e75eb7da Mon Sep 17 00:00:00 2001 From: Roland Haas Date: Mon, 6 Mar 2017 16:04:35 -0600 Subject: [PATCH 1/2] CarpetInterp2: use thorn Vectors for vectorization --- CarpetInterp2/configuration.ccl | 2 +- CarpetInterp2/interface.ccl | 1 + CarpetInterp2/src/fasterp.cc | 45 +++++++++++++++++---------------- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/CarpetInterp2/configuration.ccl b/CarpetInterp2/configuration.ccl index f365e894e..e321fd7df 100644 --- a/CarpetInterp2/configuration.ccl +++ b/CarpetInterp2/configuration.ccl @@ -1,3 +1,3 @@ # Configuration definitions for thorn CarpetInterp2 -REQUIRES Carpet CarpetLib +REQUIRES Carpet CarpetLib Vectors diff --git a/CarpetInterp2/interface.ccl b/CarpetInterp2/interface.ccl index b790f9c02..deafef93a 100644 --- a/CarpetInterp2/interface.ccl +++ b/CarpetInterp2/interface.ccl @@ -16,6 +16,7 @@ USES INCLUDE HEADER: vect.hh USES INCLUDE HEADER: carpet.hh uses include header: Timer.hh +uses include header: vectors.h # Get access to communicators diff --git a/CarpetInterp2/src/fasterp.cc b/CarpetInterp2/src/fasterp.cc index b66885d6e..fec22a98b 100644 --- a/CarpetInterp2/src/fasterp.cc +++ b/CarpetInterp2/src/fasterp.cc @@ -21,6 +21,7 @@ #include #include "fasterp.hh" +#include "vectors.h" #ifdef CARPETINTERP2_CHECK #define CI2C(x, y) x, y @@ -285,6 +286,14 @@ void fasterp_src_loc_t::interpolate(ivect const &ash, size_t const dj = di * ash[0]; size_t const dk = dj * ash[1]; + // we pad coeffs with zeros so that the extra vector elements don't spoil the + // sum + CCTK_REAL coeffs0[O0 + 1 + CCTK_REAL_VEC_SIZE-1] + CCTK_ATTRIBUTE_ALIGNED(CCTK_REAL_VEC_SIZE * sizeof(CCTK_REAL)) = {0.}; + for (size_t i = 0 ; i <= O0; ++i) { + coeffs0[i] = coeffs[0][i]; + } + #ifdef CARPETINTERP2_CHECK assert(all(ind >= 0 and ind + ivect(O0, O1, O2) < lsh)); assert(ind3d == index(ash, ind)); @@ -292,38 +301,30 @@ void fasterp_src_loc_t::interpolate(ivect const &ash, #endif for (size_t v = 0; v < varptrs.size(); ++v) { - CCTK_REAL tmp = 0.0; + CCTK_REAL_VEC tmp = vec_set1(0.0); const CCTK_REAL *restrict const varptr = &varptrs.AT(v)[ind3d]; for (size_t k = 0; k <= O2; ++k) { CCTK_REAL const coeff_k = O2 == 0 ? 1.0 : coeffs[2][k]; for (size_t j = 0; j <= O1; ++j) { CCTK_REAL const coeff_jk = coeff_k * (O1 == 0 ? 1.0 : coeffs[1][j]); -#ifndef __BIGGEST_ALIGNMENT__ // this is a GCC extension -#define __BIGGEST_ALIGNMENT__ 64 // a guess -#endif - CCTK_REAL buffer[O0 + 1] - __attribute__((__aligned__(__BIGGEST_ALIGNMENT__))); -#ifdef __INTEL_COMPILER -#pragma vector always -#else -#pragma omp simd -#endif - for (size_t i = 0; i <= O0; ++i) { - buffer[i] = varptr[i * di + j * dj + k * dk]; + CCTK_REAL_VEC buffer[(O0 + 1) / CCTK_REAL_VEC_SIZE + 1]; + for (size_t i = 0, ii = 0; i <= O0; i += CCTK_REAL_VEC_SIZE, ++ii) { + buffer[ii] = vec_loadu(varptr[i * di + j * dj + k * dk]); } -#ifdef __INTEL_COMPILER -#pragma vector always -#else -#pragma omp simd reduction(+ : tmp) -#endif - for (size_t i = 0; i <= O0; ++i) { - CCTK_REAL const coeff_ijk = coeff_jk * (O0 == 0 ? 1.0 : coeffs[0][i]); - tmp += coeff_ijk * buffer[i]; + for (size_t i = 0, ii = 0; i <= O0; i += CCTK_REAL_VEC_SIZE, ++ii) { + CCTK_REAL_VEC const coeff_ijk = + coeff_jk * (O0 == 0 ? vec_set1(1.0) : vec_load(coeffs0[i])); + tmp += coeff_ijk * buffer[ii]; } } } - vals[v] = tmp; + // there is no horizontal sum in Vectors + CCTK_REAL tmp2 = 0.; + for(size_t d = 0; d < CCTK_REAL_VEC_SIZE; ++d) { + tmp2 += vec_elt(tmp, d); + } + vals[v] = tmp2; } // for v #ifdef CARPETINTERP2_CHECK From 8deb455b3172bd1c9c63ab9778d53775561dadfb Mon Sep 17 00:00:00 2001 From: Roland Haas Date: Mon, 6 Mar 2017 18:19:43 -0600 Subject: [PATCH 2/2] CarpetInterp2: remove buffer array --- CarpetInterp2/src/fasterp.cc | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/CarpetInterp2/src/fasterp.cc b/CarpetInterp2/src/fasterp.cc index fec22a98b..ee7949467 100644 --- a/CarpetInterp2/src/fasterp.cc +++ b/CarpetInterp2/src/fasterp.cc @@ -288,9 +288,9 @@ void fasterp_src_loc_t::interpolate(ivect const &ash, // we pad coeffs with zeros so that the extra vector elements don't spoil the // sum - CCTK_REAL coeffs0[O0 + 1 + CCTK_REAL_VEC_SIZE-1] - CCTK_ATTRIBUTE_ALIGNED(CCTK_REAL_VEC_SIZE * sizeof(CCTK_REAL)) = {0.}; - for (size_t i = 0 ; i <= O0; ++i) { + CCTK_REAL coeffs0[O0 + 1 + CCTK_REAL_VEC_SIZE - 1] CCTK_ATTRIBUTE_ALIGNED( + CCTK_REAL_VEC_SIZE * sizeof(CCTK_REAL)) = {0.}; + for (size_t i = 0; i <= O0; ++i) { coeffs0[i] = coeffs[0][i]; } @@ -307,21 +307,20 @@ void fasterp_src_loc_t::interpolate(ivect const &ash, for (size_t k = 0; k <= O2; ++k) { CCTK_REAL const coeff_k = O2 == 0 ? 1.0 : coeffs[2][k]; for (size_t j = 0; j <= O1; ++j) { - CCTK_REAL const coeff_jk = coeff_k * (O1 == 0 ? 1.0 : coeffs[1][j]); - CCTK_REAL_VEC buffer[(O0 + 1) / CCTK_REAL_VEC_SIZE + 1]; - for (size_t i = 0, ii = 0; i <= O0; i += CCTK_REAL_VEC_SIZE, ++ii) { - buffer[ii] = vec_loadu(varptr[i * di + j * dj + k * dk]); - } + CCTK_REAL_VEC const coeff_jk = + vec_set1(coeff_k * (O1 == 0 ? 1.0 : coeffs[1][j])); for (size_t i = 0, ii = 0; i <= O0; i += CCTK_REAL_VEC_SIZE, ++ii) { CCTK_REAL_VEC const coeff_ijk = coeff_jk * (O0 == 0 ? vec_set1(1.0) : vec_load(coeffs0[i])); - tmp += coeff_ijk * buffer[ii]; + CCTK_REAL_VEC const buffer = + vec_loadu(varptr[i * di + j * dj + k * dk]); + tmp += coeff_ijk * buffer; } } } // there is no horizontal sum in Vectors CCTK_REAL tmp2 = 0.; - for(size_t d = 0; d < CCTK_REAL_VEC_SIZE; ++d) { + for (size_t d = 0; d < CCTK_REAL_VEC_SIZE; ++d) { tmp2 += vec_elt(tmp, d); } vals[v] = tmp2;