Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vectorize CarpetInterp2's fasterp interpolation #5

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CarpetInterp2/configuration.ccl
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Configuration definitions for thorn CarpetInterp2

REQUIRES Carpet CarpetLib
REQUIRES Carpet CarpetLib Vectors
1 change: 1 addition & 0 deletions CarpetInterp2/interface.ccl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ USES INCLUDE HEADER: vect.hh
USES INCLUDE HEADER: carpet.hh
uses include header: Timer.hh

uses include header: vectors.h


# Get access to communicators
Expand Down
48 changes: 24 additions & 24 deletions CarpetInterp2/src/fasterp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <vect.hh>

#include "fasterp.hh"
#include "vectors.h"

#ifdef CARPETINTERP2_CHECK
#define CI2C(x, y) x, y
Expand Down Expand Up @@ -285,45 +286,44 @@ void fasterp_src_loc_t::interpolate(ivect const &ash,
size_t const dj = di * ash[0];
size_t const dk = dj * ash[1];

// we pad coeffs with zeros so that the extra vector elements don't spoil the
// sum
CCTK_REAL coeffs0[O0 + 1 + CCTK_REAL_VEC_SIZE - 1] CCTK_ATTRIBUTE_ALIGNED(
CCTK_REAL_VEC_SIZE * sizeof(CCTK_REAL)) = {0.};
for (size_t i = 0; i <= O0; ++i) {
coeffs0[i] = coeffs[0][i];
}

#ifdef CARPETINTERP2_CHECK
assert(all(ind >= 0 and ind + ivect(O0, O1, O2) < lsh));
assert(ind3d == index(ash, ind));
assert(int(di * ind[0] + dj * ind[1] + dk * ind[2]) == index(ash, ind));
#endif

for (size_t v = 0; v < varptrs.size(); ++v) {
CCTK_REAL tmp = 0.0;
CCTK_REAL_VEC tmp = vec_set1(0.0);
const CCTK_REAL *restrict const varptr = &varptrs.AT(v)[ind3d];

for (size_t k = 0; k <= O2; ++k) {
CCTK_REAL const coeff_k = O2 == 0 ? 1.0 : coeffs[2][k];
for (size_t j = 0; j <= O1; ++j) {
CCTK_REAL const coeff_jk = coeff_k * (O1 == 0 ? 1.0 : coeffs[1][j]);
#ifndef __BIGGEST_ALIGNMENT__ // this is a GCC extension
#define __BIGGEST_ALIGNMENT__ 64 // a guess
#endif
CCTK_REAL buffer[O0 + 1]
__attribute__((__aligned__(__BIGGEST_ALIGNMENT__)));
#ifdef __INTEL_COMPILER
#pragma vector always
#else
#pragma omp simd
#endif
for (size_t i = 0; i <= O0; ++i) {
buffer[i] = varptr[i * di + j * dj + k * dk];
}
#ifdef __INTEL_COMPILER
#pragma vector always
#else
#pragma omp simd reduction(+ : tmp)
#endif
for (size_t i = 0; i <= O0; ++i) {
CCTK_REAL const coeff_ijk = coeff_jk * (O0 == 0 ? 1.0 : coeffs[0][i]);
tmp += coeff_ijk * buffer[i];
CCTK_REAL_VEC const coeff_jk =
vec_set1(coeff_k * (O1 == 0 ? 1.0 : coeffs[1][j]));
for (size_t i = 0, ii = 0; i <= O0; i += CCTK_REAL_VEC_SIZE, ++ii) {
CCTK_REAL_VEC const coeff_ijk =
coeff_jk * (O0 == 0 ? vec_set1(1.0) : vec_load(coeffs0[i]));
CCTK_REAL_VEC const buffer =
vec_loadu(varptr[i * di + j * dj + k * dk]);
tmp += coeff_ijk * buffer;
}
}
}
vals[v] = tmp;
// there is no horizontal sum in Vectors
CCTK_REAL tmp2 = 0.;
for (size_t d = 0; d < CCTK_REAL_VEC_SIZE; ++d) {
tmp2 += vec_elt(tmp, d);
}
vals[v] = tmp2;
} // for v

#ifdef CARPETINTERP2_CHECK
Expand Down