From a082917d879af44b7fc1992a1b485f9cf8bfd62e Mon Sep 17 00:00:00 2001 From: Malachi Phillips Date: Mon, 1 Aug 2022 16:15:19 -0500 Subject: [PATCH] Additional options for Chebyshev-accelerated Jacobi smoothing. Variant 2: Standard, 1st Kind Chebyshev smoothing with $D^{-1}A$. No restriction on the Chebyshev order. Variant 3: 4th Kind Chebyshev smoother with $D^{-1}A$. No restriction on the Chebyshev order. Variant 4: Optimized 4th Kind Chebyshev smoother with $D^{-1}A$. Chebyshev order must be in [1,16] due to needing to compute the $\beta_k$ coefficients. Variants 3-4 are from https://arxiv.org/pdf/2202.08830.pdf. These correspond to relaxation type 16. In addition, relaxation type 19 has been implemented as a "no-op" smoother. This can be used in the case a user wishes to omit the post smoothing step, for example. --- src/parcsr_ls/HYPRE_parcsr_ls.h | 5 + src/parcsr_ls/par_cheby.c | 517 ++++++++++++++++++++++++++----- src/parcsr_ls/par_cheby_device.c | 430 ++++++++++++++++++++----- src/parcsr_ls/par_cycle.c | 4 + 4 files changed, 805 insertions(+), 151 deletions(-) diff --git a/src/parcsr_ls/HYPRE_parcsr_ls.h b/src/parcsr_ls/HYPRE_parcsr_ls.h index efc325cee1..0ce70ae004 100644 --- a/src/parcsr_ls/HYPRE_parcsr_ls.h +++ b/src/parcsr_ls/HYPRE_parcsr_ls.h @@ -886,6 +886,11 @@ HYPRE_Int HYPRE_BoomerAMGSetChebyScale (HYPRE_Solver solver, /** * (Optional) Defines which polynomial variant should be used. * The default is 0 (i.e., scaled). + * If variant=1, the polynomial is defined as: + * T(t)* f(t) where f(t) = (1-b/t) + * If variant=2, use standard Chebyshev polynomial with $D^{-1}A$, not $D^{-1/2}AD^{-1/2}$. + * If variant=3, use 4th-kind Chebyshev polynomial from https://arxiv.org/abs/2202.08830. + * If variant=4, use optimized 4th-kind Chebyshev polynomial from https://arxiv.org/abs/2202.08830. **/ HYPRE_Int HYPRE_BoomerAMGSetChebyVariant (HYPRE_Solver solver, HYPRE_Int variant); diff --git a/src/parcsr_ls/par_cheby.c b/src/parcsr_ls/par_cheby.c index 626ee69f69..9d0b3c8bbd 100644 --- a/src/parcsr_ls/par_cheby.c +++ b/src/parcsr_ls/par_cheby.c @@ -32,12 +32,205 @@ this is rlx 11 if scale = 0, and 16 if scale == 1 variant 1: modified cheby: T(t)* f(t) where f(t) = (1-b/t) this is rlx 15 if scale = 0, and 17 if scale == 1 +variant=2: standard chebyshev, with $D^{-1}A$ instead of $D^{-1/2}AD^{-1/2}$ + +variant=3: use 4th-kind Chebyshev polynomial from https://arxiv.org/abs/2202.08830. + +variant=4: use optimized 4th-kind Chebyshev polynomial from https://arxiv.org/abs/2202.08830. + ratio indicates the percentage of the whole spectrum to use (so .5 means half, and .1 means 10percent) *******************************************************************************/ +static void optimalWeightsImpl (HYPRE_Real* betas, const HYPRE_Int order) +{ + if (order == 1){ + betas[0] = 1.12500000000000; + } + + if (order == 2){ + betas[0] = 1.02387287570313; + betas[1] = 1.26408905371085; + } + + if (order == 3){ + betas[0] = 1.00842544782028; + betas[1] = 1.08867839208730; + betas[2] = 1.33753125909618; + } + + if (order == 4){ + betas[0] = 1.00391310427285; + betas[1] = 1.04035811188593; + betas[2] = 1.14863498546254; + betas[3] = 1.38268869241000; + } + + if (order == 5){ + betas[0] = 1.00212930146164; + betas[1] = 1.02173711549260; + betas[2] = 1.07872433192603; + betas[3] = 1.19810065292663; + betas[4] = 1.41322542791682; + } + + if (order == 6){ + betas[0] = 1.00128517255940; + betas[1] = 1.01304293035233; + betas[2] = 1.04678215124113; + betas[3] = 1.11616489419675; + betas[4] = 1.23829020218444; + betas[5] = 1.43524297106744; + } + + if (order == 7){ + betas[0] = 1.00083464397912; + betas[1] = 1.00843949430122; + betas[2] = 1.03008707768713; + betas[3] = 1.07408384092003; + betas[4] = 1.15036186707366; + betas[5] = 1.27116474046139; + betas[6] = 1.45186658649364; + } + + if (order == 8){ + betas[0] = 1.00057246631197; + betas[1] = 1.00577427662415; + betas[2] = 1.02050187922941; + betas[3] = 1.05019803444565; + betas[4] = 1.10115572984941; + betas[5] = 1.18086042806856; + betas[6] = 1.29838585382576; + betas[7] = 1.46486073151099; + } + + if (order == 9){ + betas[0] = 1.00040960072832; + betas[1] = 1.00412439506106; + betas[2] = 1.01460212148266; + betas[3] = 1.03561113626671; + betas[4] = 1.07139972529194; + betas[5] = 1.12688273710962; + betas[6] = 1.20785219140729; + betas[7] = 1.32121930716746; + betas[8] = 1.47529642820699; + } + + if (order == 10){ + betas[0] = 1.00030312229652; + betas[1] = 1.00304840660796; + betas[2] = 1.01077022715387; + betas[3] = 1.02619011597640; + betas[4] = 1.05231724933755; + betas[5] = 1.09255743207549; + betas[6] = 1.15083376663972; + betas[7] = 1.23172250870894; + betas[8] = 1.34060802024460; + betas[9] = 1.48386124407011; + } + + if (order == 11){ + betas[0] = 1.00023058595209; + betas[1] = 1.00231675024028; + betas[2] = 1.00817245396304; + betas[3] = 1.01982986566342; + betas[4] = 1.03950210235324; + betas[5] = 1.06965042700541; + betas[6] = 1.11305754295742; + betas[7] = 1.17290876275564; + betas[8] = 1.25288300576792; + betas[9] = 1.35725579919519; + betas[10] = 1.49101672564139; + } + + if (order == 12){ + betas[0] = 1.00017947200828; + betas[1] = 1.00180189139619; + betas[2] = 1.00634861907307; + betas[3] = 1.01537864566306; + betas[4] = 1.03056942830760; + betas[5] = 1.05376019693943; + betas[6] = 1.08699862592072; + betas[7] = 1.13259183097913; + betas[8] = 1.19316273358172; + betas[9] = 1.27171293675110; + betas[10] = 1.37169337969799; + betas[11] = 1.49708418575562; + } + + if (order == 13){ + betas[0] = 1.00014241921559; + betas[1] = 1.00142906932629; + betas[2] = 1.00503028986298; + betas[3] = 1.01216910518495; + betas[4] = 1.02414874342792; + betas[5] = 1.04238158880820; + betas[6] = 1.06842008128700; + betas[7] = 1.10399010936759; + betas[8] = 1.15102748242645; + betas[9] = 1.21171811910125; + betas[10] = 1.28854264865128; + betas[11] = 1.38432619380991; + betas[12] = 1.50229418757368; + } + + if (order == 14){ + betas[0] = 1.00011490538261; + betas[1] = 1.00115246376914; + betas[2] = 1.00405357333264; + betas[3] = 1.00979590573153; + betas[4] = 1.01941300472994; + betas[5] = 1.03401425035436; + betas[6] = 1.05480599606629; + betas[7] = 1.08311420301813; + betas[8] = 1.12040891660892; + betas[9] = 1.16833095655446; + betas[10] = 1.22872122288238; + betas[11] = 1.30365305707817; + betas[12] = 1.39546814053678; + betas[13] = 1.50681646209583; + } + + if (order == 15){ + betas[0] = 1.00009404750752; + betas[1] = 1.00094291696343; + betas[2] = 1.00331449056444; + betas[3] = 1.00800294833816; + betas[4] = 1.01584236259140; + betas[5] = 1.02772083317705; + betas[6] = 1.04459535422831; + betas[7] = 1.06750761206125; + betas[8] = 1.09760092545889; + betas[9] = 1.13613855366157; + betas[10] = 1.18452361426236; + betas[11] = 1.24432087304475; + betas[12] = 1.31728069083392; + betas[13] = 1.40536543893560; + betas[14] = 1.51077872501845; + } + + if (order == 16){ + betas[0] = 1.00007794828179; + betas[1] = 1.00078126847253; + betas[2] = 1.00274487974401; + betas[3] = 1.00662291017015; + betas[4] = 1.01309858836971; + betas[5] = 1.02289448329337; + betas[6] = 1.03678321409983; + betas[7] = 1.05559875719896; + betas[8] = 1.08024848405560; + betas[9] = 1.11172607131497; + betas[10] = 1.15112543431072; + betas[11] = 1.19965584614973; + betas[12] = 1.25865841744946; + betas[13] = 1.32962412656664; + betas[14] = 1.41421360695576; + betas[15] = 1.51427891730346; + } +} + /** * @brief Setups of coefficients (and optional diagonal scaling elements) for * Chebyshev relaxation @@ -75,14 +268,20 @@ hypre_ParCSRRelax_Cheby_Setup(hypre_ParCSRMatrix *A, /* matrix to relax HYPRE_Real *ds_data = NULL; /* u = u + p(A)r */ - if (order > 4) + if (order < 1) { - order = 4; + order = 1; } - if (order < 1) + HYPRE_Int maxOrder = 4; + if(variant == 2 || variant == 3){ + maxOrder = INT_MAX; + } + if(variant == 4) maxOrder = 16; // due to writing out coefficients + + if (order > maxOrder) { - order = 1; + order = maxOrder; } coefs = hypre_CTAlloc(HYPRE_Real, order + 1, HYPRE_MEMORY_HOST); @@ -154,7 +353,7 @@ hypre_ParCSRRelax_Cheby_Setup(hypre_ParCSRMatrix *A, /* matrix to relax } } - else /* standard chebyshev */ + else if (variant == 0) /* standard chebyshev */ { switch (cheby_order) /* these are the corresponding cheby polynomials: u = u_o + s(A)r_0 - so order is @@ -192,13 +391,40 @@ hypre_ParCSRRelax_Cheby_Setup(hypre_ParCSRMatrix *A, /* matrix to relax break; } } + else { + + // for variants 3, 4 allocate additional space for coefficients + HYPRE_Int nCoeffs = 2; + if(variant == 3 || variant == 4){ + nCoeffs += order; + } + + hypre_TFree(coefs, HYPRE_MEMORY_HOST); + coefs = hypre_CTAlloc(HYPRE_Real, nCoeffs, HYPRE_MEMORY_HOST); + coefs[0] = upper_bound; + coefs[1] = lower_bound; // not actually used for variants 3, 4 + + if(variant == 4){ + optimalWeightsImpl(coefs + 2, order); + } + if(variant == 3){ + for(HYPRE_Int i = 0; i < order; ++i){ + coefs[2 + i] = 1.0; + } + } + } *coefs_ptr = coefs; if (scale) { /*grab 1/hypre_sqrt(abs(diagonal)) */ ds_data = hypre_CTAlloc(HYPRE_Real, num_rows, hypre_ParCSRMatrixMemoryLocation(A)); - hypre_CSRMatrixExtractDiagonal(hypre_ParCSRMatrixDiag(A), ds_data, 4); + if(variant == 0 || variant == 1){ + hypre_CSRMatrixExtractDiagonal(hypre_ParCSRMatrixDiag(A), ds_data, 4); + } else { + // 1/diagonal + hypre_CSRMatrixExtractDiagonal(hypre_ParCSRMatrixDiag(A), ds_data, 2); + } } /* end of scaling code */ *ds_ptr = ds_data; @@ -254,130 +480,259 @@ hypre_ParCSRRelax_Cheby_SolveHost(hypre_ParCSRMatrix *A, /* matrix to relax with /* u = u + p(A)r */ - - if (order > 4) - { - order = 4; - } if (order < 1) { order = 1; } + HYPRE_Int maxOrder = 4; + if(variant == 2 || variant == 3){ + maxOrder = 9999; + } + if(variant == 4) maxOrder = 16; // due to writing out coefficients + + if (order > maxOrder) + { + order = maxOrder; + } + /* we are using the order of p(A) */ cheby_order = order - 1; hypre_assert(hypre_VectorSize(hypre_ParVectorLocalVector(orig_u_vec)) >= num_rows); orig_u = hypre_VectorData(hypre_ParVectorLocalVector(orig_u_vec)); - if (!scale) - { - /* get residual: r = f - A*u */ - hypre_ParVectorCopy(f, r); - hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r); + if (variant == 0 || variant == 1){ + if (!scale) + { + /* get residual: r = f - A*u */ + hypre_ParVectorCopy(f, r); + hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r); + + /* o = u; u = r .* coef */ + for ( i = 0; i < num_rows; i++ ) + { + orig_u[i] = u_data[i]; + u_data[i] = r_data[i] * coefs[cheby_order]; + } + for (i = cheby_order - 1; i >= 0; i-- ) + { + hypre_ParCSRMatrixMatvec(1.0, A, u, 0.0, v); + mult = coefs[i]; + /* u = mult * r + v */ +#ifdef HYPRE_USING_OPENMP + #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + u_data[j] = mult * r_data[j] + v_data[j]; + } + } - /* o = u; u = r .* coef */ - for ( i = 0; i < num_rows; i++ ) - { - orig_u[i] = u_data[i]; - u_data[i] = r_data[i] * coefs[cheby_order]; - } - for (i = cheby_order - 1; i >= 0; i-- ) - { - hypre_ParCSRMatrixMatvec(1.0, A, u, 0.0, v); - mult = coefs[i]; - /* u = mult * r + v */ + /* u = o + u */ #ifdef HYPRE_USING_OPENMP - #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE + #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE #endif - for ( j = 0; j < num_rows; j++ ) - { - u_data[j] = mult * r_data[j] + v_data[j]; - } - } + for ( i = 0; i < num_rows; i++ ) + { + u_data[i] = orig_u[i] + u_data[i]; + } + } + else /* scaling! */ + { + + /*grab 1/sqrt(diagonal) */ + tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec)); + + /* get ds_data and get scaled residual: r = D^(-1/2)f - + * D^(-1/2)A*u */ + + hypre_ParCSRMatrixMatvec(-1.0, A, u, 0.0, tmp_vec); + /* r = ds .* (f + tmp) */ +#ifdef HYPRE_USING_OPENMP + #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + r_data[j] = ds_data[j] * (f_data[j] + tmp_data[j]); + } + + /* save original u, then start + the iteration by multiplying r by the cheby coef.*/ - /* u = o + u */ + /* o = u; u = r * coef */ #ifdef HYPRE_USING_OPENMP - #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE + #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE #endif - for ( i = 0; i < num_rows; i++ ) - { - u_data[i] = orig_u[i] + u_data[i]; - } + for ( j = 0; j < num_rows; j++ ) + { + orig_u[j] = u_data[j]; /* orig, unscaled u */ + + u_data[j] = r_data[j] * coefs[cheby_order]; + } + + /* now do the other coefficients */ + for (i = cheby_order - 1; i >= 0; i-- ) + { + /* v = D^(-1/2)AD^(-1/2)u */ + /* tmp = ds .* u */ +#ifdef HYPRE_USING_OPENMP + #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + tmp_data[j] = ds_data[j] * u_data[j]; + } + hypre_ParCSRMatrixMatvec(1.0, A, tmp_vec, 0.0, v); + + /* u_new = coef*r + v*/ + mult = coefs[i]; + + /* u = coef * r + ds .* v */ +#ifdef HYPRE_USING_OPENMP + #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + u_data[j] = mult * r_data[j] + ds_data[j] * v_data[j]; + } + + } /* end of cheby_order loop */ + + /* now we have to scale u_data before adding it to u_orig*/ + + /* u = orig_u + ds .* u */ +#ifdef HYPRE_USING_OPENMP + #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + u_data[j] = orig_u[j] + ds_data[j] * u_data[j]; + } + + }/* end of scaling code */ } - else /* scaling! */ - { + else if (variant == 2){ + const HYPRE_Real lambda_max = coefs[0]; + const HYPRE_Real lambda_min = coefs[1]; + + const HYPRE_Real theta = 0.5 * (lambda_max + lambda_min); + const HYPRE_Real delta = 0.5 * (lambda_max - lambda_min); + const HYPRE_Real sigma = theta / delta; + const HYPRE_Real invTheta = 1.0 / theta; + HYPRE_Real rho = 1.0 / sigma; - /*grab 1/hypre_sqrt(diagonal) */ tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec)); - /* get ds_data and get scaled residual: r = D^(-1/2)f - - * D^(-1/2)A*u */ + // r := f - A*u + hypre_ParVectorCopy(f, r); + hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r); + + // r = D^{-1} r + // v := 1/theta r +#ifdef HYPRE_USING_OPENMP + #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + const HYPRE_Real r = ds_data[j] * r_data[j]; + r_data[j] = r; + v_data[j] = r * invTheta; + } + + for(int i = 0; i < cheby_order; ++i){ + // tmp = Av + hypre_ParCSRMatrixMatvec(1.0, A, v, 0.0, tmp_vec); + + const HYPRE_Real rhoSave = rho; + rho = 1.0 / (2 * sigma - rho); + + const HYPRE_Real vcoef = rho * rhoSave; + const HYPRE_Real rcoef = 2.0 * rho / delta; + +#ifdef HYPRE_USING_OPENMP + #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE +#endif + for ( j = 0; j < num_rows; j++ ) + { + const HYPRE_Real v = v_data[j]; + u_data[j] += v; + + const HYPRE_Real r = r_data[j] - ds_data[j] * tmp_data[j]; + r_data[j] = r; + v_data[j] = vcoef * v + rcoef * r; + } + } - hypre_ParCSRMatrixMatvec(-1.0, A, u, 0.0, tmp_vec); - /* r = ds .* (f + tmp) */ + // u += v; #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE #endif for ( j = 0; j < num_rows; j++ ) { - r_data[j] = ds_data[j] * (f_data[j] + tmp_data[j]); + u_data[j] += v_data[j]; } - /* save original u, then start - the iteration by multiplying r by the cheby coef.*/ + } + else if(variant == 3 || variant == 4) + { + const HYPRE_Real lambda_max = coefs[0]; + const HYPRE_Int coeffOffset = 2; + + tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec)); + + // r := f - A*u + hypre_ParVectorCopy(f, r); + hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r); - /* o = u; u = r * coef */ + // v := \dfrac{4}{3} \dfrac{1}{\rho(D^{-1}A)} D^{-1} r + HYPRE_Real coef = 4.0 / (3.0 * lambda_max); #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE #endif for ( j = 0; j < num_rows; j++ ) { - orig_u[j] = u_data[j]; /* orig, unscaled u */ - - u_data[j] = r_data[j] * coefs[cheby_order]; + v_data[j] = coef * ds_data[j] * r_data[j]; } + + for(int i = 0; i < cheby_order; ++i){ + const HYPRE_Real beta_i = coefs[coeffOffset + i]; + // r = r - Av + hypre_ParCSRMatrixMatvec(-1.0, A, v, 1.0, r); + + // + 2 offset is due to two issues: + // + 1 is from https://arxiv.org/pdf/2202.08830.pdf being written in 1-based indexing + // + 1 is from pre-computing z_1 _outside_ of the loop + // u += \beta_i v + // v = \dfrac{(2i-3)}{(2i+1)} v + \dfrac{(8i-4)}{(2i+1)} \dfrac{1}{\rho(SA)} S r + const HYPRE_Int id = i + 2; + const HYPRE_Real vScale = (2.0 * id - 3.0) / (2.0 * id + 1.0); + const HYPRE_Real rScale = (8.0 * id - 4.0) / ((2.0 * id + 1.0) * lambda_max); - /* now do the other coefficients */ - for (i = cheby_order - 1; i >= 0; i-- ) - { - /* v = D^(-1/2)AD^(-1/2)u */ - /* tmp = ds .* u */ -#ifdef HYPRE_USING_OPENMP - #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE -#endif - for ( j = 0; j < num_rows; j++ ) - { - tmp_data[j] = ds_data[j] * u_data[j]; - } - hypre_ParCSRMatrixMatvec(1.0, A, tmp_vec, 0.0, v); - - /* u_new = coef*r + v*/ - mult = coefs[i]; - - /* u = coef * r + ds .* v */ #ifdef HYPRE_USING_OPENMP - #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE + #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE #endif - for ( j = 0; j < num_rows; j++ ) - { - u_data[j] = mult * r_data[j] + ds_data[j] * v_data[j]; - } + for ( j = 0; j < num_rows; j++ ) + { + const HYPRE_Real v = v_data[j]; + u_data[j] += beta_i * v; + v_data[j] = vScale * v + rScale * ds_data[j] * r_data[j]; + } - } /* end of cheby_order loop */ + } - /* now we have to scale u_data before adding it to u_orig*/ + const HYPRE_Real beta_order = coefs[coeffOffset + cheby_order]; - /* u = orig_u + ds .* u */ + // u += \beta v; #ifdef HYPRE_USING_OPENMP #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE #endif for ( j = 0; j < num_rows; j++ ) { - u_data[j] = orig_u[j] + ds_data[j] * u_data[j]; + u_data[j] += beta_order * v_data[j]; } - }/* end of scaling code */ + } return hypre_error_flag; } diff --git a/src/parcsr_ls/par_cheby_device.c b/src/parcsr_ls/par_cheby_device.c index e2ab368462..af7df44e12 100644 --- a/src/parcsr_ls/par_cheby_device.c +++ b/src/parcsr_ls/par_cheby_device.c @@ -17,6 +17,7 @@ #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) #include "_hypre_utilities.hpp" +#include /** * @brief waxpyz @@ -97,6 +98,171 @@ struct xpyz } }; +/** + * @brief scale + * + * Performs + * x = d .* x + * For scalars x, d + */ +template +struct scaleInPlace +{ + typedef thrust::tuple Tuple; + + __host__ __device__ void operator()(Tuple t) + { + thrust::get<1>(t) = thrust::get<0>(t) * thrust::get<1>(t); + } +}; + +/** + * @brief add + * + * Performs + * x = x + coef * y + * For scalars x, d + */ +template +struct add +{ + typedef thrust::tuple Tuple; + + const T coef; + add(T _coef = 1.0) : coef(_coef) {} + + __host__ __device__ void operator()(Tuple t) + { + thrust::get<1>(t) = coef * thrust::get<0>(t) + thrust::get<1>(t); + } +}; + +/** + * @brief add + * + * Performs + * x = x + coef * d.*y + * For scalars x, d + */ +template +struct scaledAdd +{ + typedef thrust::tuple Tuple; + const T scale; + scaledAdd(T _scale) : scale(_scale) {} + + __host__ __device__ void operator()(Tuple t) + { + thrust::get<2>(t) = thrust::get<2>(t) + scale * thrust::get<0>(t) * thrust::get<1>(t); + } +}; + +/** + * @brief add + * + * Performs + * r = r - D .* tmp + * v = coef0 * r + coef1 * v + * + */ +template +struct updateRAndV +{ + typedef thrust::tuple Tuple; + const T coef0; + const T coef1; + updateRAndV(T _coef0, T _coef1) : coef0(_coef0), coef1(_coef1) {} + + __host__ __device__ void operator()(Tuple t) + { + thrust::get<2>(t) = thrust::get<2>(t) - thrust::get<0>(t) * thrust::get<1>(t); + thrust::get<3>(t) = coef0 * thrust::get<2>(t) + coef1 * thrust::get<3>(t); + } +}; + +/** + * @brief scale + * + * Performs + * y = coef * d .* x + * For scalars x, d, y + */ +template +struct applySmoother +{ + typedef thrust::tuple Tuple; + + const T coef; + applySmoother(T _coef = 1.0) : coef(_coef) {} + + __host__ __device__ void operator()(Tuple t) + { + thrust::get<2>(t) = coef * thrust::get<0>(t) * thrust::get<1>(t); + } +}; + +/** + * @brief waxpyz + * + * Performs + * y = a * x + * constant a + */ +template +struct scaleConstant +{ + typedef thrust::tuple Tuple; + + const T scale; + scaleConstant(T _scale) : scale(_scale) {} + + __host__ __device__ void operator()(Tuple t) + { + thrust::get<1>(t) = scale * thrust::get<0>(t); + } +}; + +/** + * @brief update + * + * Performs + * d = scale0 * r + scale1 * d + */ +template +struct update +{ + typedef thrust::tuple Tuple; + + const T scale0; + const T scale1; + update(T _scale0, T _scale1) : scale0(_scale0), scale1(_scale1) {} + + __host__ __device__ void operator()(Tuple t) + { + thrust::get<1>(t) = scale1 * thrust::get<1>(t) + scale0 * thrust::get<0>(t); + } +}; + +/** + * @brief updateCol + * + * Performs + * d = scale0 * x.*r + scale1 * d + */ +template +struct updateCol +{ + typedef thrust::tuple Tuple; + + const T scale0; + const T scale1; + updateCol(T _scale0, T _scale1) : scale0(_scale0), scale1(_scale1) {} + + __host__ __device__ void operator()(Tuple t) + { + thrust::get<2>(t) = scale1 * thrust::get<2>(t) + scale0 * thrust::get<1>(t) * thrust::get<0>(t); + } +}; /** * @brief Solve using a chebyshev polynomial on the device * @@ -142,9 +308,21 @@ hypre_ParCSRRelax_Cheby_SolveDevice(hypre_ParCSRMatrix *A, /* matrix to relax wi HYPRE_Real *tmp_data; /* u = u + p(A)r */ + if (order < 1) + { + order = 1; + } + + HYPRE_Int maxOrder = 4; + if(variant == 2 || variant == 3){ + maxOrder = 9999; + } + if(variant == 4) maxOrder = 16; // due to writing out coefficients - if (order > 4) { order = 4; } - if (order < 1) { order = 1; } + if (order > maxOrder) + { + order = maxOrder; + } /* we are using the order of p(A) */ cheby_order = order - 1; @@ -152,92 +330,204 @@ hypre_ParCSRRelax_Cheby_SolveDevice(hypre_ParCSRMatrix *A, /* matrix to relax wi hypre_assert(hypre_VectorSize(hypre_ParVectorLocalVector(orig_u_vec)) >= num_rows); HYPRE_Real *orig_u = hypre_VectorData(hypre_ParVectorLocalVector(orig_u_vec)); - if (!scale) - { - /* get residual: r = f - A*u */ - hypre_ParVectorCopy(f, r); - hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r); - - /* o = u; u = r .* coef */ - HYPRE_THRUST_CALL( - for_each, - thrust::make_zip_iterator(thrust::make_tuple(orig_u, u_data, r_data)), - thrust::make_zip_iterator(thrust::make_tuple(orig_u + num_rows, u_data + num_rows, - r_data + num_rows)), - save_and_scale(coefs[cheby_order])); - - for (i = cheby_order - 1; i >= 0; i--) - { - hypre_ParCSRMatrixMatvec(1.0, A, u, 0.0, v); - mult = coefs[i]; - - /* u = mult * r + v */ - hypreDevice_ComplexAxpyn( r_data, num_rows, v_data, u_data, mult ); - } - - /* u = o + u */ - hypreDevice_ComplexAxpyn( orig_u, num_rows, u_data, u_data, 1.0); + if(variant == 0 || variant == 1){ + + if (!scale) + { + /* get residual: r = f - A*u */ + hypre_ParVectorCopy(f, r); + hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r); + + /* o = u; u = r .* coef */ + HYPRE_THRUST_CALL( + for_each, + thrust::make_zip_iterator(thrust::make_tuple(orig_u, u_data, r_data)), + thrust::make_zip_iterator(thrust::make_tuple(orig_u + num_rows, u_data + num_rows, + r_data + num_rows)), + save_and_scale(coefs[cheby_order])); + + for (i = cheby_order - 1; i >= 0; i--) + { + hypre_ParCSRMatrixMatvec(1.0, A, u, 0.0, v); + mult = coefs[i]; + + /* u = mult * r + v */ + hypreDevice_ComplexAxpyn( r_data, num_rows, v_data, u_data, mult ); + } + + /* u = o + u */ + hypreDevice_ComplexAxpyn( orig_u, num_rows, u_data, u_data, 1.0); + } + else /* scaling! */ + { + + /*grab 1/sqrt(diagonal) */ + + tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec)); + + /* get ds_data and get scaled residual: r = D^(-1/2)f - + * D^(-1/2)A*u */ + + hypre_ParCSRMatrixMatvec(-1.0, A, u, 0.0, tmp_vec); + /* r = ds .* (f + tmp) */ + + /* TODO: It might be possible to merge this and the next call to: + * r[j] = ds_data[j] * (f_data[j] + tmp_data[j]); o[j] = u[j]; u[j] = r[j] * coef */ + HYPRE_THRUST_CALL(for_each, + thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, f_data, tmp_data)), + thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, f_data, tmp_data)) + num_rows, + wxypz()); + + /* save original u, then start + the iteration by multiplying r by the cheby coef.*/ + + /* o = u; u = r * coef */ + HYPRE_THRUST_CALL(for_each, + thrust::make_zip_iterator(thrust::make_tuple(orig_u, u_data, r_data)), + thrust::make_zip_iterator(thrust::make_tuple(orig_u, u_data, r_data)) + num_rows, + save_and_scale(coefs[cheby_order])); + + /* now do the other coefficients */ + for (i = cheby_order - 1; i >= 0; i--) + { + /* v = D^(-1/2)AD^(-1/2)u */ + /* tmp = ds .* u */ + HYPRE_THRUST_CALL( transform, ds_data, ds_data + num_rows, u_data, tmp_data, _1 * _2 ); + + hypre_ParCSRMatrixMatvec(1.0, A, tmp_vec, 0.0, v); + + /* u_new = coef*r + v*/ + mult = coefs[i]; + + /* u = coef * r + ds .* v */ + HYPRE_THRUST_CALL(for_each, + thrust::make_zip_iterator(thrust::make_tuple(u_data, r_data, ds_data, v_data)), + thrust::make_zip_iterator(thrust::make_tuple(u_data, r_data, ds_data, v_data)) + num_rows, + waxpyz(mult)); + } /* end of cheby_order loop */ + + /* now we have to scale u_data before adding it to u_orig*/ + + /* u = orig_u + ds .* u */ + HYPRE_THRUST_CALL( + for_each, + thrust::make_zip_iterator(thrust::make_tuple(u_data, orig_u, ds_data)), + thrust::make_zip_iterator(thrust::make_tuple(u_data + num_rows, orig_u + num_rows, + ds_data + num_rows)), + xpyz()); + + + } /* end of scaling code */ } - else /* scaling! */ + else if(variant == 2) { + const auto lambda_max = coefs[0]; + const auto lambda_min = coefs[1]; - /*grab 1/hypre_sqrt(diagonal) */ + const auto theta = 0.5 * (lambda_max + lambda_min); + const auto delta = 0.5 * (lambda_max - lambda_min); + const auto sigma = theta / delta; + auto rho = 1.0 / sigma; tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec)); - /* get ds_data and get scaled residual: r = D^(-1/2)f - - * D^(-1/2)A*u */ + // r := f - A*u + hypre_ParVectorCopy(f, r); + hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r); - hypre_ParCSRMatrixMatvec(-1.0, A, u, 0.0, tmp_vec); - /* r = ds .* (f + tmp) */ + // TODO: consolidate two calls below - /* TODO: It might be possible to merge this and the next call to: - * r[j] = ds_data[j] * (f_data[j] + tmp_data[j]); o[j] = u[j]; u[j] = r[j] * coef */ + // r = D^{-1} r HYPRE_THRUST_CALL(for_each, - thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, f_data, tmp_data)), - thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, f_data, tmp_data)) + num_rows, - wxypz()); - - /* save original u, then start - the iteration by multiplying r by the cheby coef.*/ - - /* o = u; u = r * coef */ + thrust::make_zip_iterator(thrust::make_tuple(ds_data, r_data)), + thrust::make_zip_iterator(thrust::make_tuple(ds_data, r_data)) + num_rows, + scaleInPlace()); + + // v := 1/theta r HYPRE_THRUST_CALL(for_each, - thrust::make_zip_iterator(thrust::make_tuple(orig_u, u_data, r_data)), - thrust::make_zip_iterator(thrust::make_tuple(orig_u, u_data, r_data)) + num_rows, - save_and_scale(coefs[cheby_order])); + thrust::make_zip_iterator(thrust::make_tuple(r_data, v_data)), + thrust::make_zip_iterator(thrust::make_tuple(r_data, v_data)) + num_rows, + scaleConstant(1.0 / theta)); + + for(int i = 0; i < cheby_order; ++i){ + // u += v + HYPRE_THRUST_CALL(for_each, + thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)), + thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)) + num_rows, + add()); + // tmp = Av + hypre_ParCSRMatrixMatvec(1.0, A, v, 0.0, tmp_vec); + + const auto rhoSave = rho; + rho = 1.0 / (2 * sigma - rho); + + const auto vcoef = rho * rhoSave; + const auto rcoef = 2.0 * rho / delta; + + // r = r - D^{-1} Av + // v = rho_{k+1} rho_k * v + 2 rho_{k+1} / delta r + HYPRE_THRUST_CALL(for_each, + thrust::make_zip_iterator(thrust::make_tuple(ds_data, tmp_data, r_data, v_data)), + thrust::make_zip_iterator(thrust::make_tuple(ds_data, tmp_data, r_data, v_data)) + num_rows, + updateRAndV(rcoef, vcoef)); + } - /* now do the other coefficients */ - for (i = cheby_order - 1; i >= 0; i--) - { - /* v = D^(-1/2)AD^(-1/2)u */ - /* tmp = ds .* u */ - HYPRE_THRUST_CALL( transform, ds_data, ds_data + num_rows, u_data, tmp_data, _1 * _2 ); + // u += v; + HYPRE_THRUST_CALL(for_each, + thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)), + thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)) + num_rows, + add()); - hypre_ParCSRMatrixMatvec(1.0, A, tmp_vec, 0.0, v); + } + else if(variant == 3 || variant == 4) + { + const auto lambda_max = coefs[0]; + const auto coeffOffset = 2; - /* u_new = coef*r + v*/ - mult = coefs[i]; + tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec)); - /* u = coef * r + ds .* v */ - HYPRE_THRUST_CALL(for_each, - thrust::make_zip_iterator(thrust::make_tuple(u_data, r_data, ds_data, v_data)), - thrust::make_zip_iterator(thrust::make_tuple(u_data, r_data, ds_data, v_data)) + num_rows, - waxpyz(mult)); - } /* end of cheby_order loop */ + // r := f - A*u + hypre_ParVectorCopy(f, r); + hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r); - /* now we have to scale u_data before adding it to u_orig*/ + // v := \dfrac{4}{3} \dfrac{1}{\rho(D^{-1}A)} D^{-1} r + HYPRE_THRUST_CALL(for_each, + thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, v_data)), + thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, v_data)) + num_rows, + applySmoother(4.0 / 3.0 / lambda_max)); + + for(int i = 0; i < cheby_order; ++i){ + // u += \beta_k v + // since this is _not_ the optimized variant, \beta := 1.0 + HYPRE_THRUST_CALL(for_each, + thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)), + thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)) + num_rows, + add(coefs[coeffOffset + i])); + // r = r - Av + hypre_ParCSRMatrixMatvec(-1.0, A, v, 1.0, r); + + // + 2 offset is due to two issues: + // + 1 is from https://arxiv.org/pdf/2202.08830.pdf being written in 1-based indexing + // + 1 is from pre-computing z_1 _outside_ of the loop + // v = \dfrac{(2i-3)}{(2i+1)} v + \dfrac{(8i-4)}{(2i+1)} \dfrac{1}{\rho(SA)} S r + const auto id = i + 2; + const auto vScale = (2.0 * id - 3.0) / (2.0 * id + 1.0); + const auto rScale = (8.0 * id - 4.0) / (2.0 * id + 1.0) / lambda_max; + + HYPRE_THRUST_CALL(for_each, + thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, v_data)), + thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, v_data)) + num_rows, + updateCol(rScale, vScale)); - /* u = orig_u + ds .* u */ - HYPRE_THRUST_CALL( - for_each, - thrust::make_zip_iterator(thrust::make_tuple(u_data, orig_u, ds_data)), - thrust::make_zip_iterator(thrust::make_tuple(u_data + num_rows, orig_u + num_rows, - ds_data + num_rows)), - xpyz()); + } + // u += \beta v; + HYPRE_THRUST_CALL(for_each, + thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)), + thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)) + num_rows, + add(coefs[coeffOffset + cheby_order])); - } /* end of scaling code */ + } return hypre_error_flag; } diff --git a/src/parcsr_ls/par_cycle.c b/src/parcsr_ls/par_cycle.c index 9a8f79a96a..4baf601a24 100644 --- a/src/parcsr_ls/par_cycle.c +++ b/src/parcsr_ls/par_cycle.c @@ -549,6 +549,10 @@ hypre_BoomerAMGCycle( void *amg_vdata, cheby_order, scale, variant, Aux_U, Vtemp, Ztemp, Ptemp, Rtemp ); } + else if (relax_type == 19) + { + // no-op + } else if (relax_type == 17) { if (level == num_levels - 1)