Skip to content

Commit

Permalink
Merge branch 'develop' into feature_incompressible
Browse files Browse the repository at this point in the history
  • Loading branch information
Cristopher-Morales authored Dec 8, 2023
2 parents 1f26c1e + 2c9fbb6 commit f697ab8
Show file tree
Hide file tree
Showing 9 changed files with 120 additions and 6 deletions.
6 changes: 6 additions & 0 deletions Common/include/CConfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1173,6 +1173,7 @@ class CConfig {
string caseName; /*!< \brief Name of the current case */

unsigned long edgeColorGroupSize; /*!< \brief Size of the edge groups colored for OpenMP parallelization of edge loops. */
bool edgeColoringRelaxDiscAdj; /*!< \brief Allow fallback to smaller edge color group sizes and use more colors for the discrete adjoint. */

INLET_SPANWISE_INTERP Kind_InletInterpolationFunction; /*!< \brief type of spanwise interpolation function to use for the inlet face. */
INLET_INTERP_TYPE Kind_Inlet_InterpolationType; /*!< \brief type of spanwise interpolation data to use for the inlet face. */
Expand Down Expand Up @@ -9660,6 +9661,11 @@ class CConfig {
*/
unsigned long GetEdgeColoringGroupSize() const {
  /*--- Plain accessor: hand back the stored edge color group size. ---*/
  return edgeColorGroupSize;
}

/*!
 * \brief Query whether the discrete adjoint may relax the edge coloring, i.e. fall back to
 *        smaller edge color group sizes and use a larger number of colors.
 * \return Value of the edgeColoringRelaxDiscAdj flag.
 */
bool GetEdgeColoringRelaxDiscAdj() const {
  return edgeColoringRelaxDiscAdj;
}

/*!
* \brief Get the ParMETIS load balancing tolerance.
*/
Expand Down
6 changes: 5 additions & 1 deletion Common/include/geometry/CGeometry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1720,10 +1720,14 @@ class CGeometry {
/*!
* \brief Get the edge coloring.
* \note This method computes the coloring if that has not been done yet.
* \note Can be instructed to determine and use the maximum edge color group size between 1 and
* CGeometry::edgeColorGroupSize that yields a coloring that is at least as efficient as #COLORING_EFF_THRESH.
* \param[out] efficiency - optional output of the coloring efficiency.
* \param[in] maximizeEdgeColorGroupSize - use the maximum edge color group size that gives an efficient coloring.
* \return Reference to the coloring.
*/
const CCompressedSparsePatternUL& GetEdgeColoring(su2double* efficiency = nullptr);
const CCompressedSparsePatternUL& GetEdgeColoring(su2double* efficiency = nullptr,
bool maximizeEdgeColorGroupSize = false);

/*!
* \brief Force the natural (sequential) edge coloring.
Expand Down
2 changes: 1 addition & 1 deletion Common/include/toolboxes/graph_toolbox.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ T createNaturalColoring(Index_t numInnerIndexes) {
* \param[out] indexColor - Optional, vector with colors given to the outer indices.
* \return Coloring in the same type of the input pattern.
*/
template <typename Color_t = char, size_t MaxColors = 64, size_t MaxMB = 128, class T>
template <typename Color_t = unsigned char, size_t MaxColors = 255, size_t MaxMB = 128, class T>
T colorSparsePattern(const T& pattern, size_t groupSize = 1, bool balanceColors = false,
std::vector<Color_t>* indexColor = nullptr) {
static_assert(std::is_integral<Color_t>::value, "");
Expand Down
3 changes: 3 additions & 0 deletions Common/src/CConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2925,6 +2925,9 @@ void CConfig::SetConfig_Options() {
/* DESCRIPTION: Size of the edge groups colored for thread parallel edge loops (0 forces the reducer strategy). */
addUnsignedLongOption("EDGE_COLORING_GROUP_SIZE", edgeColorGroupSize, 512);

/* DESCRIPTION: Allow fallback to smaller edge color group sizes for the discrete adjoint and allow more colors. */
addBoolOption("EDGE_COLORING_RELAX_DISC_ADJ", edgeColoringRelaxDiscAdj, true);

/*--- options that are used for libROM ---*/
/*!\par CONFIG_CATEGORY:libROM options \ingroup Config*/

Expand Down
57 changes: 55 additions & 2 deletions Common/src/geometry/CGeometry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3609,7 +3609,7 @@ const su2vector<unsigned long>& CGeometry::GetTransposeSparsePatternMap(Connecti
return pattern.transposePtr();
}

const CCompressedSparsePatternUL& CGeometry::GetEdgeColoring(su2double* efficiency) {
const CCompressedSparsePatternUL& CGeometry::GetEdgeColoring(su2double* efficiency, bool maximizeEdgeColorGroupSize) {
/*--- Check for dry run mode with dummy geometry. ---*/
if (nEdge == 0) return edgeColoring;

Expand Down Expand Up @@ -3637,7 +3637,60 @@ const CCompressedSparsePatternUL& CGeometry::GetEdgeColoring(su2double* efficien

/*--- Color the edges. ---*/
constexpr bool balanceColors = true;
edgeColoring = colorSparsePattern(pattern, edgeColorGroupSize, balanceColors);

/*--- If requested, find an efficient coloring with maximum color group size (up to edgeColorGroupSize). ---*/
if (maximizeEdgeColorGroupSize) {
auto upperEdgeColorGroupSize = edgeColorGroupSize + 1; /* upper bound that is deemed too large */
auto nextEdgeColorGroupSize = edgeColorGroupSize; /* next value that we are going to try */
auto lowerEdgeColorGroupSize = 1ul; /* lower bound that is known to work */

bool admissibleColoring = false; /* keep track whether the last tested coloring is admissible */

while (true) {
edgeColoring = colorSparsePattern(pattern, nextEdgeColorGroupSize, balanceColors);

/*--- If the coloring fails, reduce the color group size. ---*/
if (edgeColoring.empty()) {
upperEdgeColorGroupSize = nextEdgeColorGroupSize;
admissibleColoring = false;
}
/*--- If the coloring succeeds, check the efficiency. ---*/
else {
const su2double currentEfficiency =
coloringEfficiency(edgeColoring, omp_get_max_threads(), nextEdgeColorGroupSize);

/*--- If the coloring is not efficient, reduce the color group size. ---*/
if (currentEfficiency < COLORING_EFF_THRESH) {
upperEdgeColorGroupSize = nextEdgeColorGroupSize;
admissibleColoring = false;
}
/*--- Otherwise, enlarge the color group size. ---*/
else {
lowerEdgeColorGroupSize = nextEdgeColorGroupSize;
admissibleColoring = true;
}
}

const auto increment = (upperEdgeColorGroupSize - lowerEdgeColorGroupSize) / 2;
nextEdgeColorGroupSize = lowerEdgeColorGroupSize + increment;

/*--- Terminating condition. ---*/
if (increment == 0) {
break;
}
}

edgeColorGroupSize = nextEdgeColorGroupSize;

/*--- If the last tested coloring was not admissible, recompute the final coloring. ---*/
if (!admissibleColoring) {
edgeColoring = colorSparsePattern(pattern, edgeColorGroupSize, balanceColors);
}
}
/*--- No adaptivity. ---*/
else {
edgeColoring = colorSparsePattern(pattern, edgeColorGroupSize, balanceColors);
}

/*--- If the coloring fails use the natural coloring. This is a
* "soft" failure as this "bad" coloring should be detected
Expand Down
32 changes: 32 additions & 0 deletions SU2_CFD/include/solvers/CFVMFlowSolverBase.inl
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,16 @@ void CFVMFlowSolverBase<V, R>::HybridParallelInitialization(const CConfig& confi
* sum the fluxes for each cell and set the diagonal of the system matrix. ---*/

su2double parallelEff = 1.0;

#ifdef CODI_REVERSE_TYPE
/*--- For the discrete adjoint, the reducer strategy is costly. Prefer coloring, possibly with reduced edge color
* group size. Find the maximum edge color group size that yields an efficient coloring. Also, allow larger numbers
* of colors. ---*/
const bool relax = config.GetEdgeColoringRelaxDiscAdj();
const auto& coloring = geometry.GetEdgeColoring(&parallelEff, relax);
#else
const auto& coloring = geometry.GetEdgeColoring(&parallelEff);
#endif

/*--- The decision to use the strategy is local to each rank. ---*/
ReducerStrategy = parallelEff < COLORING_EFF_THRESH;
Expand Down Expand Up @@ -324,6 +333,29 @@ void CFVMFlowSolverBase<V, R>::HybridParallelInitialization(const CConfig& confi
<< "\n The memory usage of the discrete adjoint solver is higher when using the fallback."
#endif
<< endl;
} else {
if (SU2_MPI::GetRank() == MASTER_NODE) {
cout << "All ranks use edge coloring." << endl;
}
}

const su2double coloredParallelEff = ReducerStrategy ? 1.0 : parallelEff;
su2double minColoredParallelEff = 1.0;
SU2_MPI::Reduce(&coloredParallelEff, &minColoredParallelEff, 1, MPI_DOUBLE, MPI_MIN, MASTER_NODE, SU2_MPI::GetComm());

const unsigned long coloredNumColors = ReducerStrategy ? 0 : coloring.getOuterSize();
unsigned long maxColoredNumColors = 0;
SU2_MPI::Reduce(&coloredNumColors, &maxColoredNumColors, 1, MPI_UNSIGNED_LONG, MPI_MAX, MASTER_NODE, SU2_MPI::GetComm());

const unsigned long coloredEdgeColorGroupSize = ReducerStrategy ? 1 << 30 : geometry.GetEdgeColorGroupSize();
unsigned long minColoredEdgeColorGroupSize = 1 << 30;
SU2_MPI::Reduce(&coloredEdgeColorGroupSize, &minColoredEdgeColorGroupSize, 1, MPI_UNSIGNED_LONG, MPI_MIN, MASTER_NODE, SU2_MPI::GetComm());

if (SU2_MPI::GetRank() == MASTER_NODE && numRanksUsingReducer != SU2_MPI::GetSize()) {
cout << "Among the ranks that use edge coloring,\n"
<< " the minimum efficiency is " << minColoredParallelEff << ",\n"
<< " the maximum number of colors is " << maxColoredNumColors << ",\n"
<< " the minimum edge color group size is " << minColoredEdgeColorGroupSize << "." << endl;
}
}

Expand Down
8 changes: 8 additions & 0 deletions SU2_CFD/include/solvers/CScalarSolver.inl
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,15 @@ CScalarSolver<VariableType>::CScalarSolver(CGeometry* geometry, CConfig* config,
#ifdef HAVE_OMP
/*--- Get the edge coloring, see notes in CEulerSolver's constructor. ---*/
su2double parallelEff = 1.0;
#ifdef CODI_REVERSE_TYPE
/*--- For the discrete adjoint, the reducer strategy is costly. Prefer coloring, possibly with reduced edge color
* group size. Find the maximum edge color group size that yields an efficient coloring. Also, allow larger numbers
* of colors. ---*/
const bool relax = config->GetEdgeColoringRelaxDiscAdj();
const auto& coloring = geometry->GetEdgeColoring(&parallelEff, relax);
#else
const auto& coloring = geometry->GetEdgeColoring(&parallelEff);
#endif

ReducerStrategy = parallelEff < COLORING_EFF_THRESH;

Expand Down
4 changes: 2 additions & 2 deletions TestCases/hybrid_regression_AD.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def main():
pywrapper_FEA_AD_FlowLoad.test_vals_aarch64 = [-0.131745, -0.553214, -0.000364, -0.003101]
pywrapper_FEA_AD_FlowLoad.command = TestCase.Command(exec = "python", param = "run_adjoint.py --parallel -f")
pywrapper_FEA_AD_FlowLoad.timeout = 1600
pywrapper_FEA_AD_FlowLoad.tol = 5e-3
pywrapper_FEA_AD_FlowLoad.tol = 1e-2
pywrapper_FEA_AD_FlowLoad.new_output = False
pywrapper_FEA_AD_FlowLoad.enabled_with_tsan = False
test_list.append(pywrapper_FEA_AD_FlowLoad)
Expand All @@ -257,7 +257,7 @@ def main():
pywrapper_CFD_AD_MeshDisp.test_vals_aarch64 = [30.000000, -2.516536, 1.386443, 0.000000]
pywrapper_CFD_AD_MeshDisp.command = TestCase.Command(exec = "python", param = "run_adjoint.py --parallel -f")
pywrapper_CFD_AD_MeshDisp.timeout = 1600
pywrapper_CFD_AD_MeshDisp.tol = 1e-3
pywrapper_CFD_AD_MeshDisp.tol = 1e-2
pywrapper_CFD_AD_MeshDisp.new_output = False
pywrapper_CFD_AD_MeshDisp.enabled_with_tsan = False
test_list.append(pywrapper_CFD_AD_MeshDisp)
Expand Down
8 changes: 8 additions & 0 deletions config_template.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -2140,6 +2140,14 @@ UQ_DELTA_B= 1.0
% The optimum value/strategy is case-dependent.
EDGE_COLORING_GROUP_SIZE= 512
%
% Coloring tends to perform better for the discrete adjoint than reductions because
% it uses less memory and enables the shared reading optimization for color loops.
% This option allows an automatic fallback to smaller edge color group sizes on ranks
% where the requested edge color group size is not efficient. Specifically, the largest
% edge color group size up to EDGE_COLORING_GROUP_SIZE is chosen that is at least
% 0.875 efficient. Also, this option allows using more colors, up to 255 instead of up to 64.
EDGE_COLORING_RELAX_DISC_ADJ= YES
%
% Independent "threads per MPI rank" setting for LU-SGS and ILU preconditioners.
% For problems where time is spent mostly in the solution of linear systems (e.g. elasticity,
% very high CFL central schemes), AND, if the memory bandwidth of the machine is saturated
Expand Down

0 comments on commit f697ab8

Please sign in to comment.