Merge branch 'develop' into feature_incompressible

su2code · Dec 8, 2023 · f697ab8 · f697ab8
2 parents 1f26c1e + 2c9fbb6
commit f697ab8
Show file tree

Hide file tree

Showing 9 changed files with 120 additions and 6 deletions.
diff --git a/Common/include/CConfig.hpp b/Common/include/CConfig.hpp
@@ -1173,6 +1173,7 @@ class CConfig {
   string caseName;                 /*!< \brief Name of the current case */
 
   unsigned long edgeColorGroupSize; /*!< \brief Size of the edge groups colored for OpenMP parallelization of edge loops. */
+  bool edgeColoringRelaxDiscAdj;    /*!< \brief Allow fallback to smaller edge color group sizes and use more colors for the discrete adjoint. */
 
   INLET_SPANWISE_INTERP Kind_InletInterpolationFunction; /*!< \brief Type of spanwise interpolation function to use for the inlet face. */
   INLET_INTERP_TYPE Kind_Inlet_InterpolationType;    /*!< \brief Type of spanwise interpolation data to use for the inlet face. */
@@ -9660,6 +9661,11 @@ class CConfig {
    */
   unsigned long GetEdgeColoringGroupSize(void) const { return edgeColorGroupSize; }
 
+  /*!
+   * \brief Check if the discrete adjoint is allowed to relax the coloring, that is, allow smaller edge color group sizes and allow more colors.
+   */
+  bool GetEdgeColoringRelaxDiscAdj() const { return edgeColoringRelaxDiscAdj; }
+
   /*!
    * \brief Get the ParMETIS load balancing tolerance.
    */

diff --git a/Common/include/geometry/CGeometry.hpp b/Common/include/geometry/CGeometry.hpp
@@ -1720,10 +1720,14 @@ class CGeometry {
   /*!
    * \brief Get the edge coloring.
    * \note This method computes the coloring if that has not been done yet.
+   * \note Can be instructed to determine and use the maximum edge color group size between 1 and
+   * CGeometry::edgeColorGroupSize that yields a coloring that is at least as efficient as #COLORING_EFF_THRESH.
    * \param[out] efficiency - optional output of the coloring efficiency.
+   * \param[in] maximizeEdgeColorGroupSize - use the maximum edge color group size that gives an efficient coloring.
    * \return Reference to the coloring.
    */
-  const CCompressedSparsePatternUL& GetEdgeColoring(su2double* efficiency = nullptr);
+  const CCompressedSparsePatternUL& GetEdgeColoring(su2double* efficiency = nullptr,
+                                                    bool maximizeEdgeColorGroupSize = false);
 
   /*!
    * \brief Force the natural (sequential) edge coloring.

diff --git a/Common/include/toolboxes/graph_toolbox.hpp b/Common/include/toolboxes/graph_toolbox.hpp
@@ -484,7 +484,7 @@ T createNaturalColoring(Index_t numInnerIndexes) {
  * \param[out] indexColor - Optional, vector with colors given to the outer indices.
  * \return Coloring in the same type of the input pattern.
  */
-template <typename Color_t = char, size_t MaxColors = 64, size_t MaxMB = 128, class T>
+template <typename Color_t = unsigned char, size_t MaxColors = 255, size_t MaxMB = 128, class T>
 T colorSparsePattern(const T& pattern, size_t groupSize = 1, bool balanceColors = false,
                      std::vector<Color_t>* indexColor = nullptr) {
   static_assert(std::is_integral<Color_t>::value, "");

diff --git a/Common/src/CConfig.cpp b/Common/src/CConfig.cpp
@@ -2925,6 +2925,9 @@ void CConfig::SetConfig_Options() {
   /* DESCRIPTION: Size of the edge groups colored for thread parallel edge loops (0 forces the reducer strategy). */
   addUnsignedLongOption("EDGE_COLORING_GROUP_SIZE", edgeColorGroupSize, 512);
 
+  /* DESCRIPTION: Allow fallback to smaller edge color group sizes for the discrete adjoint and allow more colors. */
+  addBoolOption("EDGE_COLORING_RELAX_DISC_ADJ", edgeColoringRelaxDiscAdj, true);
+
   /*--- options that are used for libROM ---*/
   /*!\par CONFIG_CATEGORY:libROM options \ingroup Config*/
 

diff --git a/Common/src/geometry/CGeometry.cpp b/Common/src/geometry/CGeometry.cpp
@@ -3609,7 +3609,7 @@ const su2vector<unsigned long>& CGeometry::GetTransposeSparsePatternMap(Connecti
   return pattern.transposePtr();
 }
 
-const CCompressedSparsePatternUL& CGeometry::GetEdgeColoring(su2double* efficiency) {
+const CCompressedSparsePatternUL& CGeometry::GetEdgeColoring(su2double* efficiency, bool maximizeEdgeColorGroupSize) {
   /*--- Check for dry run mode with dummy geometry. ---*/
   if (nEdge == 0) return edgeColoring;
 
@@ -3637,7 +3637,60 @@ const CCompressedSparsePatternUL& CGeometry::GetEdgeColoring(su2double* efficien
 
     /*--- Color the edges. ---*/
     constexpr bool balanceColors = true;
-    edgeColoring = colorSparsePattern(pattern, edgeColorGroupSize, balanceColors);
+
+    /*--- If requested, find an efficient coloring with maximum color group size (up to edgeColorGroupSize). ---*/
+    if (maximizeEdgeColorGroupSize) {
+      auto upperEdgeColorGroupSize = edgeColorGroupSize + 1; /* upper bound that is deemed too large */
+      auto nextEdgeColorGroupSize = edgeColorGroupSize;      /* next value that we are going to try */
+      auto lowerEdgeColorGroupSize = 1ul;                    /* lower bound that is known to work */
+
+      bool admissibleColoring = false; /* keep track of whether the last tested coloring is admissible */
+
+      while (true) {
+        edgeColoring = colorSparsePattern(pattern, nextEdgeColorGroupSize, balanceColors);
+
+        /*--- If the coloring fails, reduce the color group size. ---*/
+        if (edgeColoring.empty()) {
+          upperEdgeColorGroupSize = nextEdgeColorGroupSize;
+          admissibleColoring = false;
+        }
+        /*--- If the coloring succeeds, check the efficiency. ---*/
+        else {
+          const su2double currentEfficiency =
+              coloringEfficiency(edgeColoring, omp_get_max_threads(), nextEdgeColorGroupSize);
+
+          /*--- If the coloring is not efficient, reduce the color group size. ---*/
+          if (currentEfficiency < COLORING_EFF_THRESH) {
+            upperEdgeColorGroupSize = nextEdgeColorGroupSize;
+            admissibleColoring = false;
+          }
+          /*--- Otherwise, enlarge the color group size. ---*/
+          else {
+            lowerEdgeColorGroupSize = nextEdgeColorGroupSize;
+            admissibleColoring = true;
+          }
+        }
+
+        const auto increment = (upperEdgeColorGroupSize - lowerEdgeColorGroupSize) / 2;
+        nextEdgeColorGroupSize = lowerEdgeColorGroupSize + increment;
+
+        /*--- Terminating condition. ---*/
+        if (increment == 0) {
+          break;
+        }
+      }
+
+      edgeColorGroupSize = nextEdgeColorGroupSize;
+
+      /*--- If the last tested coloring was not admissible, recompute the final coloring. ---*/
+      if (!admissibleColoring) {
+        edgeColoring = colorSparsePattern(pattern, edgeColorGroupSize, balanceColors);
+      }
+    }
+    /*--- No adaptivity. ---*/
+    else {
+      edgeColoring = colorSparsePattern(pattern, edgeColorGroupSize, balanceColors);
+    }
 
     /*--- If the coloring fails use the natural coloring. This is a
      *    "soft" failure as this "bad" coloring should be detected

diff --git a/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl b/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl
@@ -288,7 +288,16 @@ void CFVMFlowSolverBase<V, R>::HybridParallelInitialization(const CConfig& confi
    *    sum the fluxes for each cell and set the diagonal of the system matrix. ---*/
 
   su2double parallelEff = 1.0;
+
+#ifdef CODI_REVERSE_TYPE
+  /*--- For the discrete adjoint, the reducer strategy is costly. Prefer coloring, possibly with reduced edge color
+   *    group size. Find the maximum edge color group size that yields an efficient coloring. Also, allow larger numbers
+   *    of colors. ---*/
+  const bool relax =  config.GetEdgeColoringRelaxDiscAdj();
+  const auto& coloring = geometry.GetEdgeColoring(&parallelEff, relax);
+#else
   const auto& coloring = geometry.GetEdgeColoring(&parallelEff);
+#endif
 
   /*--- The decision to use the strategy is local to each rank. ---*/
   ReducerStrategy = parallelEff < COLORING_EFF_THRESH;
@@ -324,6 +333,29 @@ void CFVMFlowSolverBase<V, R>::HybridParallelInitialization(const CConfig& confi
            << "\n         The memory usage of the discrete adjoint solver is higher when using the fallback."
 #endif
            << endl;
+    } else {
+      if (SU2_MPI::GetRank() == MASTER_NODE) {
+        cout << "All ranks use edge coloring." << endl;
+      }
+    }
+
+    const su2double coloredParallelEff = ReducerStrategy ? 1.0 : parallelEff;
+    su2double minColoredParallelEff = 1.0;
+    SU2_MPI::Reduce(&coloredParallelEff, &minColoredParallelEff, 1, MPI_DOUBLE, MPI_MIN, MASTER_NODE, SU2_MPI::GetComm());
+
+    const unsigned long coloredNumColors = ReducerStrategy ? 0 : coloring.getOuterSize();
+    unsigned long maxColoredNumColors = 0;
+    SU2_MPI::Reduce(&coloredNumColors, &maxColoredNumColors, 1, MPI_UNSIGNED_LONG, MPI_MAX, MASTER_NODE, SU2_MPI::GetComm());
+
+    const unsigned long coloredEdgeColorGroupSize = ReducerStrategy ? 1 << 30 : geometry.GetEdgeColorGroupSize();
+    unsigned long minColoredEdgeColorGroupSize = 1 << 30;
+    SU2_MPI::Reduce(&coloredEdgeColorGroupSize, &minColoredEdgeColorGroupSize, 1, MPI_UNSIGNED_LONG, MPI_MIN, MASTER_NODE, SU2_MPI::GetComm());
+
+    if (SU2_MPI::GetRank() == MASTER_NODE && numRanksUsingReducer != SU2_MPI::GetSize()) {
+      cout << "Among the ranks that use edge coloring,\n"
+           << "         the minimum efficiency is " << minColoredParallelEff << ",\n"
+           << "         the maximum number of colors is " << maxColoredNumColors << ",\n"
+           << "         the minimum edge color group size is " << minColoredEdgeColorGroupSize << "." << endl;
     }
   }
 

diff --git a/SU2_CFD/include/solvers/CScalarSolver.inl b/SU2_CFD/include/solvers/CScalarSolver.inl
@@ -46,7 +46,15 @@ CScalarSolver<VariableType>::CScalarSolver(CGeometry* geometry, CConfig* config,
 #ifdef HAVE_OMP
   /*--- Get the edge coloring, see notes in CEulerSolver's constructor. ---*/
   su2double parallelEff = 1.0;
+#ifdef CODI_REVERSE_TYPE
+  /*--- For the discrete adjoint, the reducer strategy is costly. Prefer coloring, possibly with reduced edge color
+   *    group size. Find the maximum edge color group size that yields an efficient coloring. Also, allow larger numbers
+   *    of colors. ---*/
+  const bool relax =  config->GetEdgeColoringRelaxDiscAdj();
+  const auto& coloring = geometry->GetEdgeColoring(&parallelEff, relax);
+#else
   const auto& coloring = geometry->GetEdgeColoring(&parallelEff);
+#endif
 
   ReducerStrategy = parallelEff < COLORING_EFF_THRESH;
 

diff --git a/TestCases/hybrid_regression_AD.py b/TestCases/hybrid_regression_AD.py
@@ -242,7 +242,7 @@ def main():
     pywrapper_FEA_AD_FlowLoad.test_vals_aarch64 = [-0.131745, -0.553214, -0.000364, -0.003101]
     pywrapper_FEA_AD_FlowLoad.command       = TestCase.Command(exec = "python", param = "run_adjoint.py --parallel -f")
     pywrapper_FEA_AD_FlowLoad.timeout       = 1600
-    pywrapper_FEA_AD_FlowLoad.tol           = 5e-3
+    pywrapper_FEA_AD_FlowLoad.tol           = 1e-2
     pywrapper_FEA_AD_FlowLoad.new_output    = False
     pywrapper_FEA_AD_FlowLoad.enabled_with_tsan = False
     test_list.append(pywrapper_FEA_AD_FlowLoad)
@@ -257,7 +257,7 @@ def main():
     pywrapper_CFD_AD_MeshDisp.test_vals_aarch64 = [30.000000, -2.516536, 1.386443, 0.000000]
     pywrapper_CFD_AD_MeshDisp.command       = TestCase.Command(exec = "python", param = "run_adjoint.py --parallel -f")
     pywrapper_CFD_AD_MeshDisp.timeout       = 1600
-    pywrapper_CFD_AD_MeshDisp.tol           = 1e-3
+    pywrapper_CFD_AD_MeshDisp.tol           = 1e-2
     pywrapper_CFD_AD_MeshDisp.new_output    = False
     pywrapper_CFD_AD_MeshDisp.enabled_with_tsan = False
     test_list.append(pywrapper_CFD_AD_MeshDisp)

diff --git a/config_template.cfg b/config_template.cfg
@@ -2140,6 +2140,14 @@ UQ_DELTA_B= 1.0
 % The optimum value/strategy is case-dependent.
 EDGE_COLORING_GROUP_SIZE= 512
 %
+% Coloring tends to perform better for the discrete adjoint than reductions because
+% it uses less memory and enables the shared reading optimization for color loops.
+% This option allows an automatic fallback to smaller edge color group sizes on ranks
+% where the requested edge color group size is not efficient. Specifically, the largest
+% edge color group size up to EDGE_COLORING_GROUP_SIZE is chosen that is at least
+% 0.875 efficient. Also, this option allows using more colors, up to 255 instead of up to 64.
+EDGE_COLORING_RELAX_DISC_ADJ= YES
+%
 % Independent "threads per MPI rank" setting for LU-SGS and ILU preconditioners.
 % For problems where time is spent mostly in the solution of linear systems (e.g. elasticity,
 % very high CFL central schemes), AND, if the memory bandwidth of the machine is saturated