From 52aa830974ff4d3b5a232945129ccfdcf02c2809 Mon Sep 17 00:00:00 2001
From: Gerald Paul Bowen Collom
Date: Tue, 27 Jun 2023 23:06:19 -0700
Subject: [PATCH] Add locality-aware MPI communication

---
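Note (editorial, placed in the format-patch comment area that git am ignores): this patch routes hypre's ParCSR halo exchange through MPI Advance's locality-aware persistent neighbor collectives once an AMG level reaches a run-time threshold stored in the hypre handle. The fragment below is an illustrative sketch only, not part of the patch; it mirrors what the new -node_aware_lvl_threshold option added to src/test/ij.c does, and assumes hypre was configured with HYPRE_USING_NODE_AWARE_MPI and linked against mpi_advance. The helper name set_node_aware_switchover is hypothetical.

/* Hypothetical driver fragment: keep standard point-to-point halo exchange on
 * AMG levels below `level` and switch to the MPIX neighbor collectives from
 * that level down, using the handle accessor this patch adds. */
#include "_hypre_utilities.h"   /* hypre_handle(), threshold accessor */

static void set_node_aware_switchover(HYPRE_Int level)
{
#if defined(HYPRE_USING_NODE_AWARE_MPI)
   hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle()) = level;
#endif
}

/* Usage: call set_node_aware_switchover(2) before HYPRE_BoomerAMGSetup(). */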
 src/parcsr_ls/par_amg_setup.c         |  16 +
 src/parcsr_mv/_hypre_parcsr_mv.h      |  23 ++
 src/parcsr_mv/new_commpkg.c           |  74 +++++
 src/parcsr_mv/par_csr_communication.c | 406 +++++++++++++++++++++++++-
 src/parcsr_mv/par_csr_communication.h |  17 ++
 src/parcsr_mv/protos.h                |   6 +
 src/test/ij.c                         |  10 +
 src/utilities/_hypre_utilities.h      |  10 +
 src/utilities/general.c               |   4 +
 src/utilities/handle.h                |  10 +
 10 files changed, 572 insertions(+), 4 deletions(-)

diff --git a/src/parcsr_ls/par_amg_setup.c b/src/parcsr_ls/par_amg_setup.c
index e21817d49b..07f77a66eb 100644
--- a/src/parcsr_ls/par_amg_setup.c
+++ b/src/parcsr_ls/par_amg_setup.c
@@ -3139,6 +3139,22 @@ hypre_BoomerAMGSetup( void *amg_vdata,
 #if defined(HYPRE_USING_GPU)
       if (exec == HYPRE_EXEC_HOST)
 #endif
+
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+      //hypre_printf("%d\n", hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle()));
+      if (level >= hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle()))
+      {
+         hypre_ParCSRMatrixCommPkg(A_array[level])->use_neighbor = 1;
+         //hypre_printf("Level %d: use neighbor\n", level);
+         hypre_ParCSRCreateCommGraph( hypre_ParCSRMatrixFirstColDiag(A_array[level]),
+                                      hypre_ParCSRMatrixColMapOffd(A_array[level]),
+                                      hypre_ParCSRMatrixComm(A_array[level]),
+                                      hypre_ParCSRMatrixCommPkg(A_array[level]));
+         // Create comm handle in setup, so cost doesn't contribute to solve time
+         hypre_ParCSRCommPkgGetPersistentCommHandle(1, hypre_ParCSRMatrixCommPkg(A_array[level]));
+      }
+#endif
+
       {
          HYPRE_Real size = ((HYPRE_Real)fine_size) * .75;
          if (coarsen_type > 0 && coarse_size >= (HYPRE_BigInt)size)
diff --git a/src/parcsr_mv/_hypre_parcsr_mv.h b/src/parcsr_mv/_hypre_parcsr_mv.h
index 8a659feebc..2bee7205b9 100644
--- a/src/parcsr_mv/_hypre_parcsr_mv.h
+++ b/src/parcsr_mv/_hypre_parcsr_mv.h
@@ -24,6 +24,10 @@ extern "C" {
 #ifndef HYPRE_PAR_CSR_COMMUNICATION_HEADER
 #define HYPRE_PAR_CSR_COMMUNICATION_HEADER
 
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+#include "mpi_advance.h"
+#endif
+
 /*--------------------------------------------------------------------------
  * hypre_ParCSRCommPkg:
  *   Structure containing information for doing communications
@@ -60,6 +64,9 @@ typedef struct
    void               *recv_data_buffer;
    HYPRE_Int           num_requests;
    hypre_MPI_Request  *requests;
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   MPIX_Request       *Xrequest;
+#endif
 } hypre_ParCSRCommHandle;
 
 typedef hypre_ParCSRCommHandle hypre_ParCSRPersistentCommHandle;
@@ -67,6 +74,11 @@ typedef hypre_ParCSRCommHandle hypre_ParCSRPersistentCommHandle;
 typedef struct _hypre_ParCSRCommPkg
 {
    MPI_Comm             comm;
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   MPIX_Comm           *neighbor_comm;
+   MPIX_Topo           *neighbor_topo;
+   MPIX_Topo           *neighborT_topo;
+#endif
    HYPRE_Int            num_components;
    HYPRE_Int            num_sends;
    HYPRE_Int           *send_procs;
@@ -76,6 +88,11 @@ typedef struct _hypre_ParCSRCommPkg
    HYPRE_Int            num_recvs;
    HYPRE_Int           *recv_procs;
    HYPRE_Int           *recv_vec_starts;
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   HYPRE_Int            use_neighbor;
+   long                *global_send_indices;
+   long                *global_recv_indices;
+#endif
    /* remote communication information */
    hypre_MPI_Datatype  *send_mpi_types;
    hypre_MPI_Datatype  *recv_mpi_types;
@@ -798,6 +815,12 @@ HYPRE_Int hypre_RangeFillResponseIJDetermineRecvProcs ( void *p_recv_contact_buf
 HYPRE_Int hypre_FillResponseIJDetermineSendProcs ( void *p_recv_contact_buf, HYPRE_Int contact_size,
                                                    HYPRE_Int contact_proc, void *ro, MPI_Comm comm,
                                                    void **p_send_response_buf,
                                                    HYPRE_Int *response_message_size );
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+void hypre_ParCSRCreateCommGraph( HYPRE_BigInt first_col_diag,
+                                  HYPRE_BigInt *col_map_offd,
+                                  MPI_Comm comm,
+                                  hypre_ParCSRCommPkg *comm_pkg );
+#endif
 
 /* numbers.c */
 hypre_NumbersNode *hypre_NumbersNewNode ( void );
diff --git a/src/parcsr_mv/new_commpkg.c b/src/parcsr_mv/new_commpkg.c
index f835fa3e91..3c7dc6f84a 100644
--- a/src/parcsr_mv/new_commpkg.c
+++ b/src/parcsr_mv/new_commpkg.c
@@ -11,6 +11,9 @@
  *-----------------------------------------------------*/
 
 #include "_hypre_parcsr_mv.h"
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+#include <mpi_advance.h>
+#endif
 
 /* some debugging tools*/
 #define mydebug 0
@@ -584,6 +587,20 @@ hypre_NewCommPkgDestroy( hypre_ParCSRMatrix *parcsr_A )
       hypre_TFree(hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), HYPRE_MEMORY_HOST);
    }
 
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   HYPRE_ANNOTATE_REGION_BEGIN("%s", "MPI graph free");
+   if (comm_pkg->neighbor_topo) {
+      MPIX_Topo_free(comm_pkg->neighbor_topo);
+   }
+   if (comm_pkg->neighborT_topo) {
+      MPIX_Topo_free(comm_pkg->neighborT_topo);
+   }
+   if (comm_pkg->neighbor_comm) {
+      MPIX_Comm_free(comm_pkg->neighbor_comm);
+   }
+   HYPRE_ANNOTATE_REGION_END("%s", "MPI graph free");
+#endif
+
    hypre_TFree(comm_pkg, HYPRE_MEMORY_HOST);
 
    hypre_ParCSRMatrixCommPkg(parcsr_A) = NULL;
@@ -757,3 +772,62 @@ hypre_FillResponseIJDetermineSendProcs(void *p_recv_contact_buf,
 
    return hypre_error_flag;
 }
+
+/*--------------------------------------------------------------------
+ * hypre_ParCSRCreateCommGraph
+ *
+ * Create communication topology graph for MPI neighborhood
+ * collectives
+ *--------------------------------------------------------------------*/
+
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+void
+hypre_ParCSRCreateCommGraph(HYPRE_BigInt first_col_diag,
+                            HYPRE_BigInt *col_map_offd,
+                            MPI_Comm comm,
+                            hypre_ParCSRCommPkg *comm_pkg) {
+   HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
+   HYPRE_Int num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
+   HYPRE_Int *send_map_starts = hypre_ParCSRCommPkgSendMapStarts(comm_pkg);
+   HYPRE_Int *recv_vec_starts = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg);
+   HYPRE_Int *send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg);
+
+   int *sendcounts = (int *)malloc(num_sends * sizeof(int));
+   int *recvcounts = (int *)malloc(num_recvs * sizeof(int));
+   for (int i = 0; i < num_sends; i++) {
+      sendcounts[i] = send_map_starts[i+1] - send_map_starts[i];
+   }
+   for (int i = 0; i < num_recvs; i++) {
+      recvcounts[i] = recv_vec_starts[i+1] - recv_vec_starts[i];
+   }
+
+   HYPRE_ANNOTATE_REGION_BEGIN("%s", "MPI topo creation");
+   MPIX_Comm_init(&comm_pkg->neighbor_comm, comm);
+   MPIX_Topo_dist_graph_create_adjacent(num_recvs, hypre_ParCSRCommPkgRecvProcs(comm_pkg),
+                                        recvcounts,
+                                        num_sends, hypre_ParCSRCommPkgSendProcs(comm_pkg),
+                                        sendcounts,
+                                        MPI_INFO_NULL, 0, &(comm_pkg->neighbor_topo));
+   MPIX_Topo_dist_graph_create_adjacent(num_sends, hypre_ParCSRCommPkgSendProcs(comm_pkg),
+                                        sendcounts,
+                                        num_recvs, hypre_ParCSRCommPkgRecvProcs(comm_pkg),
+                                        recvcounts,
+                                        MPI_INFO_NULL, 0, &(comm_pkg->neighborT_topo));
+   HYPRE_ANNOTATE_REGION_END("%s", "MPI topo creation");
+
+   HYPRE_Int num_send_elmts = send_map_starts[num_sends];
+   comm_pkg->global_send_indices = hypre_CTAlloc(long, num_send_elmts, HYPRE_MEMORY_HOST);
+   for (int i = 0; i < num_sends; i++) {
+      for (int j = send_map_starts[i]; j < send_map_starts[i+1]; j++) {
+         comm_pkg->global_send_indices[j] = send_map_elmts[j] + first_col_diag;
+      }
+   }
+   HYPRE_Int num_recv_elmts = recv_vec_starts[num_recvs];
+   comm_pkg->global_recv_indices = hypre_CTAlloc(long,
+                                                 num_recv_elmts, HYPRE_MEMORY_HOST);
+   for (int i = 0; i < num_recvs; i++) {
+      for (int j = recv_vec_starts[i]; j < recv_vec_starts[i+1]; j++) {
+         comm_pkg->global_recv_indices[j] = col_map_offd[j];
+      }
+   }
+}
+#endif
diff --git a/src/parcsr_mv/par_csr_communication.c b/src/parcsr_mv/par_csr_communication.c
index d6d7a61832..e0cc5b48ac 100644
--- a/src/parcsr_mv/par_csr_communication.c
+++ b/src/parcsr_mv/par_csr_communication.c
@@ -7,6 +7,10 @@
 
 #include "_hypre_parcsr_mv.h"
 
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+#include "mpi_advance.h"
+#endif
+
 /*==========================================================================*/
 
 #ifdef HYPRE_USING_PERSISTENT_COMM
@@ -60,11 +64,41 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
    HYPRE_Int num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
   MPI_Comm  comm      = hypre_ParCSRCommPkgComm(comm_pkg);
 
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   HYPRE_Int num_requests;
+   hypre_MPI_Request *requests;
+
+   HYPRE_Int *send_sizes;
+   HYPRE_Int *recv_sizes;
+   HYPRE_BigInt num_send_elmts;
+   HYPRE_BigInt num_recv_elmts;
+   MPIX_Request *Xrequest;
+
+   if (comm_pkg->use_neighbor) {
+      if (comm_pkg->neighbor_comm == NULL) {
+         hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Trying to communicate with a NULL communicator\n");
+      }
+
+      send_sizes = hypre_TAlloc(HYPRE_Int, num_sends, HYPRE_MEMORY_HOST);
+      recv_sizes = hypre_TAlloc(HYPRE_Int, num_recvs, HYPRE_MEMORY_HOST);
+
+      num_send_elmts = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
+      num_recv_elmts = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs);
+   } else {
+      num_requests = num_sends + num_recvs;
+      requests = hypre_CTAlloc(hypre_MPI_Request, num_requests, HYPRE_MEMORY_HOST);
+
+      hypre_ParCSRCommHandleNumRequests(comm_handle) = num_requests;
+      hypre_ParCSRCommHandleRequests(comm_handle) = requests;
+   }
+#else
    HYPRE_Int num_requests = num_sends + num_recvs;
    hypre_MPI_Request *requests = hypre_CTAlloc(hypre_MPI_Request, num_requests, HYPRE_MEMORY_HOST);
 
    hypre_ParCSRCommHandleNumRequests(comm_handle) = num_requests;
    hypre_ParCSRCommHandleRequests(comm_handle) = requests;
+#endif
 
    void *send_buff = NULL, *recv_buff = NULL;
 
@@ -77,6 +111,47 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
                                  HYPRE_MEMORY_HOST);
          recv_buff = hypre_TAlloc(HYPRE_Complex, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs),
                                   HYPRE_MEMORY_HOST);
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+         if (!comm_pkg->use_neighbor) {
+            for (i = 0; i < num_recvs; ++i)
+            {
+               HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
+               HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
+               HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
+               hypre_MPI_Recv_init( (HYPRE_Complex *)recv_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX,
+                                    ip, 0, comm, requests + i );
+            }
+            for (i = 0; i < num_sends; ++i)
+            {
+               HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
+               HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+               HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
+               hypre_MPI_Send_init( (HYPRE_Complex *)send_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX,
+                                    ip, 0, comm, requests + num_recvs + i );
+            }
+         } else {
+            for (i = 0; i < num_recvs; ++i)
+            {
+               recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) -
+                                hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i));
+            }
+            for (i = 0; i < num_sends; ++i)
+            {
+               send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) -
+                                hypre_ParCSRCommPkgSendMapStart(comm_pkg, i));
+            }
+            MPIX_Neighbor_locality_topo_alltoallv_init( (HYPRE_Complex *)send_buff, send_sizes,
+                                                        hypre_ParCSRCommPkgSendMapStarts(comm_pkg),
+                                                        comm_pkg->global_send_indices,
+                                                        HYPRE_MPI_COMPLEX,
+                                                        (HYPRE_Complex *)recv_buff, recv_sizes,
+                                                        hypre_ParCSRCommPkgRecvVecStarts(comm_pkg),
+                                                        comm_pkg->global_recv_indices,
+                                                        HYPRE_MPI_COMPLEX, comm_pkg->neighbor_topo,
+                                                        comm_pkg->neighbor_comm,
+                                                        MPI_INFO_NULL, &Xrequest);
+         }
+#else
         for (i = 0; i < num_recvs; ++i)
         {
            HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
@@ -93,6 +168,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
            hypre_MPI_Send_init( (HYPRE_Complex *)send_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX,
                                 ip, 0, comm, requests + num_recvs + i );
         }
+#endif
         break;
 
      case HYPRE_COMM_PKG_JOB_COMPLEX_TRANSPOSE:
@@ -102,6 +178,45 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
                                  HYPRE_MEMORY_HOST);
         send_buff = hypre_TAlloc(HYPRE_Complex, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs),
                                  HYPRE_MEMORY_HOST);
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+         if (!comm_pkg->use_neighbor) {
+            for (i = 0; i < num_sends; ++i)
+            {
+               HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
+               HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+               HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
+               hypre_MPI_Recv_init( (HYPRE_Complex *)recv_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX,
+                                    ip, 0, comm, requests + i );
+            }
+            for (i = 0; i < num_recvs; ++i)
+            {
+               HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
+               HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
+               HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
+               hypre_MPI_Send_init( (HYPRE_Complex *)send_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX,
+                                    ip, 0, comm, requests + num_sends + i );
+            }
+         } else {
+            for (i = 0; i < num_recvs; ++i)
+            {
+               recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) -
+                                hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i));
+            }
+            for (i = 0; i < num_sends; ++i)
+            {
+               send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) -
+                                hypre_ParCSRCommPkgSendMapStart(comm_pkg, i));
+            }
+            MPIX_Neighbor_part_locality_topo_alltoallv_init( (HYPRE_Complex *)send_buff, recv_sizes,
+                                                             hypre_ParCSRCommPkgRecvVecStarts(comm_pkg),
+                                                             HYPRE_MPI_COMPLEX,
+                                                             (HYPRE_Complex *)recv_buff, send_sizes,
+                                                             hypre_ParCSRCommPkgSendMapStarts(comm_pkg),
+                                                             HYPRE_MPI_COMPLEX, comm_pkg->neighborT_topo,
+                                                             comm_pkg->neighbor_comm,
+                                                             MPI_INFO_NULL, &Xrequest);
+         }
+#else
         for (i = 0; i < num_sends; ++i)
         {
            HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
@@ -118,6 +233,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
            hypre_MPI_Send_init( (HYPRE_Complex *)send_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX,
                                 ip, 0, comm, requests + num_sends + i );
         }
+#endif
         break;
 
      case HYPRE_COMM_PKG_JOB_INT:
@@ -127,6 +243,47 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
                                  HYPRE_MEMORY_HOST);
         recv_buff = hypre_TAlloc(HYPRE_Int, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs),
                                  HYPRE_MEMORY_HOST);
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+         if (!comm_pkg->use_neighbor) {
+            for (i = 0; i < num_recvs; ++i)
+            {
+               HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
+               HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
+               HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
+               hypre_MPI_Recv_init( (HYPRE_Int *)recv_buff + vec_start, vec_len,
+                                    HYPRE_MPI_INT,
+                                    ip, 0, comm, requests + i );
+            }
+            for (i = 0; i < num_sends; ++i)
+            {
+               HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
+               HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+               HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
+               hypre_MPI_Send_init( (HYPRE_Int *)send_buff + vec_start, vec_len, HYPRE_MPI_INT,
+                                    ip, 0, comm, requests + num_recvs + i );
+            }
+         } else {
+            for (i = 0; i < num_recvs; ++i)
+            {
+               recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) -
+                                hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i));
+            }
+            for (i = 0; i < num_sends; ++i)
+            {
+               send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) -
+                                hypre_ParCSRCommPkgSendMapStart(comm_pkg, i));
+            }
+            MPIX_Neighbor_locality_topo_alltoallv_init( (HYPRE_Int *)send_buff, send_sizes,
+                                                        hypre_ParCSRCommPkgSendMapStarts(comm_pkg),
+                                                        comm_pkg->global_send_indices,
+                                                        HYPRE_MPI_INT,
+                                                        (HYPRE_Int *)recv_buff, recv_sizes,
+                                                        hypre_ParCSRCommPkgRecvVecStarts(comm_pkg),
+                                                        comm_pkg->global_recv_indices,
+                                                        HYPRE_MPI_INT, comm_pkg->neighbor_topo,
+                                                        comm_pkg->neighbor_comm,
+                                                        MPI_INFO_NULL, &Xrequest);
+         }
+#else
         for (i = 0; i < num_recvs; ++i)
         {
            HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
@@ -143,6 +300,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
            hypre_MPI_Send_init( (HYPRE_Int *)send_buff + vec_start, vec_len, HYPRE_MPI_INT,
                                 ip, 0, comm, requests + num_recvs + i );
         }
+#endif
         break;
 
      case HYPRE_COMM_PKG_JOB_INT_TRANSPOSE:
@@ -152,6 +310,45 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
                                  HYPRE_MEMORY_HOST);
         send_buff = hypre_TAlloc(HYPRE_Int, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs),
                                  HYPRE_MEMORY_HOST);
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+         if (!comm_pkg->use_neighbor) {
+            for (i = 0; i < num_sends; ++i)
+            {
+               HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
+               HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+               HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
+               hypre_MPI_Recv_init( (HYPRE_Int *)recv_buff + vec_start, vec_len, HYPRE_MPI_INT,
+                                    ip, 0, comm, requests + i );
+            }
+            for (i = 0; i < num_recvs; ++i)
+            {
+               HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
+               HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
+               HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
+               hypre_MPI_Send_init( (HYPRE_Int *)send_buff + vec_start, vec_len, HYPRE_MPI_INT,
+                                    ip, 0, comm, requests + num_sends + i );
+            }
+         } else {
+            for (i = 0; i < num_recvs; ++i)
+            {
+               recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) -
+                                hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i));
+            }
+            for (i = 0; i < num_sends; ++i)
+            {
+               send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) -
+                                hypre_ParCSRCommPkgSendMapStart(comm_pkg, i));
+            }
+            MPIX_Neighbor_part_locality_topo_alltoallv_init( (HYPRE_Int *)send_buff, recv_sizes,
+                                                             hypre_ParCSRCommPkgRecvVecStarts(comm_pkg),
+                                                             HYPRE_MPI_INT,
+                                                             (HYPRE_Int *)recv_buff, send_sizes,
+                                                             hypre_ParCSRCommPkgSendMapStarts(comm_pkg),
+                                                             HYPRE_MPI_INT, comm_pkg->neighborT_topo,
+                                                             comm_pkg->neighbor_comm,
+                                                             MPI_INFO_NULL, &Xrequest);
+         }
+#else
         for (i = 0; i < num_sends; ++i)
         {
            HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
@@ -168,6 +365,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
            hypre_MPI_Send_init( (HYPRE_Int *)send_buff + vec_start, vec_len, HYPRE_MPI_INT,
                                 ip, 0, comm, requests + num_sends + i );
         }
+#endif
         break;
 
      case HYPRE_COMM_PKG_JOB_BIGINT:
@@ -177,6 +375,49 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
                                  HYPRE_MEMORY_HOST);
         recv_buff = hypre_TAlloc(HYPRE_BigInt, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs),
                                  HYPRE_MEMORY_HOST);
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+         if (!comm_pkg->use_neighbor) {
+            for (i = 0; i < num_recvs; ++i)
+            {
+               HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
+               HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
+               HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
+               hypre_MPI_Recv_init( (HYPRE_BigInt *)recv_buff + (HYPRE_BigInt)vec_start, vec_len,
+                                    HYPRE_MPI_BIG_INT,
+                                    ip, 0, comm, requests + i );
+            }
+            for (i = 0; i < num_sends; ++i)
+            {
+               HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
+               HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+               HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
+               hypre_MPI_Send_init( (HYPRE_BigInt *)send_buff + (HYPRE_BigInt)vec_start, vec_len,
+                                    HYPRE_MPI_BIG_INT,
+                                    ip, 0, comm, requests + num_recvs + i);
+            }
+         } else {
+            for (i = 0; i < num_recvs; ++i)
+            {
+               recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) -
+                                hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i));
+            }
+            for (i = 0; i < num_sends; ++i)
+            {
+               send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) -
+                                hypre_ParCSRCommPkgSendMapStart(comm_pkg, i));
+            }
+            MPIX_Neighbor_locality_topo_alltoallv_init( (HYPRE_BigInt *)send_buff, send_sizes,
+                                                        hypre_ParCSRCommPkgSendMapStarts(comm_pkg),
+                                                        comm_pkg->global_send_indices,
+                                                        HYPRE_MPI_BIG_INT,
+                                                        (HYPRE_BigInt *)recv_buff, recv_sizes,
+                                                        hypre_ParCSRCommPkgRecvVecStarts(comm_pkg),
+                                                        comm_pkg->global_recv_indices,
+                                                        HYPRE_MPI_BIG_INT, comm_pkg->neighbor_topo,
+                                                        comm_pkg->neighbor_comm,
+                                                        MPI_INFO_NULL, &Xrequest);
+         }
+#else
         for (i = 0; i < num_recvs; ++i)
         {
            HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
@@ -195,6 +436,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
                                 HYPRE_MPI_BIG_INT,
                                 ip, 0, comm, requests + num_recvs + i);
         }
+#endif
         break;
 
      case HYPRE_COMM_PKG_JOB_BIGINT_TRANSPOSE:
@@ -204,6 +446,47 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
                                  HYPRE_MEMORY_HOST);
         send_buff = hypre_TAlloc(HYPRE_BigInt, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs),
                                  HYPRE_MEMORY_HOST);
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+         if (!comm_pkg->use_neighbor) {
+            for (i = 0; i < num_sends; ++i)
+            {
+               HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
+               HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+               HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
+               hypre_MPI_Recv_init( (HYPRE_BigInt *)recv_buff + (HYPRE_BigInt)vec_start, vec_len,
+                                    HYPRE_MPI_BIG_INT,
+                                    ip, 0, comm, requests + i );
+            }
+            for (i = 0; i < num_recvs; ++i)
+            {
+               HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
+               HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
+               HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
+               hypre_MPI_Send_init( (HYPRE_BigInt *)send_buff + (HYPRE_BigInt)vec_start, vec_len,
+                                    HYPRE_MPI_BIG_INT,
+                                    ip, 0, comm, requests + num_sends + i);
+            }
+         } else {
+            for (i = 0; i < num_recvs; ++i)
+            {
+               recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) -
+                                hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i));
+            }
+            for (i = 0; i < num_sends; ++i)
+            {
+               send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) -
+                                hypre_ParCSRCommPkgSendMapStart(comm_pkg, i));
+            }
+            MPIX_Neighbor_part_locality_topo_alltoallv_init( (HYPRE_BigInt *)send_buff, recv_sizes,
+                                                             hypre_ParCSRCommPkgRecvVecStarts(comm_pkg),
+                                                             HYPRE_MPI_BIG_INT,
+                                                             (HYPRE_BigInt *)recv_buff, send_sizes,
+                                                             hypre_ParCSRCommPkgSendMapStarts(comm_pkg),
+                                                             HYPRE_MPI_BIG_INT, comm_pkg->neighborT_topo,
+                                                             comm_pkg->neighbor_comm,
+                                                             MPI_INFO_NULL, &Xrequest);
+         }
+#else
         for (i = 0; i < num_sends; ++i)
         {
            HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
@@ -218,22 +501,33 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm
            HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
            HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
            HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
-
            hypre_MPI_Send_init( (HYPRE_BigInt *)send_buff + (HYPRE_BigInt)vec_start, vec_len,
                                 HYPRE_MPI_BIG_INT,
                                 ip, 0, comm, requests + num_sends + i);
         }
+#endif
         break;
 
      default:
         hypre_assert(1 == 0);
         break;
   } // switch (job_type)
+
   hypre_ParCSRCommHandleCommPkg(comm_handle) = comm_pkg;
   hypre_ParCSRCommHandleRecvDataBuffer(comm_handle) = recv_buff;
   hypre_ParCSRCommHandleSendDataBuffer(comm_handle) = send_buff;
   hypre_ParCSRCommHandleNumSendBytes(comm_handle) = num_bytes_send;
   hypre_ParCSRCommHandleNumRecvBytes(comm_handle) = num_bytes_recv;
+
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   if (comm_pkg->use_neighbor) {
+      comm_handle->Xrequest = Xrequest;
+
+      hypre_TFree(send_sizes, HYPRE_MEMORY_HOST);
+      hypre_TFree(recv_sizes, HYPRE_MEMORY_HOST);
+   }
+#endif
+
   return ( comm_handle );
 }
 
@@ -266,8 +560,14 @@ hypre_ParCSRPersistentCommHandleDestroy( hypre_ParCSRPersistentCommHandle *comm_
   {
      hypre_TFree(hypre_ParCSRCommHandleSendDataBuffer(comm_handle), HYPRE_MEMORY_HOST);
      hypre_TFree(hypre_ParCSRCommHandleRecvDataBuffer(comm_handle), HYPRE_MEMORY_HOST);
-      hypre_TFree(comm_handle->requests, HYPRE_MEMORY_HOST);
-
+      if (comm_handle->requests) {
+         hypre_TFree(comm_handle->requests, HYPRE_MEMORY_HOST);
+      }
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+      if (comm_handle->Xrequest) {
+         MPIX_Request_free(comm_handle->Xrequest);
+      }
+#endif
      hypre_TFree(comm_handle, HYPRE_MEMORY_HOST);
   }
 }
@@ -284,6 +584,43 @@ hypre_ParCSRPersistentCommHandleStart( hypre_ParCSRPersistentCommHandle *comm_ha
   hypre_ParCSRCommHandleSendData(comm_handle) = send_data;
   hypre_ParCSRCommHandleSendMemoryLocation(comm_handle) = send_memory_location;
 
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRCommHandleCommPkg(comm_handle);
+   if (!comm_pkg->use_neighbor) {
+      if (hypre_ParCSRCommHandleNumRequests(comm_handle) > 0)
+      {
+         hypre_TMemcpy( hypre_ParCSRCommHandleSendDataBuffer(comm_handle),
+                        send_data,
+                        char,
+                        hypre_ParCSRCommHandleNumSendBytes(comm_handle),
+                        HYPRE_MEMORY_HOST,
+                        send_memory_location );
+         HYPRE_Int ret = hypre_MPI_Startall(hypre_ParCSRCommHandleNumRequests(comm_handle),
+                                            hypre_ParCSRCommHandleRequests(comm_handle));
+         if (hypre_MPI_SUCCESS != ret)
+         {
+            hypre_error_w_msg(HYPRE_ERROR_GENERIC, "MPI error\n");
+            /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/
+         }
+      }
+   } else {
+      if (comm_handle->Xrequest)
+      {
+         hypre_TMemcpy( hypre_ParCSRCommHandleSendDataBuffer(comm_handle),
+                        send_data,
+                        char,
+                        hypre_ParCSRCommHandleNumSendBytes(comm_handle),
+                        HYPRE_MEMORY_HOST,
+                        send_memory_location );
+         HYPRE_Int ret = (HYPRE_Int) MPIX_Start(comm_handle->Xrequest);
+         if (hypre_MPI_SUCCESS != ret)
+         {
+            hypre_error_w_msg(HYPRE_ERROR_GENERIC, "MPI error\n");
+            /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/
+         }
+      }
+   }
+#else
   if (hypre_ParCSRCommHandleNumRequests(comm_handle) > 0)
   {
      hypre_TMemcpy( hypre_ParCSRCommHandleSendDataBuffer(comm_handle),
@@ -292,7 +629,6 @@ hypre_ParCSRPersistentCommHandleStart( hypre_ParCSRPersistentCommHandle *comm_ha
                     hypre_ParCSRCommHandleNumSendBytes(comm_handle),
                     HYPRE_MEMORY_HOST,
                     send_memory_location );
-
      HYPRE_Int ret = hypre_MPI_Startall(hypre_ParCSRCommHandleNumRequests(comm_handle),
                                         hypre_ParCSRCommHandleRequests(comm_handle));
      if (hypre_MPI_SUCCESS != ret)
@@ -301,6 +637,7 @@ hypre_ParCSRPersistentCommHandleStart( hypre_ParCSRPersistentCommHandle *comm_ha
        /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/
      }
   }
+#endif
 }
 
 /*------------------------------------------------------------------
@@ -315,6 +652,48 @@ hypre_ParCSRPersistentCommHandleWait( hypre_ParCSRPersistentCommHandle *comm_han
   hypre_ParCSRCommHandleRecvData(comm_handle) = recv_data;
   hypre_ParCSRCommHandleRecvMemoryLocation(comm_handle) = recv_memory_location;
 
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRCommHandleCommPkg(comm_handle);
+   if (!comm_pkg->use_neighbor) {
+      if (hypre_ParCSRCommHandleNumRequests(comm_handle) > 0)
+      {
+         HYPRE_Int ret = hypre_MPI_Waitall(hypre_ParCSRCommHandleNumRequests(comm_handle),
+                                           hypre_ParCSRCommHandleRequests(comm_handle),
+                                           hypre_MPI_STATUSES_IGNORE);
+         if (hypre_MPI_SUCCESS != ret)
+         {
+            hypre_error_w_msg(HYPRE_ERROR_GENERIC, "MPI error\n");
+            /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/
+         }
+
+         hypre_TMemcpy(recv_data,
+                       hypre_ParCSRCommHandleRecvDataBuffer(comm_handle),
+                       char,
+                       hypre_ParCSRCommHandleNumRecvBytes(comm_handle),
+                       recv_memory_location,
+                       HYPRE_MEMORY_HOST);
+      }
+   } else {
+      if (comm_handle->Xrequest)
+      {
+         HYPRE_Int ret = (HYPRE_Int) MPIX_Wait(comm_handle->Xrequest,
+                                               MPI_STATUS_IGNORE);
+
+         if (hypre_MPI_SUCCESS != ret)
+         {
+            hypre_error_w_msg(HYPRE_ERROR_GENERIC, "MPI error\n");
+            /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/
+         }
+
+         hypre_TMemcpy(recv_data,
+                       hypre_ParCSRCommHandleRecvDataBuffer(comm_handle),
+                       char,
+                       hypre_ParCSRCommHandleNumRecvBytes(comm_handle),
+                       recv_memory_location,
+                       HYPRE_MEMORY_HOST);
+      }
+   }
+#else
   if (hypre_ParCSRCommHandleNumRequests(comm_handle) > 0)
   {
      HYPRE_Int ret = hypre_MPI_Waitall(hypre_ParCSRCommHandleNumRequests(comm_handle),
@@ -333,6 +712,7 @@ hypre_ParCSRPersistentCommHandleWait( hypre_ParCSRPersistentCommHandle *comm_han
                    recv_memory_location,
                    HYPRE_MEMORY_HOST);
   }
+#endif
 }
 
 #endif // HYPRE_USING_PERSISTENT_COMM
@@ -1030,6 +1410,10 @@ hypre_ParCSRCommPkgCreateAndFill( MPI_Comm comm,
   }
 #endif
 
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   comm_pkg->use_neighbor = 0;
+#endif
+
   /* Set input info */
   hypre_ParCSRCommPkgComm(comm_pkg)         = comm;
   hypre_ParCSRCommPkgNumRecvs(comm_pkg)     = num_recvs;
@@ -1191,6 +1575,9 @@ hypre_MatvecCommPkgCreate ( hypre_ParCSRMatrix *A )
                                   num_cols_offd,
                                   global_num_cols,
                                   apart, comm_pkg );
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   comm_pkg->use_neighbor = 0;
+#endif
 
   HYPRE_ANNOTATE_FUNC_END;
 
@@ -1238,6 +1625,13 @@ hypre_MatvecCommPkgDestroy( hypre_ParCSRCommPkg *comm_pkg )
   hypre_CSRMatrixDestroy(hypre_ParCSRCommPkgMatrixE(comm_pkg));
 #endif
 
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   if (comm_pkg->use_neighbor) {
+      MPIX_Topo_free(comm_pkg->neighbor_topo);
+      MPIX_Topo_free(comm_pkg->neighborT_topo);
+      MPIX_Comm_free(comm_pkg->neighbor_comm);
+   }
+#endif
   hypre_TFree(comm_pkg, HYPRE_MEMORY_HOST);
 
   return hypre_error_flag;
@@ -1269,6 +1663,10 @@ hypre_ParCSRFindExtendCommPkg(MPI_Comm comm,
   hypre_ParCSRCommPkg *new_comm_pkg = hypre_TAlloc(hypre_ParCSRCommPkg, 1, HYPRE_MEMORY_HOST);
 
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   new_comm_pkg->use_neighbor = 0;
+#endif
+
   hypre_assert(apart != NULL);
 
   hypre_ParCSRCommPkgCreateApart(comm, indices, my_first, indices_len, global_num, apart,
                                  new_comm_pkg);
diff --git a/src/parcsr_mv/par_csr_communication.h b/src/parcsr_mv/par_csr_communication.h
index 06ac5dcfed..617eaa3dce 100644
--- a/src/parcsr_mv/par_csr_communication.h
+++ b/src/parcsr_mv/par_csr_communication.h
@@ -8,6 +8,10 @@
 #ifndef HYPRE_PAR_CSR_COMMUNICATION_HEADER
 #define HYPRE_PAR_CSR_COMMUNICATION_HEADER
 
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+#include "mpi_advance.h"
+#endif
+
 /*--------------------------------------------------------------------------
  * hypre_ParCSRCommPkg:
  *   Structure containing information for doing communications
@@ -44,6 +48,9 @@ typedef struct
    void               *recv_data_buffer;
    HYPRE_Int           num_requests;
    hypre_MPI_Request  *requests;
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   MPIX_Request       *Xrequest;
+#endif
 } hypre_ParCSRCommHandle;
 
 typedef hypre_ParCSRCommHandle hypre_ParCSRPersistentCommHandle;
@@ -51,6 +58,11 @@ typedef hypre_ParCSRCommHandle hypre_ParCSRPersistentCommHandle;
 typedef struct _hypre_ParCSRCommPkg
 {
    MPI_Comm             comm;
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   MPIX_Comm           *neighbor_comm;
+   MPIX_Topo           *neighbor_topo;
+   MPIX_Topo           *neighborT_topo;
+#endif
    HYPRE_Int            num_components;
    HYPRE_Int            num_sends;
    HYPRE_Int           *send_procs;
@@ -60,6 +72,11 @@ typedef struct _hypre_ParCSRCommPkg
    HYPRE_Int            num_recvs;
    HYPRE_Int           *recv_procs;
    HYPRE_Int           *recv_vec_starts;
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+   HYPRE_Int            use_neighbor;
+   long                *global_send_indices;
+   long                *global_recv_indices;
+#endif
    /* remote communication information */
    hypre_MPI_Datatype  *send_mpi_types;
    hypre_MPI_Datatype  *recv_mpi_types;
diff --git a/src/parcsr_mv/protos.h b/src/parcsr_mv/protos.h
index 7b29da12d2..a3c027595e 100644
--- a/src/parcsr_mv/protos.h
+++ b/src/parcsr_mv/protos.h
@@ -137,6 +137,12 @@ HYPRE_Int hypre_RangeFillResponseIJDetermineRecvProcs ( void *p_recv_contact_buf
 HYPRE_Int hypre_FillResponseIJDetermineSendProcs ( void *p_recv_contact_buf, HYPRE_Int contact_size,
                                                    HYPRE_Int contact_proc, void *ro, MPI_Comm comm,
                                                    void **p_send_response_buf,
                                                    HYPRE_Int *response_message_size );
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+void hypre_ParCSRCreateCommGraph( HYPRE_BigInt first_col_diag,
+                                  HYPRE_BigInt *col_map_offd,
+                                  MPI_Comm comm,
+                                  hypre_ParCSRCommPkg *comm_pkg );
+#endif
 
 /* numbers.c */
 hypre_NumbersNode *hypre_NumbersNewNode ( void );
diff --git a/src/test/ij.c b/src/test/ij.c
index d8a7ec15b5..4b17491acd 100644
--- a/src/test/ij.c
+++ b/src/test/ij.c
@@ -1650,6 +1650,13 @@ main( hypre_int argc,
         arg_index++;
         gpu_aware_mpi = atoi(argv[arg_index++]);
      }
+#if defined(HYPRE_USING_NODE_AWARE_MPI)
+      else if ( strcmp(argv[arg_index], "-node_aware_lvl_threshold") == 0 )
+      {
+         arg_index++;
+         hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle()) = atoi(argv[arg_index++]);
+      }
+#endif
      else
      {
         arg_index++;
@@ -2708,6 +2715,9 @@ main( hypre_int argc,
      hypre_printf("  -umpire_pinned_pool_size   : pinned memory pool size (GiB)\n");
      hypre_printf("  -umpire_host_pool_size     : host memory pool size (GiB)\n");
      /* end umpire options */
+#endif
+#if defined(HYPRE_USING_NODE_AWARE_MPI)
+      hypre_printf("  -node_aware_lvl_threshold : Min. level in AMG hierarchy to use node-aware MPI\n");
 #endif
   }
diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h
index 9a26c20a7f..790bfb9c55 100644
--- a/src/utilities/_hypre_utilities.h
+++ b/src/utilities/_hypre_utilities.h
@@ -94,6 +94,12 @@ typedef struct
 #if defined(HYPRE_USING_MAGMA)
    magma_queue_t         magma_queue;
 #endif
+
+#if defined(HYPRE_USING_NODE_AWARE_MPI)
+   /* level at which to begin using node aware optimization */
+   HYPRE_Int             node_aware_switchover_threshold;
+#endif
+
 } hypre_Handle;
 
 /* accessor macros to hypre_Handle */
@@ -161,6 +167,10 @@ typedef struct
 #define hypre_HandleMagmaQueue(hypre_handle)  ((hypre_handle) -> magma_queue)
 
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+#define hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle) ((hypre_handle) -> node_aware_switchover_threshold)
+#endif
+
 #endif
 
 /******************************************************************************
  * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
diff --git a/src/utilities/general.c b/src/utilities/general.c
index 71535e51d4..22081087c6 100644
--- a/src/utilities/general.c
+++ b/src/utilities/general.c
@@ -53,6 +53,10 @@ hypre_HandleCreate(void)
 #endif
 #endif
 
+#if defined(HYPRE_USING_NODE_AWARE_MPI)
+   hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle_) = 0;
+#endif
+
   return hypre_handle_;
 }
 
diff --git a/src/utilities/handle.h b/src/utilities/handle.h
index 7ca8c12442..5b2a1436c3 100644
--- a/src/utilities/handle.h
+++ b/src/utilities/handle.h
@@ -78,6 +78,12 @@ typedef struct
 #if defined(HYPRE_USING_MAGMA)
    magma_queue_t         magma_queue;
 #endif
+
+#if defined(HYPRE_USING_NODE_AWARE_MPI)
+   /* level at which to begin using node aware optimization */
+   HYPRE_Int             node_aware_switchover_threshold;
+#endif
+
 } hypre_Handle;
 
 /* accessor macros to hypre_Handle */
@@ -145,4 +151,8 @@ typedef struct
 #define hypre_HandleMagmaQueue(hypre_handle)  ((hypre_handle) -> magma_queue)
 
+#ifdef HYPRE_USING_NODE_AWARE_MPI
+#define hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle) ((hypre_handle) -> node_aware_switchover_threshold)
+#endif
+
 #endif
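
Editorial addendum (not part of the patch): the locality-aware path hands MPI Advance global column indices rather than rank-local ones, since the library needs them to regroup messages by node. The standalone sketch below is illustrative only, not hypre code; it reproduces the index translation hypre_ParCSRCreateCommGraph performs, with toy arrays standing in for a hypre_ParCSRCommPkg on one rank: sent entries are local diag column numbers shifted by first_col_diag, while received entries already carry global ids in col_map_offd.

/* Illustrative, self-contained example of the global-index construction used
 * by hypre_ParCSRCreateCommGraph in this patch. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
   long first_col_diag    = 100;               /* first global column owned by this rank */
   int  send_map_starts[] = {0, 2, 5};         /* two destination ranks */
   int  send_map_elmts[]  = {0, 3, 1, 2, 4};   /* local column indices to pack */
   long col_map_offd[]    = {7, 42, 55};       /* global ids of off-rank columns received */
   int  num_sends = 2, num_recv_elmts = 3;

   int   num_send_elmts      = send_map_starts[num_sends];
   long *global_send_indices = malloc(num_send_elmts * sizeof(long));
   long *global_recv_indices = malloc(num_recv_elmts * sizeof(long));

   /* Sends: shift local diag columns into the global numbering. */
   for (int j = 0; j < num_send_elmts; j++)
   {
      global_send_indices[j] = send_map_elmts[j] + first_col_diag;
   }
   /* Receives: col_map_offd already stores global column ids. */
   for (int j = 0; j < num_recv_elmts; j++)
   {
      global_recv_indices[j] = col_map_offd[j];
   }

   for (int j = 0; j < num_send_elmts; j++)
   {
      printf("send slot %d -> global column %ld\n", j, global_send_indices[j]);
   }
   for (int j = 0; j < num_recv_elmts; j++)
   {
      printf("recv slot %d -> global column %ld\n", j, global_recv_indices[j]);
   }

   free(global_send_indices);
   free(global_recv_indices);
   return 0;
}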