diff --git a/c/mpi/one-sided/Makefile.am b/c/mpi/one-sided/Makefile.am index 3b646ba..42f271f 100644 --- a/c/mpi/one-sided/Makefile.am +++ b/c/mpi/one-sided/Makefile.am @@ -19,7 +19,8 @@ AM_CFLAGS = -I${top_srcdir}/c/util UTILITIES = ../../util/osu_util.c ../../util/osu_util.h \ ../../util/osu_util_mpi.c ../../util/osu_util_mpi.h \ ../../util/osu_util_graph.c ../../util/osu_util_graph.h \ - ../../util/osu_util_papi.c ../../util/osu_util_papi.h + ../../util/osu_util_papi.c ../../util/osu_util_papi.h \ + osu_osc_verify.c if CUDA_KERNELS UTILITIES += ../../util/kernel.cu diff --git a/c/mpi/one-sided/osu_acc_latency.c b/c/mpi/one-sided/osu_acc_latency.c index 6cd90d2..6e788ea 100644 --- a/c/mpi/one-sided/osu_acc_latency.c +++ b/c/mpi/one-sided/osu_acc_latency.c @@ -14,18 +14,39 @@ double t_start = 0.0, t_end = 0.0; char *sbuf=NULL, *win_base=NULL; omb_graph_options_t omb_graph_op; +int validation_error_flag = 0; +int dtype_size; -void print_latency (int, int); -void run_acc_with_lock (int, enum WINDOW); -void run_acc_with_fence (int, enum WINDOW); -void run_acc_with_lock_all (int, enum WINDOW); -void run_acc_with_flush (int, enum WINDOW); -void run_acc_with_flush_local (int, enum WINDOW); -void run_acc_with_pscw (int, enum WINDOW); +void print_latency (int, int, float); +void run_acc_with_lock (int, enum WINDOW, MPI_Datatype, MPI_Op); +void run_acc_with_fence (int, enum WINDOW, MPI_Datatype, MPI_Op); +void run_acc_with_lock_all (int, enum WINDOW, MPI_Datatype, MPI_Op); +void run_acc_with_flush (int, enum WINDOW, MPI_Datatype, MPI_Op); +void run_acc_with_flush_local (int, enum WINDOW, MPI_Datatype, MPI_Op); +void run_acc_with_pscw (int, enum WINDOW, MPI_Datatype, MPI_Op); int main (int argc, char *argv[]) { int po_ret = PO_OKAY; + char *type_name = NULL; + char dtype_name_str[128]; + MPI_Datatype dtype_list[] = { + MPI_SIGNED_CHAR, MPI_UNSIGNED_CHAR, MPI_SHORT, MPI_UNSIGNED_SHORT, + MPI_INT, MPI_UNSIGNED, MPI_LONG_LONG, MPI_UNSIGNED_LONG_LONG, + MPI_FLOAT, MPI_DOUBLE, MPI_LONG_DOUBLE, + MPI_C_FLOAT_COMPLEX, MPI_C_DOUBLE_COMPLEX, MPI_C_LONG_DOUBLE_COMPLEX, + }; + MPI_Op op_list[] = { + MPI_SUM, MPI_PROD, + MPI_LAND, MPI_BAND, + MPI_LOR, MPI_BOR, + MPI_LXOR, MPI_BXOR, + }; + const int nops = sizeof(op_list)/sizeof(op_list[0]); + const int ntypes = sizeof(dtype_list)/sizeof(dtype_list[0]); + int type_name_size = 0; + + #if MPI_VERSION >= 3 options.win = WIN_ALLOCATE; options.sync= FLUSH; @@ -38,10 +59,11 @@ int main (int argc, char *argv[]) options.bench = ONE_SIDED; options.subtype = LAT; options.synctype = ALL_SYNC; + options.show_validation = 1; set_header(HEADER); set_benchmark_name("osu_acc_latency"); - + po_ret = process_options(argc, argv); if (PO_OKAY == po_ret && NONE != options.accel) { @@ -104,28 +126,60 @@ int main (int argc, char *argv[]) } print_header_one_sided(rank, options.win, options.sync); + MPI_Datatype dtype = MPI_INT; + MPI_Op op = MPI_SUM; - switch (options.sync) { - case LOCK: - run_acc_with_lock(rank, options.win); - break; - case PSCW: - run_acc_with_pscw(rank, options.win); - break; - case FENCE: - run_acc_with_fence(rank, options.win); - break; + for (int jtype_test=0; jtype_test= 3 - case LOCK_ALL: - run_acc_with_lock_all(rank, options.win); - break; - case FLUSH_LOCAL: - run_acc_with_flush_local(rank, options.win); - break; - default: - run_acc_with_flush(rank, options.win); - break; + case LOCK_ALL: + run_acc_with_lock_all(rank, options.win, dtype_list[jtype_test], op); + break; + case FLUSH_LOCAL: + run_acc_with_flush_local(rank, options.win, dtype_list[jtype_test], op); + break; + default: + run_acc_with_flush(rank, options.win, dtype_list[jtype_test], op); + break; #endif + } + } } + + if (options.validate) { + for (int jrank_print=0; jrank_print<2; jrank_print++) { + if (jrank_print == rank) { + printf("-------------------------------------------\n"); + printf("Atomic Data Validation results for Rank=%d:\n",rank); + atomic_data_validation_print_summary(); + printf("-------------------------------------------\n"); + } + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + } } MPI_CHECK(MPI_Finalize()); @@ -140,28 +194,50 @@ int main (int argc, char *argv[]) return EXIT_SUCCESS; } -void print_latency(int rank, int size) + +void print_latency(int rank, int size, float latency_factor) { - if (rank == 0) { + char *validation_string; + if (rank != 0) return; + if ( options.validate ) { + if (2 & validation_error_flag) validation_string = "skipped"; + else if (1 & validation_error_flag) validation_string = "failed"; + else validation_string = "passed"; + + fprintf(stdout, "%-*d%*.*f%*s\n", 10, size, FIELD_WIDTH, + FLOAT_PRECISION, (t_end - t_start) * 1.0e6 * latency_factor + / options.iterations, + FIELD_WIDTH, validation_string); + fflush(stdout); + validation_error_flag = 0; + return; + } else { fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH, - FLOAT_PRECISION, (t_end - t_start) * 1.0e6 / options.iterations); + FLOAT_PRECISION, (t_end - t_start) * 1.0e6 * latency_factor + / options.iterations); fflush(stdout); + return; } + } + #if MPI_VERSION >= 3 /*Run ACC with flush */ -void run_acc_with_flush (int rank, enum WINDOW type) +void run_acc_with_flush (int rank, enum WINDOW type, MPI_Datatype data_type, MPI_Op op) { double t_graph_start, t_graph_end; omb_graph_data_t *omb_graph_data = NULL; int papi_eventset = OMB_PAPI_NULL; - int size, i; + int size, i, count; MPI_Aint disp = 0; MPI_Win win; omb_papi_init(&papi_eventset); for (size = options.min_message_size; size <= options.max_message_size; size = (size ? size * 2 : 1)) { + count = size / dtype_size; + if (count == 0) continue; + allocate_memory_one_sided(rank, &sbuf, &win_base, size, type, &win); if (type == WIN_DYNAMIC) { @@ -177,6 +253,10 @@ void run_acc_with_flush (int rank, enum WINDOW type) if (rank == 0) { MPI_CHECK(MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win)); for (i = 0; i < options.skip + options.iterations; i++) { + if (i == 0 && options.validate) { + atomic_data_validation_setup(data_type, rank, sbuf, size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + } if (i == options.skip) { omb_papi_start(&papi_eventset); t_start = MPI_Wtime (); @@ -184,7 +264,7 @@ void run_acc_with_flush (int rank, enum WINDOW type) if (i >= options.skip) { t_graph_start = MPI_Wtime(); } - MPI_CHECK(MPI_Accumulate(sbuf, size, MPI_CHAR, 1, disp, size, MPI_CHAR, MPI_SUM, win)); + MPI_CHECK(MPI_Accumulate(sbuf, count, data_type, 1, disp, count, data_type, op, win)); MPI_CHECK(MPI_Win_flush(1, win)); if (i >= options.skip) { t_graph_end = MPI_Wtime(); @@ -193,15 +273,38 @@ void run_acc_with_flush (int rank, enum WINDOW type) t_graph_start) * 1.0e6; } } + if (i == 0 && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + // remote validation occurs + // recv the remote validation status, since rank 0 prints + MPI_CHECK(MPI_Recv( + &validation_error_flag, 1, MPI_INT, 1, 0, + MPI_COMM_WORLD, MPI_STATUS_IGNORE)); + } } t_end = MPI_Wtime (); MPI_CHECK(MPI_Win_unlock(1, win)); + } else { + atomic_data_validation_setup(data_type, rank, win_base, size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + // remote operation occurs + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check( + data_type, op, rank, win_base, NULL, + size, 1, 0, &validation_error_flag); + // this send serves both to inform rank 0 about validation + // so that rank 0 can do all printing, and to block rank 0 from + // doing the next atomic operation while we are validating. + MPI_CHECK(MPI_Send( + &validation_error_flag, 1, MPI_INT, 0, 0, + MPI_COMM_WORLD)); + } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); omb_papi_stop_and_print(&papi_eventset, size); - print_latency(rank, size); + print_latency(rank, size, 1.0); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations; @@ -217,17 +320,20 @@ void run_acc_with_flush (int rank, enum WINDOW type) } /*Run ACC with flush local*/ -void run_acc_with_flush_local (int rank, enum WINDOW type) +void run_acc_with_flush_local (int rank, enum WINDOW type, MPI_Datatype data_type, MPI_Op op) { double t_graph_start, t_graph_end; omb_graph_data_t *omb_graph_data = NULL; int papi_eventset = OMB_PAPI_NULL; - int size, i; + int size, i, count; MPI_Aint disp = 0; MPI_Win win; omb_papi_init(&papi_eventset); for (size = options.min_message_size; size <= options.max_message_size; size = (size ? size * 2 : 1)) { + count = size / dtype_size; + if (count == 0) continue; + allocate_memory_one_sided(rank, &sbuf, &win_base, size, type, &win); if (type == WIN_DYNAMIC) { @@ -243,6 +349,10 @@ void run_acc_with_flush_local (int rank, enum WINDOW type) if (rank == 0) { MPI_CHECK(MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win)); for (i = 0; i < options.skip + options.iterations; i++) { + if (i == 0 && options.validate) { + atomic_data_validation_setup(data_type, rank, sbuf, size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + } if (i == options.skip) { omb_papi_start(&papi_eventset); t_start = MPI_Wtime (); @@ -250,7 +360,7 @@ void run_acc_with_flush_local (int rank, enum WINDOW type) if (i >= options.skip) { t_graph_start = MPI_Wtime(); } - MPI_CHECK(MPI_Accumulate(sbuf, size, MPI_CHAR, 1, disp, size, MPI_CHAR, MPI_SUM, win)); + MPI_CHECK(MPI_Accumulate(sbuf, count, data_type, 1, disp, count, data_type, op, win)); MPI_CHECK(MPI_Win_flush_local(1, win)); if (i >= options.skip) { t_graph_end = MPI_Wtime(); @@ -259,15 +369,38 @@ void run_acc_with_flush_local (int rank, enum WINDOW type) t_graph_start) * 1.0e6; } } + if (i == 0 && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + // remote validation occurs + // recv the remote validation status, since rank 0 prints + MPI_CHECK(MPI_Recv( + &validation_error_flag, 1, MPI_INT, 1, 0, + MPI_COMM_WORLD, MPI_STATUS_IGNORE)); + } } t_end = MPI_Wtime (); MPI_CHECK(MPI_Win_unlock(1, win)); + } else { + atomic_data_validation_setup(data_type, rank, win_base, size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + // remote operation occurs + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check( + data_type, op, rank, win_base, NULL, + size, 1, 0, &validation_error_flag); + // this send serves both to inform rank 0 about validation + // so that rank 0 can do all printing, and to block rank 0 from + // doing the next atomic operation while we are validating. + MPI_CHECK(MPI_Send( + &validation_error_flag, 1, MPI_INT, 0, 0, + MPI_COMM_WORLD)); + } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); omb_papi_stop_and_print(&papi_eventset, size); - print_latency(rank, size); + print_latency(rank, size, 1.0); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations; @@ -283,17 +416,19 @@ void run_acc_with_flush_local (int rank, enum WINDOW type) } /*Run ACC with Lock_all/unlock_all */ -void run_acc_with_lock_all (int rank, enum WINDOW type) +void run_acc_with_lock_all (int rank, enum WINDOW type, MPI_Datatype data_type, MPI_Op op) { double t_graph_start, t_graph_end; omb_graph_data_t *omb_graph_data = NULL; int papi_eventset = OMB_PAPI_NULL; - int size, i; + int size, i, count; MPI_Aint disp = 0; MPI_Win win; omb_papi_init(&papi_eventset); for (size = options.min_message_size; size <= options.max_message_size; size = (size ? size * 2 : 1)) { + count = size / dtype_size; + if (count == 0) continue; allocate_memory_one_sided(rank, &sbuf, &win_base, size, type, &win); if (type == WIN_DYNAMIC) { @@ -309,6 +444,10 @@ void run_acc_with_lock_all (int rank, enum WINDOW type) &omb_graph_op, size, options.iterations); if (rank == 0) { for (i = 0; i < options.skip + options.iterations; i++) { + if (i == 0 && options.validate) { + atomic_data_validation_setup(data_type, rank, sbuf, size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + } if (i == options.skip) { omb_papi_start(&papi_eventset); t_start = MPI_Wtime (); @@ -317,7 +456,7 @@ void run_acc_with_lock_all (int rank, enum WINDOW type) t_graph_start = MPI_Wtime(); } MPI_CHECK(MPI_Win_lock_all(0, win)); - MPI_CHECK(MPI_Accumulate(sbuf, size, MPI_CHAR, 1, disp, size, MPI_CHAR, MPI_SUM, win)); + MPI_CHECK(MPI_Accumulate(sbuf, count, data_type, 1, disp, count, data_type, op, win)); MPI_CHECK(MPI_Win_unlock_all(win)); if (i >= options.skip) { t_graph_end = MPI_Wtime(); @@ -326,14 +465,36 @@ void run_acc_with_lock_all (int rank, enum WINDOW type) t_graph_start) * 1.0e6; } } + if (i == 0 && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + // remote validation occurs + // recv the remote validation status, since rank 0 prints + MPI_CHECK(MPI_Recv( + &validation_error_flag, 1, MPI_INT, 1, 0, + MPI_COMM_WORLD, MPI_STATUS_IGNORE)); + } } t_end = MPI_Wtime (); - } + } else { + atomic_data_validation_setup(data_type, rank, win_base, size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + // remote operation occurs + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check( + data_type, op, rank, win_base, NULL, + size, 1, 0, &validation_error_flag); + // this send serves both to inform rank 0 about validation + // so that rank 0 can do all printing, and to block rank 0 from + // doing the next atomic operation while we are validating. + MPI_CHECK(MPI_Send( + &validation_error_flag, 1, MPI_INT, 0, 0, + MPI_COMM_WORLD)); + } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); omb_papi_stop_and_print(&papi_eventset, size); - print_latency(rank, size); + print_latency(rank, size, 1.0); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations; @@ -350,17 +511,20 @@ void run_acc_with_lock_all (int rank, enum WINDOW type) #endif /*Run ACC with Lock/unlock */ -void run_acc_with_lock(int rank, enum WINDOW type) +void run_acc_with_lock(int rank, enum WINDOW type, MPI_Datatype data_type, MPI_Op op) { double t_graph_start, t_graph_end; omb_graph_data_t *omb_graph_data = NULL; int papi_eventset = OMB_PAPI_NULL; - int size, i; + int size, i, count; MPI_Aint disp = 0; MPI_Win win; omb_papi_init(&papi_eventset); for (size = options.min_message_size; size <= options.max_message_size; size = (size ? size * 2 : 1)) { + count = size / dtype_size; + if (count == 0) continue; + allocate_memory_one_sided(rank, &sbuf, &win_base, size, type, &win); #if MPI_VERSION >= 3 @@ -377,6 +541,10 @@ void run_acc_with_lock(int rank, enum WINDOW type) &omb_graph_op, size, options.iterations); if (rank == 0) { for (i = 0; i < options.skip + options.iterations; i++) { + if (i == 0 && options.validate) { + atomic_data_validation_setup(data_type, rank, sbuf, size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + } if (i == options.skip) { omb_papi_start(&papi_eventset); t_start = MPI_Wtime (); @@ -385,7 +553,7 @@ void run_acc_with_lock(int rank, enum WINDOW type) t_graph_start = MPI_Wtime(); } MPI_CHECK(MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win)); - MPI_CHECK(MPI_Accumulate(sbuf, size, MPI_CHAR, 1, disp, size, MPI_CHAR, MPI_SUM, win)); + MPI_CHECK(MPI_Accumulate(sbuf, count, data_type, 1, disp, count, data_type, op, win)); MPI_CHECK(MPI_Win_unlock(1, win)); if (i >= options.skip) { t_graph_end = MPI_Wtime(); @@ -394,14 +562,38 @@ void run_acc_with_lock(int rank, enum WINDOW type) t_graph_start) * 1.0e6; } } + if (i == 0 && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + // remote validation occurs + // recv the remote validation status, since rank 0 prints + MPI_CHECK(MPI_Recv( + &validation_error_flag, 1, MPI_INT, 1, 0, + MPI_COMM_WORLD, MPI_STATUS_IGNORE)); + } + } t_end = MPI_Wtime (); + } else { + atomic_data_validation_setup(data_type, rank, win_base, size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + // remote operation occurs + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check( + data_type, op, rank, win_base, NULL, + size, 1, 0, &validation_error_flag); + // this send serves both to inform rank 0 about validation + // so that rank 0 can do all printing, and to block rank 0 from + // doing the next atomic operation while we are validating. + MPI_CHECK(MPI_Send( + &validation_error_flag, 1, MPI_INT, 0, 0, + MPI_COMM_WORLD)); + } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); omb_papi_stop_and_print(&papi_eventset, size); - print_latency(rank, size); + print_latency(rank, size, 1.0); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations; @@ -417,17 +609,20 @@ void run_acc_with_lock(int rank, enum WINDOW type) } /*Run ACC with Fence */ -void run_acc_with_fence(int rank, enum WINDOW type) +void run_acc_with_fence(int rank, enum WINDOW type, MPI_Datatype data_type, MPI_Op op) { double t_graph_start, t_graph_end; omb_graph_data_t *omb_graph_data = NULL; int papi_eventset = OMB_PAPI_NULL; - int size, i; + int size, i, count; MPI_Aint disp = 0; MPI_Win win; omb_papi_init(&papi_eventset); for (size = options.min_message_size; size <= options.max_message_size; size = (size ? size * 2 : 1)) { + count = size / dtype_size; + if (count == 0) continue; + allocate_memory_one_sided(rank, &sbuf, &win_base, size, type, &win); #if MPI_VERSION >= 3 @@ -447,6 +642,10 @@ void run_acc_with_fence(int rank, enum WINDOW type) if (rank == 0) { for (i = 0; i < options.skip + options.iterations; i++) { + if (options.validate) { + atomic_data_validation_setup(data_type, rank, sbuf, size); + atomic_data_validation_setup(data_type, rank, win_base, size); + } if (i == options.skip) { omb_papi_start(&papi_eventset); t_start = MPI_Wtime (); @@ -455,7 +654,8 @@ void run_acc_with_fence(int rank, enum WINDOW type) t_graph_start = MPI_Wtime(); } MPI_CHECK(MPI_Win_fence(0, win)); - MPI_CHECK(MPI_Accumulate(sbuf, size, MPI_CHAR, 1, disp, size, MPI_CHAR, MPI_SUM, win)); + MPI_CHECK(MPI_Accumulate(sbuf, count, data_type, 1, disp, count, data_type, op, win)); + MPI_CHECK(MPI_Win_fence(0, win)); MPI_CHECK(MPI_Win_fence(0, win)); if (i >= options.skip) { @@ -465,27 +665,39 @@ void run_acc_with_fence(int rank, enum WINDOW type) t_graph_start) * 1.0e6 / 2.0; } } + if (options.validate) { + atomic_data_validation_check( + data_type, op, rank, win_base, NULL, + size, 1, 0, &validation_error_flag); + } } t_end = MPI_Wtime (); } else { for (i = 0; i < options.skip + options.iterations; i++) { + if (options.validate) { + atomic_data_validation_setup(data_type, rank, sbuf, size); + atomic_data_validation_setup(data_type, rank, win_base, size); + } if (i == options.skip) { omb_papi_start(&papi_eventset); } MPI_CHECK(MPI_Win_fence(0, win)); MPI_CHECK(MPI_Win_fence(0, win)); - MPI_CHECK(MPI_Accumulate(sbuf, size, MPI_CHAR, 0, disp, size, MPI_CHAR, MPI_SUM, win)); + MPI_CHECK(MPI_Accumulate(sbuf, count, data_type, 0, disp, count, data_type, op, win)); MPI_CHECK(MPI_Win_fence(0, win)); } + if (options.validate) { + atomic_data_validation_check( + data_type, op, rank, win_base, NULL, + size, 1, 0, &validation_error_flag); + } } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); omb_papi_stop_and_print(&papi_eventset, size); + print_latency(rank, size, 0.5); if (rank == 0) { - fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH, - FLOAT_PRECISION, (t_end - t_start) * 1.0e6 / options.iterations / 2); - fflush(stdout); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations / 2; @@ -503,12 +715,12 @@ void run_acc_with_fence(int rank, enum WINDOW type) } /*Run ACC with Post/Start/Complete/Wait */ -void run_acc_with_pscw(int rank, enum WINDOW type) +void run_acc_with_pscw(int rank, enum WINDOW type, MPI_Datatype data_type, MPI_Op op) { double t_graph_start, t_graph_end; omb_graph_data_t *omb_graph_data = NULL; int papi_eventset = OMB_PAPI_NULL; - int destrank, size, i; + int destrank, size, i, count; MPI_Aint disp = 0; MPI_Win win; MPI_Group comm_group, group; @@ -517,6 +729,9 @@ void run_acc_with_pscw(int rank, enum WINDOW type) omb_papi_init(&papi_eventset); for (size = options.min_message_size; size <= options.max_message_size; size = (size ? size * 2 : 1)) { + count = size / dtype_size; + if (count == 0) continue; + allocate_memory_one_sided(rank, &sbuf, &win_base, size, type, &win); #if MPI_VERSION >= 3 @@ -539,6 +754,11 @@ void run_acc_with_pscw(int rank, enum WINDOW type) MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); for (i = 0; i < options.skip + options.iterations; i++) { + if (options.validate) { + atomic_data_validation_setup(data_type, rank, sbuf, size); + atomic_data_validation_setup(data_type, rank, win_base, size); + } + MPI_CHECK(MPI_Win_start (group, 0, win)); if (i == options.skip) { omb_papi_start(&papi_eventset); @@ -547,7 +767,8 @@ void run_acc_with_pscw(int rank, enum WINDOW type) if (i >= options.skip) { t_graph_start = MPI_Wtime(); } - MPI_CHECK(MPI_Accumulate(sbuf, size, MPI_CHAR, 1, disp, size, MPI_CHAR, MPI_SUM, win)); + MPI_CHECK(MPI_Accumulate(sbuf, count, data_type, 1, disp, count, data_type, op, win)); + MPI_CHECK(MPI_Win_complete(win)); MPI_CHECK(MPI_Win_post(group, 0, win)); MPI_CHECK(MPI_Win_wait(win)); @@ -558,6 +779,11 @@ void run_acc_with_pscw(int rank, enum WINDOW type) t_graph_start) * 1.0e6 / 2.0; } } + if (options.validate) { + atomic_data_validation_check( + data_type, op, rank, win_base, NULL, + size, 1, 0, &validation_error_flag); + } } t_end = MPI_Wtime (); @@ -569,24 +795,33 @@ void run_acc_with_pscw(int rank, enum WINDOW type) MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); for (i = 0; i < options.skip + options.iterations; i++) { + if (options.validate) { + atomic_data_validation_setup(data_type, rank, sbuf, size); + atomic_data_validation_setup(data_type, rank, win_base, size); + } + if (i == options.skip) { omb_papi_start(&papi_eventset); } MPI_CHECK(MPI_Win_post(group, 0, win)); MPI_CHECK(MPI_Win_wait(win)); MPI_CHECK(MPI_Win_start(group, 0, win)); - MPI_CHECK(MPI_Accumulate(sbuf, size, MPI_CHAR, 0, disp, size, MPI_CHAR, MPI_SUM, win)); + MPI_CHECK(MPI_Accumulate(sbuf, count, data_type, 0, disp, count, data_type, op, win)); + MPI_CHECK(MPI_Win_complete(win)); + if (options.validate) { + atomic_data_validation_check( + data_type, op, rank, win_base, NULL, + size, 1, 0, &validation_error_flag); + } } } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); omb_papi_stop_and_print(&papi_eventset, size); + print_latency(rank, size, 0.5); if (rank == 0) { - fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH, - FLOAT_PRECISION, (t_end - t_start) * 1.0e6 / options.iterations / 2); - fflush(stdout); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations / 2; diff --git a/c/mpi/one-sided/osu_cas_latency.c b/c/mpi/one-sided/osu_cas_latency.c index f81e4cb..3600832 100644 --- a/c/mpi/one-sided/osu_cas_latency.c +++ b/c/mpi/one-sided/osu_cas_latency.c @@ -14,27 +14,62 @@ double t_start = 0.0, t_end = 0.0; uint64_t *sbuf=NULL, *tbuf=NULL, *cbuf=NULL, *win_base=NULL; omb_graph_options_t omb_graph_op; +int validation_error_flag = 0; +int dtype_size; -void print_latency (int, int); -void run_cas_with_lock (int, enum WINDOW); -void run_cas_with_fence (int, enum WINDOW); -void run_cas_with_lock_all (int, enum WINDOW); -void run_cas_with_flush (int, enum WINDOW); -void run_cas_with_flush_local (int, enum WINDOW); -void run_cas_with_pscw (int, enum WINDOW); + +void print_latency (int, int, float); +void run_cas_with_lock (int, enum WINDOW, MPI_Datatype); +void run_cas_with_fence (int, enum WINDOW, MPI_Datatype); +void run_cas_with_lock_all (int, enum WINDOW, MPI_Datatype); +void run_cas_with_flush (int, enum WINDOW, MPI_Datatype); +void run_cas_with_flush_local (int, enum WINDOW, MPI_Datatype); +void run_cas_with_pscw (int, enum WINDOW, MPI_Datatype); + +static int mpi_types_count() { + return 4; +} +static MPI_Datatype mpi_types_index_to_value(int type_index) { + switch(type_index) + { + case 0: return MPI_INT8_T; + case 1: return MPI_UINT8_T; + case 2: return MPI_INT16_T; + case 3: return MPI_UINT16_T; + default: return MPI_DATATYPE_NULL; + } +} +static const char* mpi_types_index_to_name(int type_index) { + switch (type_index) { + case 0: return "MPI_INT8_T"; + case 1: return "MPI_UINT8_T"; + case 2: return "MPI_INT16_T"; + case 3: return "MPI_UINT16_T"; + default: return "Error: MPI_Datatype not recognized."; + } +} int main (int argc, char *argv[]) { int rank,nprocs; int po_ret = PO_OKAY; + MPI_Datatype dtype_list[] = { + MPI_SIGNED_CHAR, MPI_UNSIGNED_CHAR, MPI_SHORT, MPI_UNSIGNED_SHORT, + MPI_INT, MPI_UNSIGNED, MPI_LONG_LONG, MPI_UNSIGNED_LONG_LONG, + MPI_FLOAT, MPI_DOUBLE, MPI_LONG_DOUBLE, + }; + const int ntypes = sizeof(dtype_list)/sizeof(dtype_list[0]); + int type_name_size = 0; + options.win = WIN_ALLOCATE; options.sync = FLUSH; options.bench = ONE_SIDED; options.subtype = LAT; options.synctype = ALL_SYNC; options.max_message_size = 1 << 20; - + options.show_validation = 1; + set_header(HEADER); set_benchmark_name("osu_cas_latency"); @@ -97,27 +132,47 @@ int main (int argc, char *argv[]) return EXIT_FAILURE; } - print_header_one_sided(rank, options.win, options.sync); + for (int jtype_test=0; jtype_test= options.skip) { t_graph_start = MPI_Wtime(); } - MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, MPI_LONG_LONG, 1, disp, win)); + MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, data_type, 1, disp, win)); MPI_CHECK(MPI_Win_flush(1, win)); if (i >= options.skip) { t_graph_end = MPI_Wtime(); @@ -183,26 +262,43 @@ void run_cas_with_flush (int rank, enum WINDOW type) t_graph_start) * 1.0e6; } } + if (i == options.skip && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check(data_type, (MPI_Op)-1, rank, + win_base, tbuf, options.max_message_size, 0, 1, + &validation_error_flag ); + } } t_end = MPI_Wtime (); MPI_CHECK(MPI_Win_unlock(1, win)); + } else if (options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, cbuf, options.max_message_size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check(data_type, (MPI_Op)-1, rank, + win_base, tbuf, options.max_message_size, 1, 0, + &validation_error_flag ); } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); omb_papi_stop_and_print(&papi_eventset, 8); - print_latency(rank, 8); + print_latency(rank, dtype_size, 1); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations; } omb_graph_plot(&omb_graph_op, benchmark_name); omb_graph_free_data_buffers(&omb_graph_op); omb_papi_free(&papi_eventset); - free_atomic_memory (sbuf, win_base, tbuf, cbuf, type, win, rank); + free_atomic_memory (sbuf, win_base, tbuf, cbuf, win_type, win, rank); } /*Run CAS with Lock_all/unlock_all */ -void run_cas_with_lock_all (int rank, enum WINDOW type) +void run_cas_with_lock_all (int rank, enum WINDOW win_type, MPI_Datatype data_type) { int i = 0; double t_graph_start = 0.0, t_graph_end = 0.0; @@ -217,15 +313,24 @@ void run_cas_with_lock_all (int rank, enum WINDOW type) omb_papi_init(&papi_eventset); allocate_atomic_memory(rank, (char **)&sbuf, (char **)&tbuf, (char **) &cbuf, (char **)&win_base, - options.max_message_size, type, &win); + options.max_message_size, win_type, &win); if (rank == 0) { - if (type == WIN_DYNAMIC) { + if (win_type == WIN_DYNAMIC) { disp = disp_remote; } for (i = 0; i < options.skip + options.iterations; i++) { + if (i == options.skip && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, cbuf, options.max_message_size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + } + if (i == options.skip) { omb_papi_start(&papi_eventset); t_start = MPI_Wtime (); @@ -234,7 +339,7 @@ void run_cas_with_lock_all (int rank, enum WINDOW type) t_graph_start = MPI_Wtime(); } MPI_CHECK(MPI_Win_lock_all(0, win)); - MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, MPI_LONG_LONG, 1, disp, win)); + MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, data_type, 1, disp, win)); MPI_CHECK(MPI_Win_unlock_all(win)); if (i >= options.skip) { t_graph_end = MPI_Wtime(); @@ -243,25 +348,42 @@ void run_cas_with_lock_all (int rank, enum WINDOW type) t_graph_start) * 1.0e6; } } + if (i == options.skip && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check(data_type, (MPI_Op)-1, rank, + win_base, tbuf, options.max_message_size, 0, 1, + &validation_error_flag ); + } } t_end = MPI_Wtime (); + } else if (options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, cbuf, options.max_message_size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check(data_type, (MPI_Op)-1, rank, + win_base, tbuf, options.max_message_size, 1, 0, + &validation_error_flag ); } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); omb_papi_stop_and_print(&papi_eventset, 8); - print_latency(rank, 8); + print_latency(rank, dtype_size, 1); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations; } omb_graph_plot(&omb_graph_op, benchmark_name); omb_graph_free_data_buffers(&omb_graph_op); omb_papi_free(&papi_eventset); - free_atomic_memory (sbuf, win_base, tbuf, cbuf, type, win, rank); + free_atomic_memory (sbuf, win_base, tbuf, cbuf, win_type, win, rank); } /*Run CAS with flush */ -void run_cas_with_flush_local (int rank, enum WINDOW type) +void run_cas_with_flush_local (int rank, enum WINDOW win_type, MPI_Datatype data_type) { int i = 0; double t_graph_start = 0.0, t_graph_end = 0.0; @@ -276,17 +398,26 @@ void run_cas_with_flush_local (int rank, enum WINDOW type) omb_papi_init(&papi_eventset); allocate_atomic_memory(rank, (char **)&sbuf, (char **)&tbuf, (char **) &cbuf, (char **)&win_base, - options.max_message_size, type, &win); + options.max_message_size, win_type, &win); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); if (rank == 0) { - if (type == WIN_DYNAMIC) { + if (win_type == WIN_DYNAMIC) { disp = disp_remote; } MPI_CHECK(MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win)); for (i = 0; i < options.skip + options.iterations; i++) { + if (i == options.skip && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, cbuf, options.max_message_size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + } + if (i == options.skip) { omb_papi_start(&papi_eventset); t_start = MPI_Wtime (); @@ -294,7 +425,7 @@ void run_cas_with_flush_local (int rank, enum WINDOW type) if (i >= options.skip) { t_graph_start = MPI_Wtime(); } - MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, MPI_LONG_LONG, 1, disp, win)); + MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, data_type, 1, disp, win)); MPI_CHECK(MPI_Win_flush_local(1, win)); if (i >= options.skip) { t_graph_end = MPI_Wtime(); @@ -303,26 +434,44 @@ void run_cas_with_flush_local (int rank, enum WINDOW type) t_graph_start) * 1.0e6; } } + if (i == options.skip && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check(data_type, (MPI_Op)-1, rank, + win_base, tbuf, options.max_message_size, 0, 1, + &validation_error_flag ); + } + } t_end = MPI_Wtime (); MPI_CHECK(MPI_Win_unlock(1, win)); + } else if (options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, cbuf, options.max_message_size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check(data_type, (MPI_Op)-1, rank, + win_base, tbuf, options.max_message_size, 1, 0, + &validation_error_flag ); } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); omb_papi_stop_and_print(&papi_eventset, 8); - print_latency(rank, 8); + print_latency(rank, dtype_size, 1); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations; } omb_graph_plot(&omb_graph_op, benchmark_name); omb_graph_free_data_buffers(&omb_graph_op); omb_papi_free(&papi_eventset); - free_atomic_memory (sbuf, win_base, tbuf, cbuf, type, win, rank); + free_atomic_memory (sbuf, win_base, tbuf, cbuf, win_type, win, rank); } /*Run CAS with Lock/unlock */ -void run_cas_with_lock(int rank, enum WINDOW type) +void run_cas_with_lock(int rank, enum WINDOW win_type, MPI_Datatype data_type) { int i = 0; double t_graph_start = 0, t_graph_end = 0; @@ -330,6 +479,7 @@ void run_cas_with_lock(int rank, enum WINDOW type) int papi_eventset = OMB_PAPI_NULL; MPI_Aint disp = 0; MPI_Win win; + int dtype_size; omb_graph_op.number_of_graphs = 0; omb_graph_allocate_and_get_data_buffer(&omb_graph_data, @@ -337,14 +487,25 @@ void run_cas_with_lock(int rank, enum WINDOW type) omb_papi_init(&papi_eventset); allocate_atomic_memory(rank, (char **)&sbuf, (char **)&tbuf, (char **) &cbuf, (char **)&win_base, - options.max_message_size, type, &win); + options.max_message_size, win_type, &win); + + MPI_CHECK(MPI_Type_size(data_type, &dtype_size)); if (rank == 0) { - if (type == WIN_DYNAMIC) { + if (win_type == WIN_DYNAMIC) { disp = disp_remote; } for (i = 0; i < options.skip + options.iterations; i++) { + if (i == options.skip && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, cbuf, options.max_message_size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + } + if (i == options.skip) { omb_papi_start(&papi_eventset); t_start = MPI_Wtime (); @@ -353,7 +514,7 @@ void run_cas_with_lock(int rank, enum WINDOW type) t_graph_start = MPI_Wtime(); } MPI_CHECK(MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 1, 0, win)); - MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, MPI_LONG_LONG, 1, disp, win)); + MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, data_type, 1, disp, win)); MPI_CHECK(MPI_Win_unlock(1, win)); if (i >= options.skip) { t_graph_end = MPI_Wtime(); @@ -362,25 +523,42 @@ void run_cas_with_lock(int rank, enum WINDOW type) t_graph_start) * 1.0e6; } } + if (i == options.skip && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check(data_type, (MPI_Op)-1, rank, + win_base, tbuf, options.max_message_size, 0, 1, + &validation_error_flag ); + } } t_end = MPI_Wtime (); + } else if (options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, cbuf, options.max_message_size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check(data_type, (MPI_Op)-1, rank, + win_base, tbuf, options.max_message_size, 1, 0, + &validation_error_flag ); } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); omb_papi_stop_and_print(&papi_eventset, 8); - print_latency(rank, 8); + print_latency(rank, dtype_size, 1); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations; } omb_graph_plot(&omb_graph_op, benchmark_name); omb_graph_free_data_buffers(&omb_graph_op); omb_papi_free(&papi_eventset); - free_atomic_memory (sbuf, win_base, tbuf, cbuf, type, win, rank); + free_atomic_memory (sbuf, win_base, tbuf, cbuf, win_type, win, rank); } /*Run CAS with Fence */ -void run_cas_with_fence(int rank, enum WINDOW type) +void run_cas_with_fence(int rank, enum WINDOW win_type, MPI_Datatype data_type) { int i = 0; double t_graph_start = 0.0, t_graph_end = 0.0; @@ -397,9 +575,9 @@ void run_cas_with_fence(int rank, enum WINDOW type) allocate_atomic_memory(rank, (char **)&sbuf, (char **)&tbuf, (char **) &cbuf, (char **)&win_base, - options.max_message_size, type, &win); + options.max_message_size, win_type, &win); - if (type == WIN_DYNAMIC) { + if (win_type == WIN_DYNAMIC) { disp = disp_remote; } if (rank == 0) { @@ -409,10 +587,16 @@ void run_cas_with_fence(int rank, enum WINDOW type) t_start = MPI_Wtime (); } if (i >= options.skip) { + if (options.validate) { + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, cbuf, options.max_message_size); + } t_graph_start = MPI_Wtime(); } MPI_CHECK(MPI_Win_fence(0, win)); - MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, MPI_LONG_LONG, 1, disp, win)); + MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, data_type, 1, disp, win)); MPI_CHECK(MPI_Win_fence(0, win)); MPI_CHECK(MPI_Win_fence(0, win)); if (i >= options.skip) { @@ -421,6 +605,11 @@ void run_cas_with_fence(int rank, enum WINDOW type) omb_graph_data->data[i - options.skip] = (t_graph_end - t_graph_start) * 1.0e6 / 2.0; } + if (options.validate) { + atomic_data_validation_check(data_type, (MPI_Op)-1, rank, + win_base, tbuf, options.max_message_size, 0, 1, + &validation_error_flag); + } } } t_end = MPI_Wtime (); @@ -429,21 +618,28 @@ void run_cas_with_fence(int rank, enum WINDOW type) if (i == options.skip) { omb_papi_start(&papi_eventset); } + if (i >= options.skip && options.validate) { + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, cbuf, options.max_message_size); + } MPI_CHECK(MPI_Win_fence(0, win)); MPI_CHECK(MPI_Win_fence(0, win)); - MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, MPI_LONG_LONG, 0, disp, win)); + MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, data_type, 0, disp, win)); MPI_CHECK(MPI_Win_fence(0, win)); + if (i >= options.skip && options.validate) { + atomic_data_validation_check(data_type, (MPI_Op)-1, rank, + win_base, tbuf, options.max_message_size, 0, 1, + &validation_error_flag); + } } } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); omb_papi_stop_and_print(&papi_eventset, 8); - if (rank == 0) { - fprintf(stdout, "%-*d%*.*f\n", 10, 8, FIELD_WIDTH, - FLOAT_PRECISION, (t_end - t_start) * 1.0e6 / options.iterations / 2); - fflush(stdout); - } + print_latency(rank, dtype_size, 0.5); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations / 2; @@ -451,11 +647,11 @@ void run_cas_with_fence(int rank, enum WINDOW type) omb_graph_plot(&omb_graph_op, benchmark_name); omb_graph_free_data_buffers(&omb_graph_op); omb_papi_free(&papi_eventset); - free_atomic_memory (sbuf, win_base, tbuf, cbuf, type, win, rank); + free_atomic_memory (sbuf, win_base, tbuf, cbuf, win_type, win, rank); } /*Run CAS with Post/Start/Complete/Wait */ -void run_cas_with_pscw(int rank, enum WINDOW type) +void run_cas_with_pscw(int rank, enum WINDOW win_type, MPI_Datatype data_type) { int destrank, i; double t_graph_start = 0.0, t_graph_end = 0.0; @@ -463,7 +659,6 @@ void run_cas_with_pscw(int rank, enum WINDOW type) int papi_eventset = OMB_PAPI_NULL; MPI_Aint disp = 0; MPI_Win win; - MPI_Group comm_group, group; MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &comm_group)); @@ -473,9 +668,9 @@ void run_cas_with_pscw(int rank, enum WINDOW type) omb_papi_init(&papi_eventset); allocate_atomic_memory(rank, (char **)&sbuf, (char **)&tbuf, (char **) &cbuf, (char **)&win_base, - options.max_message_size, type, &win); + options.max_message_size, win_type, &win); - if (type == WIN_DYNAMIC) { + if (win_type == WIN_DYNAMIC) { disp = disp_remote; } @@ -486,6 +681,13 @@ void run_cas_with_pscw(int rank, enum WINDOW type) MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); for (i = 0; i < options.skip + options.iterations; i++) { + if (i >= options.skip && options.validate) { + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, cbuf, options.max_message_size); + } + MPI_CHECK(MPI_Win_start (group, 0, win)); if (i == options.skip) { @@ -496,7 +698,7 @@ void run_cas_with_pscw(int rank, enum WINDOW type) if (i >= options.skip) { t_graph_start = MPI_Wtime(); } - MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, MPI_LONG_LONG, 1, disp, win)); + MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, data_type, 1, disp, win)); MPI_CHECK(MPI_Win_complete(win)); MPI_CHECK(MPI_Win_post(group, 0, win)); MPI_CHECK(MPI_Win_wait(win)); @@ -506,6 +708,12 @@ void run_cas_with_pscw(int rank, enum WINDOW type) omb_graph_data->data[i - options.skip] = (t_graph_end - t_graph_start) * 1.0e6 / 2.0; } + if (options.validate) { + atomic_data_validation_check(data_type, (MPI_Op)-1, rank, + win_base, tbuf, options.max_message_size, 1, 1, + &validation_error_flag); + } + } } @@ -518,25 +726,34 @@ void run_cas_with_pscw(int rank, enum WINDOW type) MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); for (i = 0; i < options.skip + options.iterations; i++) { + if (i >= options.skip && options.validate) { + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, cbuf, options.max_message_size); + } + if (i == options.skip) { omb_papi_start(&papi_eventset); } MPI_CHECK(MPI_Win_post(group, 0, win)); MPI_CHECK(MPI_Win_wait(win)); MPI_CHECK(MPI_Win_start(group, 0, win)); - MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, MPI_LONG_LONG, 0, disp, win)); + MPI_CHECK(MPI_Compare_and_swap(sbuf, cbuf, tbuf, data_type, 0, disp, win)); MPI_CHECK(MPI_Win_complete(win)); + + if (i >= options.skip && options.validate) { + atomic_data_validation_check(data_type, (MPI_Op)-1, rank, + win_base, tbuf, options.max_message_size, 1, 1, + &validation_error_flag ); + } } } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); omb_papi_stop_and_print(&papi_eventset, 8); - if (rank == 0) { - fprintf(stdout, "%-*d%*.*f\n", 10, 8, FIELD_WIDTH, - FLOAT_PRECISION, (t_end - t_start) * 1.0e6 / options.iterations / 2); - fflush(stdout); - } + print_latency(rank, dtype_size, 0.5); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations / 2; @@ -548,6 +765,6 @@ void run_cas_with_pscw(int rank, enum WINDOW type) MPI_CHECK(MPI_Group_free(&group)); MPI_CHECK(MPI_Group_free(&comm_group)); - free_atomic_memory (sbuf, win_base, tbuf, cbuf, type, win, rank); + free_atomic_memory (sbuf, win_base, tbuf, cbuf, win_type, win, rank); } /* vi: set sw=4 sts=4 tw=80: */ diff --git a/c/mpi/one-sided/osu_fop_latency.c b/c/mpi/one-sided/osu_fop_latency.c index 389205a..d0c1eee 100644 --- a/c/mpi/one-sided/osu_fop_latency.c +++ b/c/mpi/one-sided/osu_fop_latency.c @@ -10,18 +10,20 @@ */ #include +#include double t_start = 0.0, t_end = 0.0; uint64_t *sbuf=NULL, *tbuf=NULL, *win_base = NULL; omb_graph_options_t omb_graph_op; +int validation_error_flag = 0; -void print_latency (int, int); -void run_fop_with_lock (int, enum WINDOW); -void run_fop_with_fence (int, enum WINDOW); -void run_fop_with_lock_all (int, enum WINDOW); -void run_fop_with_flush (int, enum WINDOW); -void run_fop_with_flush_local (int, enum WINDOW); -void run_fop_with_pscw (int, enum WINDOW); +void print_latency (int, int, float); +void run_fop_with_lock(int rank, enum WINDOW win_type, MPI_Datatype data_type, MPI_Op op); +void run_fop_with_fence (int rank, enum WINDOW win_type, MPI_Datatype data_type, MPI_Op op); +void run_fop_with_lock_all(int rank, enum WINDOW win_type, MPI_Datatype data_type, MPI_Op op); +void run_fop_with_flush(int rank, enum WINDOW win_type, MPI_Datatype data_type, MPI_Op op); +void run_fop_with_flush_local(int rank, enum WINDOW win_type, MPI_Datatype data_type, MPI_Op op); +void run_fop_with_pscw(int rank, enum WINDOW win_type, MPI_Datatype data_type, MPI_Op op); int main (int argc, char *argv[]) { @@ -34,6 +36,29 @@ int main (int argc, char *argv[]) options.bench = ONE_SIDED; options.subtype = LAT; options.synctype = ALL_SYNC; + options.show_validation = 1; + + + char op_name[32] = {'\0'}; + char type_name[32] = {'\0'}; + + MPI_Datatype dtype_list[] = { + MPI_SIGNED_CHAR, MPI_UNSIGNED_CHAR, MPI_SHORT, MPI_UNSIGNED_SHORT, + MPI_INT, MPI_UNSIGNED, MPI_LONG_LONG, MPI_UNSIGNED_LONG_LONG, + MPI_FLOAT, MPI_DOUBLE, MPI_LONG_DOUBLE, + MPI_C_FLOAT_COMPLEX, MPI_C_DOUBLE_COMPLEX, MPI_C_LONG_DOUBLE_COMPLEX, + }; + MPI_Op op_list[] = { + MPI_MAX, MPI_MIN, + MPI_SUM, MPI_PROD, + MPI_LAND, MPI_BAND, + MPI_LOR, MPI_BOR, + MPI_LXOR, MPI_BXOR, + }; + const int nops = sizeof(op_list)/sizeof(op_list[0]); + const int ntypes = sizeof(dtype_list)/sizeof(dtype_list[0]); + int type_name_size = 0; + set_header(HEADER); set_benchmark_name("osu_fop_latency"); @@ -100,29 +125,58 @@ int main (int argc, char *argv[]) return EXIT_FAILURE; } + + for (int jdata_type = 0; jdata_type < ntypes; jdata_type++) { + for (int jop = 0; jop < nops; jop++) { + + MPI_Type_get_name(dtype_list[jdata_type], type_name, &type_name_size); + + if ( !is_mpi_op_allowed(dtype_list[jdata_type], op_list[jop]) ) { + printf("Skipping: OP %s on Datatype %s is not allowed\n", osc_tostr(&op_list[jop], OSC_TYPE_ATOMIC_OP),type_name); + continue; + } + printf("# Operation: %s, Datatype: %s.\n", osc_tostr(&op_list[jop], OSC_TYPE_ATOMIC_OP),type_name); + + print_header_one_sided(rank, options.win, options.sync); switch (options.sync) { case LOCK: - run_fop_with_lock(rank, options.win); + run_fop_with_lock(rank, options.win, dtype_list[jdata_type], op_list[jop]); break; case LOCK_ALL: - run_fop_with_lock_all(rank, options.win); + run_fop_with_lock_all(rank, options.win, dtype_list[jdata_type], op_list[jop]); break; case PSCW: - run_fop_with_pscw(rank, options.win); + run_fop_with_pscw(rank, options.win, dtype_list[jdata_type], op_list[jop]); break; case FENCE: - run_fop_with_fence(rank, options.win); + run_fop_with_fence(rank, options.win, dtype_list[jdata_type], op_list[jop]); break; case FLUSH_LOCAL: - run_fop_with_flush_local(rank, options.win); + run_fop_with_flush_local(rank, options.win, dtype_list[jdata_type], op_list[jop]); break; default: - run_fop_with_flush(rank, options.win); + run_fop_with_flush(rank, options.win, dtype_list[jdata_type], op_list[jop]); break; } + }} + + if (options.validate) { + for (int jrank_print=0; jrank_print<2; jrank_print++) { + if (jrank_print == rank) { + printf("-------------------------------------------\n"); + printf("Atomic Data Validation results for Rank=%d:\n",rank); + atomic_data_validation_print_summary(); + printf("-------------------------------------------\n"); + } + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + } + } + + + MPI_CHECK(MPI_Finalize()); if (NONE != options.accel) { @@ -134,24 +188,40 @@ int main (int argc, char *argv[]) return EXIT_SUCCESS; } -void print_latency(int rank, int size) +void print_latency(int rank, int size, float latency_factor) { - if (rank == 0) { + char *validation_string; + if (rank != 0) return; + if ( options.validate ) { + if (2 & validation_error_flag) validation_string = "skipped"; + else if (1 & validation_error_flag) validation_string = "failed"; + else validation_string = "passed"; + + fprintf(stdout, "%-*d%*.*f%*s\n", 10, size, FIELD_WIDTH, + FLOAT_PRECISION, (t_end - t_start) * 1.0e6 * latency_factor + / options.iterations, + FIELD_WIDTH, validation_string); + fflush(stdout); + validation_error_flag = 0; + return; + } else { fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH, - FLOAT_PRECISION, (t_end - t_start) * 1.0e6 / options.iterations); + FLOAT_PRECISION, (t_end - t_start) * 1.0e6 * latency_factor + / options.iterations); fflush(stdout); + return; } + } /*Run FOP with flush local*/ -void run_fop_with_flush_local (int rank, enum WINDOW type) +void run_fop_with_flush_local(int rank, enum WINDOW win_type, MPI_Datatype data_type, MPI_Op op) { double t_graph_start = 0.0, t_graph_end = 0.0; omb_graph_data_t *omb_graph_data = NULL; int papi_eventset = OMB_PAPI_NULL; - int i; + int i, jrank, dtype_size; MPI_Win win; - MPI_Aint disp = 0; omb_graph_op.number_of_graphs = 0; @@ -159,12 +229,20 @@ void run_fop_with_flush_local (int rank, enum WINDOW type) &omb_graph_op, 8, options.iterations); omb_papi_init(&papi_eventset); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + MPI_CHECK(MPI_Type_size(data_type, &dtype_size)); allocate_atomic_memory(rank, (char **)&sbuf, - (char **)&tbuf, NULL, (char **)&win_base, options.max_message_size, type, &win); + (char **)&tbuf, NULL, (char **)&win_base, options.max_message_size, win_type, &win); + + if (options.validate) { + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + } + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); if (rank == 0) { - if (type == WIN_DYNAMIC) { + if (win_type == WIN_DYNAMIC) { disp = disp_remote; } MPI_CHECK(MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win)); @@ -176,7 +254,7 @@ void run_fop_with_flush_local (int rank, enum WINDOW type) if (i >= options.skip) { t_graph_start = MPI_Wtime(); } - MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, MPI_LONG_LONG, 1, disp, MPI_SUM, win)); + MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, data_type, 1, disp, op, win)); MPI_CHECK(MPI_Win_flush_local(1, win)); if (i >= options.skip) { t_graph_end = MPI_Wtime(); @@ -185,31 +263,44 @@ void run_fop_with_flush_local (int rank, enum WINDOW type) t_graph_start) * 1.0e6; } } + if (i == 0 && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check( + data_type, op, rank, win_base, tbuf, + options.max_message_size, 0, 1, + &validation_error_flag); + } } t_end = MPI_Wtime (); MPI_CHECK(MPI_Win_unlock(1, win)); + } else if (options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check( + data_type, op, rank, win_base, tbuf, options.max_message_size, + 1, 0, &validation_error_flag ); } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - omb_papi_stop_and_print(&papi_eventset, 8); - print_latency(rank, 8); + omb_papi_stop_and_print(&papi_eventset, dtype_size); + print_latency(rank, dtype_size, 1); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations; } omb_graph_plot(&omb_graph_op, benchmark_name); omb_graph_free_data_buffers(&omb_graph_op); omb_papi_free(&papi_eventset); - free_atomic_memory (sbuf, win_base, tbuf, NULL, type, win, rank); + free_atomic_memory (sbuf, win_base, tbuf, NULL, win_type, win, rank); + } /*Run FOP with flush */ -void run_fop_with_flush (int rank, enum WINDOW type) +void run_fop_with_flush(int rank, enum WINDOW win_type, MPI_Datatype data_type, MPI_Op op) { double t_graph_start = 0.0, t_graph_end = 0.0; omb_graph_data_t *omb_graph_data = NULL; int papi_eventset = OMB_PAPI_NULL; - int i; + int i, dtype_size; MPI_Aint disp = 0; MPI_Win win; @@ -218,12 +309,20 @@ void run_fop_with_flush (int rank, enum WINDOW type) &omb_graph_op, 8, options.iterations); omb_papi_init(&papi_eventset); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + MPI_CHECK(MPI_Type_size(data_type, &dtype_size)); allocate_atomic_memory(rank, (char **)&sbuf, - (char **)&tbuf, NULL, (char **)&win_base, options.max_message_size, type, &win); + (char **)&tbuf, NULL, (char **)&win_base, options.max_message_size, win_type, &win); + + if (options.validate) { + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + } + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); if (rank == 0) { - if (type == WIN_DYNAMIC) { + if (win_type == WIN_DYNAMIC) { disp = disp_remote; } MPI_CHECK(MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win)); @@ -235,7 +334,7 @@ void run_fop_with_flush (int rank, enum WINDOW type) if (i >= options.skip) { t_graph_start = MPI_Wtime(); } - MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, MPI_LONG_LONG, 1, disp, MPI_SUM, win)); + MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, data_type, 1, disp, op, win)); MPI_CHECK(MPI_Win_flush(1, win)); if (i >= options.skip) { t_graph_end = MPI_Wtime(); @@ -244,43 +343,64 @@ void run_fop_with_flush (int rank, enum WINDOW type) t_graph_start) * 1.0e6; } } + if (i == 0 && options.validate) { + atomic_data_validation_check( + data_type, op, rank, win_base, tbuf, + options.max_message_size, 0, 1, &validation_error_flag); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + } } t_end = MPI_Wtime (); MPI_CHECK(MPI_Win_unlock(1, win)); + } else if (options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check( + data_type, op, rank, win_base, tbuf, options.max_message_size, + 1, 0, &validation_error_flag); } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - omb_papi_stop_and_print(&papi_eventset, 8); - print_latency(rank, 8); + omb_papi_stop_and_print(&papi_eventset, dtype_size); + print_latency(rank, dtype_size, 1); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations; } omb_graph_plot(&omb_graph_op, benchmark_name); omb_graph_free_data_buffers(&omb_graph_op); omb_papi_free(&papi_eventset); - free_atomic_memory (sbuf, win_base, tbuf, NULL, type, win, rank); + free_atomic_memory (sbuf, win_base, tbuf, NULL, win_type, win, rank); } /*Run FOP with Lock_all/unlock_all */ -void run_fop_with_lock_all (int rank, enum WINDOW type) +void run_fop_with_lock_all(int rank, enum WINDOW win_type, MPI_Datatype data_type, MPI_Op op) { double t_graph_start = 0.0, t_graph_end = 0.0; omb_graph_data_t *omb_graph_data = NULL; int papi_eventset = OMB_PAPI_NULL; - int i; + int i, dtype_size; MPI_Aint disp = 0; MPI_Win win; + MPI_CHECK(MPI_Type_size(data_type, &dtype_size)); + omb_graph_op.number_of_graphs = 0; omb_graph_allocate_and_get_data_buffer(&omb_graph_data, &omb_graph_op, 8, options.iterations); omb_papi_init(&papi_eventset); allocate_atomic_memory(rank, (char **)&sbuf, - (char **)&tbuf, NULL, (char **)&win_base, options.max_message_size, type, &win); + (char **)&tbuf, NULL, (char **)&win_base, options.max_message_size, win_type, &win); + + if (options.validate) { + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + } + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + if (rank == 0) { - if (type == WIN_DYNAMIC) { + if (win_type == WIN_DYNAMIC) { disp = disp_remote; } @@ -293,7 +413,7 @@ void run_fop_with_lock_all (int rank, enum WINDOW type) t_graph_start = MPI_Wtime(); } MPI_CHECK(MPI_Win_lock_all(0, win)); - MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, MPI_LONG_LONG, 1, disp, MPI_SUM, win)); + MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, data_type, 1, disp, op, win)); MPI_CHECK(MPI_Win_unlock_all(win)); if (i >= options.skip) { t_graph_end = MPI_Wtime(); @@ -302,42 +422,63 @@ void run_fop_with_lock_all (int rank, enum WINDOW type) t_graph_start) * 1.0e6; } } + if (i == 0 && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check( + data_type, op, rank, win_base, tbuf, + options.max_message_size, 0, 1, &validation_error_flag); + } } t_end = MPI_Wtime (); + } else if (options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check( + data_type, op, rank, win_base, tbuf,options.max_message_size, + 1, 0, &validation_error_flag); } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - omb_papi_stop_and_print(&papi_eventset, 8); - print_latency(rank, 8); + omb_papi_stop_and_print(&papi_eventset, dtype_size); + print_latency(rank, dtype_size, 1); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations; } omb_graph_plot(&omb_graph_op, benchmark_name); omb_graph_free_data_buffers(&omb_graph_op); omb_papi_free(&papi_eventset); - free_atomic_memory (sbuf, win_base, tbuf, NULL, type, win, rank); + free_atomic_memory (sbuf, win_base, tbuf, NULL, win_type, win, rank); } /*Run FOP with Lock/unlock */ -void run_fop_with_lock(int rank, enum WINDOW type) +void run_fop_with_lock(int rank, enum WINDOW win_type, MPI_Datatype data_type, MPI_Op op) { - int i; + int i, dtype_size; double t_graph_start = 0.0, t_graph_end = 0.0; omb_graph_data_t *omb_graph_data = NULL; int papi_eventset = OMB_PAPI_NULL; MPI_Aint disp = 0; MPI_Win win; + MPI_CHECK(MPI_Type_size(data_type, &dtype_size)); + omb_graph_op.number_of_graphs = 0; omb_graph_allocate_and_get_data_buffer(&omb_graph_data, &omb_graph_op, 8, options.iterations); omb_papi_init(&papi_eventset); allocate_atomic_memory(rank, (char **)&sbuf, - (char **)&tbuf, NULL, (char **)&win_base, options.max_message_size, type, &win); + (char **)&tbuf, NULL, (char **)&win_base, options.max_message_size, win_type, &win); + + if (options.validate) { + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + } + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + if (rank == 0) { - if (type == WIN_DYNAMIC) { + if (win_type == WIN_DYNAMIC) { disp = disp_remote; } @@ -350,7 +491,7 @@ void run_fop_with_lock(int rank, enum WINDOW type) t_graph_start = MPI_Wtime(); } MPI_CHECK(MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 1, 0, win)); - MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, MPI_LONG_LONG, 1, disp, MPI_SUM, win)); + MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, data_type, 1, disp, op, win)); MPI_CHECK(MPI_Win_unlock(1, win)); if (i >= options.skip) { t_graph_end = MPI_Wtime(); @@ -359,37 +500,51 @@ void run_fop_with_lock(int rank, enum WINDOW type) t_graph_start) * 1.0e6; } } + if (i == 0 && options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check( + data_type, op, rank, win_base, tbuf, + options.max_message_size, 0, 1, &validation_error_flag); + } } t_end = MPI_Wtime (); + } else if (options.validate) { + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + atomic_data_validation_check( + data_type, op, rank, win_base, tbuf, options.max_message_size, + 1, 0, &validation_error_flag); } + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - omb_papi_stop_and_print(&papi_eventset, 8); - print_latency(rank, 8); + omb_papi_stop_and_print(&papi_eventset, dtype_size); + print_latency(rank, dtype_size, 1); if (options.graph && 0 == rank) { omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations; } omb_graph_plot(&omb_graph_op, benchmark_name); omb_graph_free_data_buffers(&omb_graph_op); omb_papi_free(&papi_eventset); - free_atomic_memory (sbuf, win_base, tbuf, NULL, type, win, rank); + free_atomic_memory (sbuf, win_base, tbuf, NULL, win_type, win, rank); } /*Run FOP with Fence */ -void run_fop_with_fence(int rank, enum WINDOW type) +void run_fop_with_fence(int rank, enum WINDOW win_type, MPI_Datatype data_type, MPI_Op op) { double t_graph_start = 0.0, t_graph_end = 0.0; omb_graph_data_t *omb_graph_data = NULL; int papi_eventset = OMB_PAPI_NULL; - int i; + int i, dtype_size; MPI_Aint disp = 0; MPI_Win win; + MPI_CHECK(MPI_Type_size(data_type, &dtype_size)); + allocate_atomic_memory(rank, (char **)&sbuf, - (char **)&tbuf, NULL, (char **)&win_base, options.max_message_size, type, &win); + (char **)&tbuf, NULL, (char **)&win_base, options.max_message_size, win_type, &win); - if (type == WIN_DYNAMIC) { + if (win_type == WIN_DYNAMIC) { disp = disp_remote; } omb_graph_op.number_of_graphs = 0; @@ -405,10 +560,15 @@ void run_fop_with_fence(int rank, enum WINDOW type) t_start = MPI_Wtime (); } if (i >= options.skip) { + if (options.validate) { + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + } t_graph_start = MPI_Wtime(); } MPI_CHECK(MPI_Win_fence(0, win)); - MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, MPI_LONG_LONG, 1, disp, MPI_SUM, win)); + MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, data_type, 1, disp, op, win)); MPI_CHECK(MPI_Win_fence(0, win)); MPI_CHECK(MPI_Win_fence(0, win)); if (i >= options.skip) { @@ -417,6 +577,11 @@ void run_fop_with_fence(int rank, enum WINDOW type) omb_graph_data->data[i - options.skip] = (t_graph_end - t_graph_start) * 1.0e6; } + if (options.validate) { + atomic_data_validation_check( + data_type, op, rank, win_base, tbuf, + options.max_message_size, 1, 1, &validation_error_flag); + } } } t_end = MPI_Wtime (); @@ -425,62 +590,79 @@ void run_fop_with_fence(int rank, enum WINDOW type) if (i == options.skip) { omb_papi_start(&papi_eventset); } + if (i >= options.skip && options.validate) { + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + } MPI_CHECK(MPI_Win_fence(0, win)); MPI_CHECK(MPI_Win_fence(0, win)); - MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, MPI_LONG_LONG, 0, disp, MPI_SUM, win)); + MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, data_type, 0, disp, op, win)); MPI_CHECK(MPI_Win_fence(0, win)); + if (i >= options.skip && options.validate) { + atomic_data_validation_check( + data_type, op, rank, win_base, tbuf, + options.max_message_size, 1, 1, &validation_error_flag); + } + } } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - omb_papi_stop_and_print(&papi_eventset, 8); - if (rank == 0) { - fprintf(stdout, "%-*d%*.*f\n", 10, 8, FIELD_WIDTH, - FLOAT_PRECISION, (t_end - t_start) * 1.0e6 / options.iterations / 2); - fflush(stdout); - if (options.graph && 0 == rank) { - omb_graph_data->avg = (t_end - t_start) * 1.0e6 / - options.iterations/ 2; - } - omb_graph_plot(&omb_graph_op, benchmark_name); - omb_graph_free_data_buffers(&omb_graph_op); + omb_papi_stop_and_print(&papi_eventset, dtype_size); + print_latency(rank, dtype_size, 0.5); + if (options.graph && 0 == rank) { + omb_graph_data->avg = (t_end - t_start) * 1.0e6 / + options.iterations/ 2; } + omb_graph_plot(&omb_graph_op, benchmark_name); + omb_graph_free_data_buffers(&omb_graph_op); omb_papi_free(&papi_eventset); - free_atomic_memory (sbuf, win_base, tbuf, NULL, type, win, rank); + free_atomic_memory (sbuf, win_base, tbuf, NULL, win_type, win, rank); } /*Run FOP with Post/Start/Complete/Wait */ -void run_fop_with_pscw(int rank, enum WINDOW type) +void run_fop_with_pscw(int rank, enum WINDOW win_type, MPI_Datatype data_type, MPI_Op op) { double t_graph_start = 0.0, t_graph_end = 0.0; omb_graph_data_t *omb_graph_data = NULL; int papi_eventset = OMB_PAPI_NULL; - int destrank, i; + int destrank, i, dtype_size; MPI_Aint disp = 0; MPI_Win win; MPI_Group comm_group, group; MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &comm_group)); + MPI_CHECK(MPI_Type_size(data_type, &dtype_size)); omb_graph_op.number_of_graphs = 0; omb_graph_allocate_and_get_data_buffer(&omb_graph_data, &omb_graph_op, 8, options.iterations); omb_papi_init(&papi_eventset); + + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); allocate_atomic_memory(rank, (char **)&sbuf, - (char **)&tbuf, NULL, (char **)&win_base, options.max_message_size, type, &win); + (char **)&tbuf, NULL, (char **)&win_base, options.max_message_size, win_type, &win); + - if (type == WIN_DYNAMIC) { + if (win_type == WIN_DYNAMIC) { disp = disp_remote; } if (rank == 0) { destrank = 1; MPI_CHECK(MPI_Group_incl(comm_group, 1, &destrank, &group)); - MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); for (i = 0; i < options.skip + options.iterations; i++) { + if (i >= options.skip) { + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + } + MPI_CHECK(MPI_Win_start (group, 0, win)); if (i == options.skip) { @@ -491,7 +673,7 @@ void run_fop_with_pscw(int rank, enum WINDOW type) if (i >= options.skip) { t_graph_start = MPI_Wtime(); } - MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, MPI_LONG_LONG, 1, disp, MPI_SUM, win)); + MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, data_type, 1, disp, op, win)); MPI_CHECK(MPI_Win_complete(win)); MPI_CHECK(MPI_Win_post(group, 0, win)); MPI_CHECK(MPI_Win_wait(win)); @@ -501,6 +683,11 @@ void run_fop_with_pscw(int rank, enum WINDOW type) omb_graph_data->data[i - options.skip] = (t_graph_end - t_graph_start) * 1.0e6; } + if (options.validate) { + atomic_data_validation_check( + data_type, op, rank, win_base, tbuf, + options.max_message_size, 1, 1, &validation_error_flag); + } } } @@ -510,39 +697,46 @@ void run_fop_with_pscw(int rank, enum WINDOW type) destrank = 0; MPI_CHECK(MPI_Group_incl(comm_group, 1, &destrank, &group)); - MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); for (i = 0; i < options.skip + options.iterations; i++) { + if (i >= options.skip) { + atomic_data_validation_setup(data_type, rank, sbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, tbuf, options.max_message_size); + atomic_data_validation_setup(data_type, rank, win_base, options.max_message_size); + MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); + } + + if (i == options.skip) { omb_papi_start(&papi_eventset); } MPI_CHECK(MPI_Win_post(group, 0, win)); MPI_CHECK(MPI_Win_wait(win)); MPI_CHECK(MPI_Win_start(group, 0, win)); - MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, MPI_LONG_LONG, 0, disp, MPI_SUM, win)); + MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, data_type, 0, disp, op, win)); MPI_CHECK(MPI_Win_complete(win)); + if (i >= options.skip && options.validate) { + atomic_data_validation_check( + data_type, op, rank, win_base, tbuf, + options.max_message_size, 1, 1, &validation_error_flag); + } } } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); - omb_papi_stop_and_print(&papi_eventset, 8); - if (rank == 0) { - fprintf(stdout, "%-*d%*.*f\n", 10, 8, FIELD_WIDTH, - FLOAT_PRECISION, (t_end - t_start) * 1.0e6 / options.iterations / 2); - fflush(stdout); - if (options.graph && 0 == rank) { - omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations - / 2; - } - omb_graph_plot(&omb_graph_op, benchmark_name); - omb_graph_free_data_buffers(&omb_graph_op); + omb_papi_stop_and_print(&papi_eventset, dtype_size); + print_latency(rank, dtype_size, 0.5); + if (options.graph && 0 == rank) { + omb_graph_data->avg = (t_end - t_start) * 1.0e6 / options.iterations + / 2; } omb_papi_free(&papi_eventset); MPI_CHECK(MPI_Group_free(&group)); MPI_CHECK(MPI_Group_free(&comm_group)); - free_atomic_memory (sbuf, win_base, tbuf, NULL, type, win, rank); + free_atomic_memory (sbuf, win_base, tbuf, NULL, win_type, win, rank); + } /* vi: set sw=4 sts=4 tw=80: */ diff --git a/c/mpi/one-sided/osu_osc_verify.c b/c/mpi/one-sided/osu_osc_verify.c new file mode 100644 index 0000000..5578382 --- /dev/null +++ b/c/mpi/one-sided/osu_osc_verify.c @@ -0,0 +1,875 @@ +#include +#include +#include +#include +#include +#include + +#include "osu_util.h" + +// not consistent with errno.h, but errno.h conflicts with CUDA_CHECK macro. +#define ENODATA 2 + +static char rank_buffer_type = '\0'; + +static char inline get_rank_buffer_type() { + if (rank_buffer_type == '\0') { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + rank_buffer_type = (rank == 0) ? options.src : options.dst; + } + return rank_buffer_type; +} + +static int set_hmem_buffer (void *dst, void *src, size_t size) +{ + char buf_type = get_rank_buffer_type(); + + if (buf_type == 'H' || options.accel == NONE) { + memcpy(dst, src, size); + return 0; + } + switch (options.accel) { +#ifdef _ENABLE_CUDA_ + case CUDA: + CUDA_CHECK(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaDeviceSynchronize()); + break; +#endif + default: + printf("Memory copy not implemented for the selected acceleration platform\n"); + return -1; + } + return 0; +} + +static int get_hmem_buffer (void *dst, void *src, size_t size) +{ + char buf_type = get_rank_buffer_type(); + if (rank_buffer_type == 'H' || options.accel == NONE) { + memcpy(dst, src, size); + return 0; + } + switch (options.accel) { +#ifdef _ENABLE_CUDA_ + case CUDA: + CUDA_CHECK(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaDeviceSynchronize()); + break; +#endif + default: + printf("Memory copy not implemented for the selected acceleration platform\n"); + return -1; + } + return 0; +} +struct atomic_dv_summary { + MPI_Datatype datatype; + MPI_Op op; + size_t trials; + size_t validation_failures; + size_t validations_performed; + size_t first_failure; + size_t last_failure; + struct atomic_dv_summary *next; +}; + +#define bool _Bool +#define MPI_OP_COUNT 16 + +struct atomic_dv_summary* dv_summary_root = NULL; + +typedef enum type_of_enum { + OSC_TYPE_ATOMIC_TYPE, + OSC_TYPE_ATOMIC_OP +} type_of_enum; + +char osc_str_output[256]; +char* osc_tostr(void *val, type_of_enum type_type) { + memset(osc_str_output, 0, sizeof(osc_str_output)); + if (type_type == OSC_TYPE_ATOMIC_OP) { + MPI_Op op = *(MPI_Op*)val; + if (-1 == *(int*)val) sprintf(osc_str_output,"%s", "Compare-And-Swap"); + if (op == MPI_OP_NULL) sprintf(osc_str_output,"%s", "MPI_OP_NULL"); + if (op == MPI_MAX) sprintf(osc_str_output,"%s", "MPI_MAX"); + if (op == MPI_MIN) sprintf(osc_str_output,"%s", "MPI_MIN"); + if (op == MPI_SUM) sprintf(osc_str_output,"%s", "MPI_SUM"); + if (op == MPI_PROD) sprintf(osc_str_output,"%s", "MPI_PROD"); + if (op == MPI_LAND) sprintf(osc_str_output,"%s", "MPI_LAND"); + if (op == MPI_BAND) sprintf(osc_str_output,"%s", "MPI_BAND"); + if (op == MPI_LOR) sprintf(osc_str_output,"%s", "MPI_LOR"); + if (op == MPI_BOR) sprintf(osc_str_output,"%s", "MPI_BOR"); + if (op == MPI_LXOR) sprintf(osc_str_output,"%s", "MPI_LXOR"); + if (op == MPI_BXOR) sprintf(osc_str_output,"%s", "MPI_BXOR"); + if (op == MPI_MINLOC) sprintf(osc_str_output,"%s", "MPI_MINLOC"); + if (op == MPI_MAXLOC) sprintf(osc_str_output,"%s", "MPI_MAXLOC"); + if (op == MPI_REPLACE) sprintf(osc_str_output,"%s", "MPI_REPLACE"); + } + if (type_type == OSC_TYPE_ATOMIC_TYPE) { + MPI_Datatype type = *(MPI_Datatype*)val; + if (type == MPI_DATATYPE_NULL) sprintf(osc_str_output,"%s", "MPI_DATATYPE_NULL"); + if (type == MPI_SIGNED_CHAR) sprintf(osc_str_output,"%s", "MPI_SIGNED_CHAR"); + if (type == MPI_UNSIGNED_CHAR) sprintf(osc_str_output,"%s", "MPI_UNSIGNED_CHAR"); + if (type == MPI_SHORT) sprintf(osc_str_output,"%s", "MPI_SHORT"); + if (type == MPI_UNSIGNED_SHORT) sprintf(osc_str_output,"%s", "MPI_UNSIGNED_SHORT"); + if (type == MPI_INT) sprintf(osc_str_output,"%s", "MPI_INT"); + if (type == MPI_UNSIGNED) sprintf(osc_str_output,"%s", "MPI_UNSIGNED"); + if (type == MPI_LONG) sprintf(osc_str_output,"%s", "MPI_LONG"); + if (type == MPI_UNSIGNED_LONG) sprintf(osc_str_output,"%s", "MPI_UNSIGNED_LONG"); + if (type == MPI_LONG_LONG) sprintf(osc_str_output,"%s", "MPI_LONG_LONG"); + if (type == MPI_UNSIGNED_LONG_LONG) sprintf(osc_str_output,"%s", "MPI_UNSIGNED_LONG_LONG"); + if (type == MPI_FLOAT) sprintf(osc_str_output,"%s", "MPI_FLOAT"); + if (type == MPI_DOUBLE) sprintf(osc_str_output,"%s", "MPI_DOUBLE"); + if (type == MPI_LONG_DOUBLE) sprintf(osc_str_output,"%s", "MPI_LONG_DOUBLE"); + if (type == MPI_C_FLOAT_COMPLEX) sprintf(osc_str_output,"%s", "MPI_C_FLOAT_COMPLEX"); + if (type == MPI_C_DOUBLE_COMPLEX) sprintf(osc_str_output,"%s", "MPI_C_DOUBLE_COMPLEX"); + if (type == MPI_C_LONG_DOUBLE_COMPLEX) sprintf(osc_str_output,"%s", "MPI_C_LONG_DOUBLE_COMPLEX"); + } + return osc_str_output; +} + +int mpi_op_enumerate(MPI_Op op) { + if (op == MPI_OP_NULL) return 0; + if (op == MPI_MAX) return 1; + if (op == MPI_MIN) return 2; + if (op == MPI_SUM) return 3; + if (op == MPI_PROD) return 4; + if (op == MPI_LAND) return 5; + if (op == MPI_BAND) return 6; + if (op == MPI_LOR) return 7; + if (op == MPI_BOR) return 8; + if (op == MPI_LXOR) return 9; + if (op == MPI_BXOR) return 10; + if (op == MPI_MINLOC) return 11; + if (op == MPI_MAXLOC) return 12; + if (op == MPI_REPLACE) return 13; + return -1; +} +#define ENUM_OF_DMPI_OP_NULL 0 +#define ENUM_OF_DMPI_MAX 1 +#define ENUM_OF_DMPI_MIN 2 +#define ENUM_OF_DMPI_SUM 3 +#define ENUM_OF_DMPI_PROD 4 +#define ENUM_OF_DMPI_LAND 5 +#define ENUM_OF_DMPI_BAND 6 +#define ENUM_OF_DMPI_LOR 7 +#define ENUM_OF_DMPI_BOR 8 +#define ENUM_OF_DMPI_LXOR 9 +#define ENUM_OF_DMPI_BXOR 10 +#define ENUM_OF_DMPI_MINLOC 11 +#define ENUM_OF_DMPI_MAXLOC 12 +#define ENUM_OF_DMPI_REPLACE 13 + +int mpi_dtype_enumerate(MPI_Datatype dtype) { + if (dtype == MPI_DATATYPE_NULL) return 0; + if (dtype == MPI_SIGNED_CHAR) return 1; + if (dtype == MPI_UNSIGNED_CHAR) return 2; + if (dtype == MPI_SHORT) return 3; + if (dtype == MPI_UNSIGNED_SHORT) return 4; + if (dtype == MPI_INT) return 5; + if (dtype == MPI_UNSIGNED) return 6; + if (dtype == MPI_LONG) return 7; + if (dtype == MPI_UNSIGNED_LONG) return 8; + if (dtype == MPI_LONG_LONG) return 9; + if (dtype == MPI_UNSIGNED_LONG_LONG) return 10; + if (dtype == MPI_FLOAT) return 11; + if (dtype == MPI_DOUBLE) return 12; + if (dtype == MPI_LONG_DOUBLE) return 13; + if (dtype == MPI_C_FLOAT_COMPLEX) return 14; + if (dtype == MPI_C_DOUBLE_COMPLEX) return 15; + if (dtype == MPI_C_LONG_DOUBLE_COMPLEX) return 16; + return -1; +} +// these must match the above routine! +#define ENUM_OF_DMPI_DATATYPE_NULL 0 +#define ENUM_OF_DMPI_SIGNED_CHAR 1 +#define ENUM_OF_DMPI_UNSIGNED_CHAR 2 +#define ENUM_OF_DMPI_SHORT 3 +#define ENUM_OF_DMPI_UNSIGNED_SHORT 4 +#define ENUM_OF_DMPI_INT 5 +#define ENUM_OF_DMPI_UNSIGNED 6 +#define ENUM_OF_DMPI_LONG 7 +#define ENUM_OF_DMPI_UNSIGNED_LONG 8 +#define ENUM_OF_DMPI_LONG_LONG 9 +#define ENUM_OF_DMPI_UNSIGNED_LONG_LONG 10 +#define ENUM_OF_DMPI_FLOAT 11 +#define ENUM_OF_DMPI_DOUBLE 12 +#define ENUM_OF_DMPI_LONG_DOUBLE 13 +#define ENUM_OF_DMPI_C_FLOAT_COMPLEX 14 +#define ENUM_OF_DMPI_C_DOUBLE_COMPLEX 15 +#define ENUM_OF_DMPI_C_LONG_DOUBLE_COMPLEX 16 + +/* + @brief Prints a summary of test failures. + @return 0 if all validations passed, <0 if any failures recorded. +*/ +int atomic_data_validation_print_summary() { + + int retval = 0; + char type_str[32] = {0}; + char op_str[32] = {0}; + char test_name[64] = {}; + int validation_combos = 0; + int failure_count = 0; + + struct atomic_dv_summary *node = dv_summary_root; + struct atomic_dv_summary *next = NULL; + + if (!node) { + printf("SKIPPED: No validations were performed!\n"); + return 0; + } + + while(node) { + snprintf(type_str, sizeof(type_str)-1, "%s", osc_tostr(&node->datatype, OSC_TYPE_ATOMIC_TYPE)); + snprintf(op_str, sizeof(op_str)-1, "%s", osc_tostr(&node->op, OSC_TYPE_ATOMIC_OP)); + snprintf(test_name, sizeof(test_name), "%s on %s", op_str, type_str); + validation_combos += 1; + + if (node->validation_failures==0 && node->validations_performed==node->trials) { + // all these tests passed + //printf("PASSED: %s passed %zu trials.\n",test_name, node->trials); + } + else if (node->validation_failures) { + printf("FAILED: %s had %zu of %zu tests fail data validation.\n", + test_name, node->validation_failures, node->trials); + printf("\t\tFirst failure at trial %zu, last failure at trial %zu.\n", + node->first_failure, node->last_failure); + retval = -1; + failure_count++; + } + else if (node->validations_performed < node->trials) { + printf("SKIPPED: Data validation not available for %s\n", test_name); + retval = -1; + failure_count++; + } + + // clean up as we go + next = node->next; + free(node); + node = next; + } + + if (retval == 0) { + printf("PASSED: All %d combinations of ops and datatypes tested passed.\n",validation_combos); + } else { + printf("FAILED: %d of the %d combinations of ops and datatypes tested failed.\n",failure_count,validation_combos); + } + dv_summary_root = NULL; + return retval; +} + +static void atomic_dv_record(MPI_Datatype dtype, MPI_Op op, bool failed, bool checked) { + + struct atomic_dv_summary *node = dv_summary_root; + + if (!node || node->op != op || node->datatype != dtype) { + // allocate and add a new node + node = calloc(1, sizeof(struct atomic_dv_summary)); + node->next = dv_summary_root; + dv_summary_root = node; + node->op = op; + node->datatype = dtype; + } + + // record trial. + node->trials++; + if (failed) { + if (node->validation_failures==0) node->first_failure = node->trials; + node->last_failure = node->trials; + node->validation_failures++; + } + if (checked) node->validations_performed++; +} + + +// debugging macro help: gcc -Iinclude -I/fsx/lrbison/libfabric/install/include -E functional/atomic_verify.c | sed 's/case/\ncase/g' | less + +#define ATOM_FOR_DMPI_MIN(a,ao,b) (ao) = (((a) < (b)) ? a : b) +#define ATOM_FOR_DMPI_MAX(a,ao,b) (ao) = (((a) > (b)) ? a : b) +#define ATOM_FOR_DMPI_SUM(a,ao,b) (ao) = ((a) + (b)) +#define ATOM_FOR_DMPI_PROD(a,ao,b) (ao) = ((a) * (b)) +#define ATOM_FOR_DMPI_LOR(a,ao,b) (ao) = ((a) || (b)) +#define ATOM_FOR_DMPI_LAND(a,ao,b) (ao) = ((a) && (b)) +#define ATOM_FOR_DMPI_BOR(a,ao,b) (ao) = ((a) | (b)) +#define ATOM_FOR_DMPI_BAND(a,ao,b) (ao) = ((a) & (b)) +#define ATOM_FOR_DMPI_LXOR(a,ao,b) (ao) = (((a) && !(b)) || (!(a) && (b))) +#define ATOM_FOR_DMPI_BXOR(a,ao,b) (ao) = ((a) ^ (b)) +#define ATOM_FOR_DMPI_ATOMIC_READ(a,ao,b) (ao) = (a) +#define ATOM_FOR_DMPI_ATOMIC_WRITE(a,ao,b) (ao) = (b) + +#define ATOM_FOR_DMPI_CSWAP(a,ao,b,c) if ((c) == (a)) {(ao) = (b);} +#define ATOM_FOR_DMPI_CSWAP_NE(a,ao,b,c) if ((c) != (a)) {(ao) = (b);} +#define ATOM_FOR_DMPI_CSWAP_LE(a,ao,b,c) if ((c) <= (a)) {(ao) = (b);} +#define ATOM_FOR_DMPI_CSWAP_LT(a,ao,b,c) if ((c) < (a)) {(ao) = (b);} +#define ATOM_FOR_DMPI_CSWAP_GE(a,ao,b,c) if ((c) >= (a)) {(ao) = (b);} +#define ATOM_FOR_DMPI_CSWAP_GT(a,ao,b,c) if ((c) > (a)) {(ao) = (b);} +#define ATOM_FOR_DMPI_MSWAP(a,ao,b,c) (ao) = ((b) & (c)) | ((a) & ~(c)); + +#define ATOM_FOR_CPLX_DMPI_MIN(a,ao,b,absfun) (ao) = ((absfun(a) < absfun(b)) ? (a) : (b)) +#define ATOM_FOR_CPLX_DMPI_MAX(a,ao,b,absfun) (ao) = (absfun(a) > absfun(b) ? (a) : (b)) +#define ATOM_FOR_CPLX_DMPI_CSWAP_LE(a,ao,b,c,absfun) if (absfun(c) <= absfun(a)) {(ao) = (b);} +#define ATOM_FOR_CPLX_DMPI_CSWAP_LT(a,ao,b,c,absfun) if (absfun(c) < absfun(a)) {(ao) = (b);} +#define ATOM_FOR_CPLX_DMPI_CSWAP_GE(a,ao,b,c,absfun) if (absfun(c) >= absfun(a)) {(ao) = (b);} +#define ATOM_FOR_CPLX_DMPI_CSWAP_GT(a,ao,b,c,absfun) if (absfun(c) > absfun(a)) {(ao) = (b);} + +#define ATOM_CTYPE_FOR_DMPI_SIGNED_CHAR char +#define ATOM_CTYPE_FOR_DMPI_UNSIGNED_CHAR unsigned char +#define ATOM_CTYPE_FOR_DMPI_SHORT short +#define ATOM_CTYPE_FOR_DMPI_UNSIGNED_SHORT unsigned short +#define ATOM_CTYPE_FOR_DMPI_INT int +#define ATOM_CTYPE_FOR_DMPI_UNSIGNED unsigned +#define ATOM_CTYPE_FOR_DMPI_LONG long +#define ATOM_CTYPE_FOR_DMPI_UNSIGNED_LONG unsigned long +#define ATOM_CTYPE_FOR_DMPI_LONG_LONG long long +#define ATOM_CTYPE_FOR_DMPI_UNSIGNED_LONG_LONG unsigned long long + +#define ATOM_CTYPE_FOR_DMPI_FLOAT float +#define ATOM_CTYPE_FOR_DMPI_DOUBLE double +#define ATOM_CTYPE_FOR_DMPI_LONG_DOUBLE long double + +#define ATOM_CTYPE_FOR_DMPI_C_FLOAT_COMPLEX float complex +#define ATOM_CTYPE_FOR_DMPI_C_DOUBLE_COMPLEX double complex +#define ATOM_CTYPE_FOR_DMPI_C_LONG_DOUBLE_COMPLEX long double complex + +// this macro is for expansion inside the perform_atomic_op function +// and uses variables local to that function. +#define atomic_case_cplx(ftype, fop, absfun) \ +case ftype*MPI_OP_COUNT + fop: \ + { if(result) *(ATOM_CTYPE_FOR_##ftype*)result = *(ATOM_CTYPE_FOR_##ftype*)addr_in; \ + ATOM_FOR_CPLX_##fop( *(ATOM_CTYPE_FOR_##ftype*)addr_in, \ + *(ATOM_CTYPE_FOR_##ftype*)addr_out, \ + *(ATOM_CTYPE_FOR_##ftype*)buf, \ + absfun ); \ + break; \ + } + +#define atomic_case(ftype, fop) \ +case ENUM_OF_##ftype*MPI_OP_COUNT + ENUM_OF_##fop: \ + { if(result) *(ATOM_CTYPE_FOR_##ftype*)result = *(ATOM_CTYPE_FOR_##ftype*)addr_in; \ + ATOM_FOR_##fop( *(ATOM_CTYPE_FOR_##ftype*)addr_in, \ + *(ATOM_CTYPE_FOR_##ftype*)addr_out, \ + *(ATOM_CTYPE_FOR_##ftype*)buf ); \ + break; \ + } + + +// this macro is for expansion inside the perform_atomic_cas function +// and uses variables local to that function. +#define atomic_case_cas(ftype) \ +case ENUM_OF_##ftype: \ + { if(result) {*(ATOM_CTYPE_FOR_##ftype*)result = *(ATOM_CTYPE_FOR_##ftype*)addr_in; } \ + ATOM_FOR_DMPI_CSWAP( *(ATOM_CTYPE_FOR_##ftype*)addr_in, \ + *(ATOM_CTYPE_FOR_##ftype*)addr_out, \ + *(ATOM_CTYPE_FOR_##ftype*)buf, \ + *(ATOM_CTYPE_FOR_##ftype*)compare ); \ + break; \ + } + +#define atomic_int_ops(dtype) \ + atomic_case(dtype, DMPI_MIN) \ + atomic_case(dtype, DMPI_MAX) \ + atomic_case(dtype, DMPI_SUM) \ + atomic_case(dtype, DMPI_PROD) \ + atomic_case(dtype, DMPI_LOR) \ + atomic_case(dtype, DMPI_LAND) \ + atomic_case(dtype, DMPI_BOR) \ + atomic_case(dtype, DMPI_BAND) \ + atomic_case(dtype, DMPI_LXOR) \ + atomic_case(dtype, DMPI_BXOR) + + +#define atomic_real_float_ops(dtype) \ + atomic_case(dtype, DMPI_MIN) \ + atomic_case(dtype, DMPI_MAX) \ + atomic_case(dtype, DMPI_SUM) \ + atomic_case(dtype, DMPI_PROD) \ + atomic_case(dtype, DMPI_LOR) \ + atomic_case(dtype, DMPI_LAND) \ + atomic_case(dtype, DMPI_LXOR) + +#define atomic_complex_float_ops(dtype, absfun) \ + atomic_case(dtype, DMPI_SUM) \ + atomic_case(dtype, DMPI_PROD) + + +int perform_atomic_op( MPI_Datatype dtype, + MPI_Op op, + void *addr_in, + void *buf, + void *addr_out, + void *compare, + void *result) +{ + int op_enumeration = mpi_op_enumerate(op); + int dtype_enumeration = mpi_dtype_enumerate(dtype); + switch(dtype_enumeration*MPI_OP_COUNT + op_enumeration) { + atomic_int_ops(DMPI_SIGNED_CHAR) + atomic_int_ops(DMPI_UNSIGNED_CHAR) + atomic_int_ops(DMPI_SHORT) + atomic_int_ops(DMPI_UNSIGNED_SHORT) + atomic_int_ops(DMPI_INT) + atomic_int_ops(DMPI_UNSIGNED) + atomic_int_ops(DMPI_LONG_LONG) + atomic_int_ops(DMPI_UNSIGNED_LONG) + atomic_int_ops(DMPI_UNSIGNED_LONG_LONG) + + atomic_real_float_ops(DMPI_FLOAT) + atomic_real_float_ops(DMPI_DOUBLE) + atomic_real_float_ops(DMPI_LONG_DOUBLE) + + atomic_complex_float_ops(DMPI_C_FLOAT_COMPLEX, cabsf) + atomic_complex_float_ops(DMPI_C_DOUBLE_COMPLEX, cabs) + atomic_complex_float_ops(DMPI_C_LONG_DOUBLE_COMPLEX, cabsl) + + default: + return -1; + + } + return 0; +} + +int perform_atomic_cas( MPI_Datatype dtype, + void *addr_in, + void *buf, + void *addr_out, + void *compare, + void *result) +{ + int dtype_enumeration = mpi_dtype_enumerate(dtype); + switch(dtype_enumeration) { + atomic_case_cas(DMPI_SIGNED_CHAR) + atomic_case_cas(DMPI_UNSIGNED_CHAR) + atomic_case_cas(DMPI_SHORT) + atomic_case_cas(DMPI_UNSIGNED_SHORT) + atomic_case_cas(DMPI_INT) + atomic_case_cas(DMPI_UNSIGNED) + atomic_case_cas(DMPI_LONG_LONG) + atomic_case_cas(DMPI_UNSIGNED_LONG) + atomic_case_cas(DMPI_UNSIGNED_LONG_LONG) + atomic_case_cas(DMPI_FLOAT) + atomic_case_cas(DMPI_DOUBLE) + atomic_case_cas(DMPI_LONG_DOUBLE) + + default: + return -1; + + } + return 0; +} + +static int validation_input_value(MPI_Datatype dtype, int jrank, void *val) { + + if (dtype == MPI_DATATYPE_NULL) {} + else if (dtype == MPI_SIGNED_CHAR) + *(char*)val = (1+jrank)*10; + else if (dtype == MPI_UNSIGNED_CHAR) + *(unsigned char*)val = (1+jrank)*10; + else if (dtype == MPI_SHORT) + *(short*)val = (1+jrank)*10; + else if (dtype == MPI_UNSIGNED_SHORT) + *(unsigned short*)val = (1+jrank)*10; + else if (dtype == MPI_INT) + *(int*)val = (1+jrank)*10; + else if (dtype == MPI_UNSIGNED) + *(unsigned*)val = (1+jrank)*10; + else if (dtype == MPI_LONG) + *(long*)val = (1+jrank)*10; + else if (dtype == MPI_UNSIGNED_LONG) + *(unsigned long*)val = (1+jrank)*10; + else if (dtype == MPI_LONG_LONG) + *(long long*)val = (1+jrank)*10; + else if (dtype == MPI_UNSIGNED_LONG_LONG) + *(unsigned long long*)val = (1+jrank)*10; + else if (dtype == MPI_FLOAT) + *(float*)val = (1+jrank)*1.11f; + else if (dtype == MPI_DOUBLE) + *(double*)val = (1+jrank)*1.11; + else if (dtype == MPI_LONG_DOUBLE) + *(long double*)val = (1+jrank)*1.11L; + else if (dtype == MPI_C_FLOAT_COMPLEX) + *(float complex*)val = CMPLXF( (1+jrank)*1.11f, (1+jrank*-0.5f) ); + else if (dtype == MPI_C_DOUBLE_COMPLEX) { + *(double complex*)val = CMPLX( (1+jrank)*1.11, (1+jrank*-0.5) ); + } + else if (dtype == MPI_C_LONG_DOUBLE_COMPLEX) + *(long double complex*)val = CMPLXL( (1+jrank)*1.11L, (1+jrank*-0.5L) ); + else { + fprintf(stderr, "No initial value defined, cannot perform data validation " + "on atomic operations using %s\n", + osc_tostr(&dtype, OSC_TYPE_ATOMIC_TYPE) ); + return -1; + } + return 0; +} + +#define COMPARE_AS_TYPE(c_type, a, b) *(c_type*)(a) == *(c_type*)(b) +static int atom_binary_compare(MPI_Datatype dtype, void *a, void *b) +{ + int dtype_size = 0; + char *achar, *bchar; + int err; + + // treat floating point types specially. Avoid NaNs, since NaN != NaN. + if (dtype == MPI_C_LONG_DOUBLE_COMPLEX) { + return COMPARE_AS_TYPE(ATOM_CTYPE_FOR_DMPI_C_LONG_DOUBLE_COMPLEX, a, b); + } + if (dtype == MPI_C_DOUBLE_COMPLEX) { + return COMPARE_AS_TYPE(ATOM_CTYPE_FOR_DMPI_C_DOUBLE_COMPLEX, a, b); + } + if (dtype == MPI_C_FLOAT_COMPLEX) { + return COMPARE_AS_TYPE(ATOM_CTYPE_FOR_DMPI_C_FLOAT_COMPLEX, a, b); + } + if (dtype == MPI_LONG_DOUBLE) { + return COMPARE_AS_TYPE(ATOM_CTYPE_FOR_DMPI_LONG_DOUBLE, a, b); + } + if (dtype == MPI_DOUBLE) { + return COMPARE_AS_TYPE(ATOM_CTYPE_FOR_DMPI_DOUBLE, a, b); + } + if (dtype == MPI_FLOAT) { + return COMPARE_AS_TYPE(ATOM_CTYPE_FOR_DMPI_FLOAT, a, b); + } + + // treat remaining integers based soley on their size + err = MPI_Type_size(dtype, &dtype_size); + if (err) return 0; + + switch (dtype_size) + { + case 1: return COMPARE_AS_TYPE(__int8_t, a, b); + case 2: return COMPARE_AS_TYPE(__int16_t, a, b); + case 4: return COMPARE_AS_TYPE(__int32_t, a, b); + case 8: return COMPARE_AS_TYPE(__int64_t, a, b); + case 16: return COMPARE_AS_TYPE(__int128_t, a, b); + } + return 0; +} + +int atomic_data_validation_setup(MPI_Datatype datatype, int jrank, void *buf, size_t buf_size) { + char set_value[64]; // fits maximum atom size of 256 bits. + char *set_buf; + int jatom; + int dtype_size; + size_t natoms; + int err; + + set_buf = calloc(buf_size, 1); + err = MPI_Type_size(datatype, &dtype_size); + if (err) goto exit_path; + + natoms = buf_size/dtype_size; + + // get the value we wish to set the memory to. + err = validation_input_value(datatype, jrank, set_value); + if (err == -ENODATA) { + err = 0; + goto exit_path; + } + if (err) goto exit_path; + + + + // fill a system buffer with the value + for (jatom=0; jatom < natoms; jatom++) { + memcpy( set_buf + jatom*dtype_size, set_value, dtype_size ); + } + + // copy system buffer to hmem. + err = set_hmem_buffer(buf, set_buf, buf_size ); +exit_path: + free(set_buf); + return err; +} + +#define PRINT_ADR_COMPARISON(dtype,fmt,ai,bi,ci,ao,ae) \ + fprintf(stderr, \ + "Initial Values: [local]addr=" fmt ", [remote]buf=" fmt ", [remote]compare=" fmt "\n" \ + "Observed Final Value: addr=" fmt "\n" \ + "Expected Final Value: addr=" fmt "\n", \ + *(ATOM_CTYPE_FOR_##dtype*)(ai), \ + *(ATOM_CTYPE_FOR_##dtype*)(bi), \ + *(ATOM_CTYPE_FOR_##dtype*)(ci), \ + *(ATOM_CTYPE_FOR_##dtype*)(ao), \ + *(ATOM_CTYPE_FOR_##dtype*)(ae) ); +#define PRINT_ADR_COMPARISON_CPLX(fmtc,realfun,imagfun,ai,bi,ci,ao,ae) \ + fprintf(stderr, \ + "Initial Values: [local]addr=%"fmtc"%+"fmtc"i, [remote]buf=%"fmtc"%+"fmtc"i, [remote]compare=%"fmtc"%+"fmtc"i\n" \ + "Observed Final Value: addr=%"fmtc"%+"fmtc"i\n" \ + "Expected Final Value: addr=%"fmtc"%+"fmtc"i\n", \ + realfun(ai), imagfun(ai), \ + realfun(bi), imagfun(bi), \ + realfun(ci), imagfun(ci), \ + realfun(ao), imagfun(ao), \ + realfun(ae), imagfun(ae) ); + +#define PRINT_RES_COMPARISON(dtype,fmt,ai,bi,ci,ro,re) \ + fprintf(stderr, \ + "Initial Values: [remote]addr=" fmt ", [local]buf=" fmt ", [local]compare=" fmt "\n" \ + "Observed Final Value: result=" fmt "\n" \ + "Expected Final Value: result=" fmt "\n", \ + *(ATOM_CTYPE_FOR_##dtype*)(ai), \ + *(ATOM_CTYPE_FOR_##dtype*)(bi), \ + *(ATOM_CTYPE_FOR_##dtype*)(ci), \ + *(ATOM_CTYPE_FOR_##dtype*)(ro), \ + *(ATOM_CTYPE_FOR_##dtype*)(re) ) +#define PRINT_RES_COMPARISON_CPLX(fmtc,realfun,imagfun,ai,bi,ci,ro,re) \ + fprintf(stderr, \ + "Initial Values: [remote]addr=%"fmtc"%+"fmtc"i, [local]buf=%"fmtc"%+"fmtc"i, [local]compare=%"fmtc"%+"fmtc"i\n" \ + "Observed Final Value: addr=%"fmtc"%+"fmtc"i\n" \ + "Expected Final Value: addr=%"fmtc"%+"fmtc"i\n", \ + realfun(ai), imagfun(ai), \ + realfun(bi), imagfun(bi), \ + realfun(ci), imagfun(ci), \ + realfun(ro), imagfun(ro), \ + realfun(re), imagfun(re) ); + + +static void print_failure_message(MPI_Datatype datatype, + void *adr_in, void *buf_in, void *compare_in, + void *adr_obs, void *res_obs, + void *adr_expect, void *res_expect) +{ + double complex dc; + + if (datatype == MPI_SIGNED_CHAR) { + if (adr_obs) PRINT_ADR_COMPARISON(DMPI_SIGNED_CHAR,"%d",adr_in,buf_in,compare_in,adr_obs,adr_expect); + if (res_obs) PRINT_RES_COMPARISON(DMPI_SIGNED_CHAR,"%d",adr_in,buf_in,compare_in,res_obs,res_expect); + } + if (datatype == MPI_UNSIGNED_CHAR) { + if (adr_obs) PRINT_ADR_COMPARISON(DMPI_UNSIGNED_CHAR,"%u",adr_in,buf_in,compare_in,adr_obs,adr_expect); + if (res_obs) PRINT_RES_COMPARISON(DMPI_UNSIGNED_CHAR,"%u",adr_in,buf_in,compare_in,res_obs,res_expect); + } + if (datatype == MPI_SHORT) { + if (adr_obs) PRINT_ADR_COMPARISON(DMPI_SHORT,"%d",adr_in,buf_in,compare_in,adr_obs,adr_expect); + if (res_obs) PRINT_RES_COMPARISON(DMPI_SHORT,"%d",adr_in,buf_in,compare_in,res_obs,res_expect); + } + if (datatype == MPI_UNSIGNED_SHORT) { + if (adr_obs) PRINT_ADR_COMPARISON(DMPI_UNSIGNED_SHORT,"%u",adr_in,buf_in,compare_in,adr_obs,adr_expect); + if (res_obs) PRINT_RES_COMPARISON(DMPI_UNSIGNED_SHORT,"%u",adr_in,buf_in,compare_in,res_obs,res_expect); + } + if (datatype == MPI_INT) { + if (adr_obs) PRINT_ADR_COMPARISON(DMPI_INT,"%d",adr_in,buf_in,compare_in,adr_obs,adr_expect); + if (res_obs) PRINT_RES_COMPARISON(DMPI_INT,"%d",adr_in,buf_in,compare_in,res_obs,res_expect); + } + if (datatype == MPI_UNSIGNED) { + if (adr_obs) PRINT_ADR_COMPARISON(DMPI_UNSIGNED,"%u",adr_in,buf_in,compare_in,adr_obs,adr_expect); + if (res_obs) PRINT_RES_COMPARISON(DMPI_UNSIGNED,"%u",adr_in,buf_in,compare_in,res_obs,res_expect); + } + if (datatype == MPI_LONG_LONG) { + if (adr_obs) PRINT_ADR_COMPARISON(DMPI_LONG_LONG,"%ld",adr_in,buf_in,compare_in,adr_obs,adr_expect); + if (res_obs) PRINT_RES_COMPARISON(DMPI_LONG_LONG,"%ld",adr_in,buf_in,compare_in,res_obs,res_expect); + } + if (datatype == MPI_UNSIGNED_LONG_LONG) { + if (adr_obs) PRINT_ADR_COMPARISON(DMPI_UNSIGNED_LONG_LONG,"%lu",adr_in,buf_in,compare_in,adr_obs,adr_expect); + if (res_obs) PRINT_RES_COMPARISON(DMPI_UNSIGNED_LONG_LONG,"%lu",adr_in,buf_in,compare_in,res_obs,res_expect); + } + if (datatype == MPI_LONG_DOUBLE) { + if (adr_obs) PRINT_ADR_COMPARISON(DMPI_LONG_DOUBLE,"%Lf",adr_in,buf_in,compare_in,adr_obs,adr_expect); + if (res_obs) PRINT_RES_COMPARISON(DMPI_LONG_DOUBLE,"%Lf",adr_in,buf_in,compare_in,res_obs,res_expect); + } + if (datatype == MPI_DOUBLE) { + if (adr_obs) PRINT_ADR_COMPARISON(DMPI_DOUBLE,"%f",adr_in,buf_in,compare_in,adr_obs,adr_expect); + if (res_obs) PRINT_RES_COMPARISON(DMPI_DOUBLE,"%f",adr_in,buf_in,compare_in,res_obs,res_expect); + } + if (datatype == MPI_LONG_DOUBLE) { + if (adr_obs) PRINT_ADR_COMPARISON(DMPI_LONG_DOUBLE,"%Lf",adr_in,buf_in,compare_in,adr_obs,adr_expect); + if (res_obs) PRINT_RES_COMPARISON(DMPI_LONG_DOUBLE,"%Lf",adr_in,buf_in,compare_in,res_obs,res_expect); + } + if (datatype == MPI_FLOAT) { + if (adr_obs) PRINT_ADR_COMPARISON(DMPI_FLOAT,"%f",adr_in,buf_in,compare_in,adr_obs,adr_expect); + if (res_obs) PRINT_RES_COMPARISON(DMPI_FLOAT,"%f",adr_in,buf_in,compare_in,res_obs,res_expect); + } + if (datatype == MPI_C_FLOAT_COMPLEX) { + if (adr_obs) PRINT_ADR_COMPARISON_CPLX("f",crealf,cimagf,*(float complex*)adr_in,*(float complex*)buf_in,*(float complex*)compare_in,*(float complex*)adr_obs,*(float complex*)adr_expect); + if (res_obs) PRINT_RES_COMPARISON_CPLX("f",crealf,cimagf,*(float complex*)adr_in,*(float complex*)buf_in,*(float complex*)compare_in,*(float complex*)res_obs,*(float complex*)res_expect); + } + if (datatype == MPI_C_DOUBLE_COMPLEX) { + if (adr_obs) PRINT_ADR_COMPARISON_CPLX("f",creal,cimag,*(double complex*)adr_in,*(double complex*)buf_in,*(double complex*)compare_in,*(double complex*)adr_obs,*(double complex*)adr_expect); + if (res_obs) PRINT_RES_COMPARISON_CPLX("f",creal,cimag,*(double complex*)adr_in,*(double complex*)buf_in,*(double complex*)compare_in,*(double complex*)res_obs,*(double complex*)res_expect); + } + if (datatype == MPI_C_LONG_DOUBLE_COMPLEX) { + if (adr_obs) PRINT_ADR_COMPARISON_CPLX("Lf",creall,cimagl,*(long double complex*)adr_in,*(long double complex*)buf_in,*(long double complex*)compare_in,*(long double complex*)adr_obs,*(long double complex*)adr_expect); + if (res_obs) PRINT_RES_COMPARISON_CPLX("Lf",creall,cimagl,*(long double complex*)adr_in,*(long double complex*)buf_in,*(long double complex*)compare_in,*(long double complex*)res_obs,*(long double complex*)res_expect); + } +} + +/** + * Checks the result of an operation descriped by op. + * + * Arguments: + * datatype: the type of data being operated on + * op: The operation to do. -1 means Compare-and-swap. + * jrank: the rank of this processes + * addr: the pointer to a local buffer that may have been operated on + * res: the fetch result from the operation + * buf_size: the number of bytes operated on + * check_addr: true if the remote has done an operation on addr. + * check_result: true if this process did an operation on a remote. + * validation_results: a bitmask which is updated. User should set to 0. + * |=1 when failure occurred. + * |=2 when no validation performed. + * Note that addr and res might be in GPU memory or in system memory. + * Validation will only pass if both local and remote memories were initialized + * with atomic_data_validation_setup. + * + * Additionally validation results are saved in a list and can be printed with + * atomic_data_validation_print_summary(). + */ +int atomic_data_validation_check( + MPI_Datatype datatype, MPI_Op op, int jrank, void *addr, void *res, + size_t buf_size, bool check_addr, bool check_result, int *validation_results + ) +{ + // these all fit the maximum atom size of 256 bits. + const int MAX_ATOM_BYTES=64; + char local_addr[MAX_ATOM_BYTES], remote_addr[MAX_ATOM_BYTES]; + char local_buf[MAX_ATOM_BYTES], remote_buf[MAX_ATOM_BYTES]; + char local_compare[MAX_ATOM_BYTES], remote_compare[MAX_ATOM_BYTES]; + char expected_local_addr[MAX_ATOM_BYTES], dummy_remote_addr[MAX_ATOM_BYTES]; + char expected_local_result[MAX_ATOM_BYTES]; + + char local_addr_in_sysmem[buf_size]; + char local_result_in_sysmem[buf_size]; + int dtype_size; + size_t natoms; + int jatom; + int err, addr_eq, res_eq, any_errors=0; + int jrank_remote = (jrank+1)%2; + + err = MPI_Type_size(datatype, &dtype_size); + if (err) return err; + + natoms = buf_size/dtype_size; + natoms = 1; + + + // setup initial conditions so we can mock the test + err = validation_input_value(datatype, jrank, local_addr); + err |= validation_input_value(datatype, jrank, local_buf); + err |= validation_input_value(datatype, jrank, local_compare); + err |= validation_input_value(datatype, jrank, expected_local_addr); + err |= validation_input_value(datatype, jrank_remote, remote_addr); + err |= validation_input_value(datatype, jrank_remote, remote_buf); + err |= validation_input_value(datatype, jrank_remote, remote_compare); + if (err == -ENODATA) goto nocheck; + if (err) goto error; + + if ((long long)op == -1) { + // mock the remote side performing CAS on our local addr + err = perform_atomic_cas(datatype, local_addr, remote_buf, expected_local_addr, remote_compare, NULL); + // mock the local side performing CAS on remote addr + err |= perform_atomic_cas(datatype, remote_addr, local_buf, dummy_remote_addr, local_compare, expected_local_result); + } + else { + // mock the remote side performing operations on our local addr + err = perform_atomic_op(datatype, op, local_addr, remote_buf, expected_local_addr, remote_compare, NULL); + // mock the local side performing operations on remote addr + err |= perform_atomic_op(datatype, op, remote_addr, local_buf, dummy_remote_addr, local_compare, expected_local_result); + } + if (err == -ENODATA) goto nocheck; + if (err) goto error; + + // if (datatype == MPI_C_LONG_DOUBLE_COMPLEX) { + // printf("Checking: %Lf%+Lfi\t%Lf%+Lfi\t%Lf%+Lfi", + // creall(local_addr),remote_addr, expected_local_addr); + // } + + err = 0; + if (check_addr) err |= get_hmem_buffer(local_addr_in_sysmem, addr, buf_size ); + if (check_result) err |= get_hmem_buffer(local_result_in_sysmem, res, buf_size ); + + if (err) goto error; + for (jatom=0; jatom < natoms; jatom++) { + addr_eq = 1; + res_eq = 1; + if (check_addr) { + addr_eq = atom_binary_compare( datatype, expected_local_addr, + local_addr_in_sysmem + jatom*dtype_size); + } + if (!addr_eq) { + fprintf( stderr, "FAILED: Remote atomic operation %s",osc_tostr(&op, OSC_TYPE_ATOMIC_OP)); + fprintf(stderr, " on %s failed validation of addr at atom index %d.\n", + osc_tostr(&datatype, OSC_TYPE_ATOMIC_TYPE), + jatom ); + print_failure_message( datatype, + local_addr, remote_buf, remote_compare, + local_addr_in_sysmem + jatom*dtype_size, NULL, + expected_local_addr, NULL); + } + if (check_result) { + res_eq = atom_binary_compare( datatype, expected_local_result, + local_result_in_sysmem + jatom*dtype_size); + } + if (!res_eq) { + fprintf( stderr, "FAILED: Local atomic operation %s",osc_tostr(&op, OSC_TYPE_ATOMIC_OP)); + fprintf(stderr, " on %s failed validation of result at atom index %d.\n", + osc_tostr(&datatype, OSC_TYPE_ATOMIC_TYPE), + jatom ); + print_failure_message( datatype, + remote_addr, local_buf, local_compare, + NULL, local_result_in_sysmem + jatom*dtype_size, + NULL, expected_local_result); + } + if (!res_eq || !addr_eq) { + any_errors = 1; + break; + } + } + atomic_dv_record(datatype, op, any_errors, 1); + if (any_errors) *validation_results |= 1; + return 0; + +nocheck: + atomic_dv_record(datatype, op, 0, 0); + *validation_results |= 2; + return 0; +error: + atomic_dv_record(datatype, op, 0, 0); + return err; + + +} + +int is_mpi_cas_allowed(MPI_Datatype dtype) { + if (dtype == MPI_C_FLOAT_COMPLEX) return 0; + if (dtype == MPI_C_DOUBLE_COMPLEX) return 0; + if (dtype == MPI_C_LONG_DOUBLE_COMPLEX) return 0; + return 1; +} + +int is_mpi_op_allowed(MPI_Datatype dtype, MPI_Op op) { + // see MPI standard v4.0 June 2021: Sec 6.9.2, page 226 + // this function is not comprehensive, but it covers + // most of the operations on C types that we intend to test. + + enum data_class { integer, floating_point, floating_complex}; + enum data_class dclass; + + if (dtype == MPI_DATATYPE_NULL) return 0; + if (dtype == MPI_SIGNED_CHAR) dclass = integer; + if (dtype == MPI_UNSIGNED_CHAR) dclass = integer; + if (dtype == MPI_SHORT) dclass = integer; + if (dtype == MPI_UNSIGNED_SHORT) dclass = integer; + if (dtype == MPI_INT) dclass = integer; + if (dtype == MPI_UNSIGNED) dclass = integer; + if (dtype == MPI_LONG) dclass = integer; + if (dtype == MPI_UNSIGNED_LONG) dclass = integer; + if (dtype == MPI_LONG_LONG) dclass = integer; + if (dtype == MPI_UNSIGNED_LONG_LONG) dclass = integer; + if (dtype == MPI_FLOAT) dclass = floating_point; + if (dtype == MPI_DOUBLE) dclass = floating_point; + if (dtype == MPI_LONG_DOUBLE) dclass = floating_point; + if (dtype == MPI_C_FLOAT_COMPLEX) dclass = floating_complex; + if (dtype == MPI_C_DOUBLE_COMPLEX) dclass = floating_complex; + if (dtype == MPI_C_LONG_DOUBLE_COMPLEX) dclass = floating_complex; + + if (op == MPI_MAX || op == MPI_MIN) + return dclass == integer || dclass == floating_point; + if (op == MPI_SUM || op == MPI_PROD) + return dclass == integer || dclass == floating_point || dclass == floating_complex; + if (op == MPI_LAND || op == MPI_LOR || op == MPI_LXOR) + return dclass == integer; + if (op == MPI_BAND || op == MPI_BOR || op == MPI_BXOR) + return dclass == integer; + return 0; +} \ No newline at end of file diff --git a/c/util/osu_util.c b/c/util/osu_util.c index 3839d96..32a9d73 100644 --- a/c/util/osu_util.c +++ b/c/util/osu_util.c @@ -20,7 +20,7 @@ char const * benchmark_header = NULL; char const * benchmark_name = NULL; int accel_enabled = 0; -struct options_t options; +struct options_t options = {0}; struct bad_usage_t bad_usage; @@ -412,6 +412,7 @@ int process_options (int argc, char *argv[]) extern int optind, optopt; char const * optstring = NULL; + char optstring_buf[80]; int c, ret = PO_OKAY; int option_index = 0; @@ -536,13 +537,29 @@ int process_options (int argc, char *argv[]) } } } else if (options.bench == ONE_SIDED) { + int jchar = 0; + + jchar = sprintf(&optstring_buf[jchar], "%s","+:w:s:hvm:x:i:G:"); if(options.subtype == BW) { - optstring = (accel_enabled) ? "+:w:s:hvm:d:x:i:W:G:" : - "+:w:s:hvm:x:i:W:G:P:"; + jchar += sprintf(&optstring_buf[jchar], "%s","W:"); + } + if (accel_enabled) { + jchar += sprintf(&optstring_buf[jchar], "%s","d:"); } else { - optstring = (accel_enabled) ? "+:w:s:hvm:d:x:i:G:" : - "+:w:s:hvm:x:i:G:P:"; + jchar += sprintf(&optstring_buf[jchar], "%s","P:"); + } + if (options.show_validation) { + jchar += sprintf(&optstring_buf[jchar], "%s","c"); } + optstring = optstring_buf; + +// if(options.subtype == BW) { +// optstring = (accel_enabled) ? "+:w:s:hvm:d:x:i:W:G:" : +// "+:w:s:hvm:x:i:W:G:P:"; +// } else { +// optstring = (accel_enabled) ? "+:w:s:hvm:d:x:i:G:" : +// "+:w:s:hvm:x:i:G:P:"; +// } } else if (options.bench == MBW_MR) { optstring = (accel_enabled) ? "p:W:R:x:i:m:d:Vhvb:cu:G:D:" : "p:W:R:x:i:m:Vhvb:cu:G:D:P:"; diff --git a/c/util/osu_util.h b/c/util/osu_util.h index dd947fa..6805672 100644 --- a/c/util/osu_util.h +++ b/c/util/osu_util.h @@ -313,6 +313,7 @@ struct options_t { enum target_type target; int show_size; int show_full; + int show_validation; size_t min_message_size; size_t max_message_size; size_t iterations; diff --git a/c/util/osu_util_mpi.c b/c/util/osu_util_mpi.c index 53fadd7..99243b7 100644 --- a/c/util/osu_util_mpi.c +++ b/c/util/osu_util_mpi.c @@ -206,7 +206,9 @@ void usage_one_sided (char const * name) fprintf(stdout, " [PATH] //PAPI output file path\n"); #endif fprintf(stdout, " -i, --iterations ITER number of iterations for timing (default 10000)\n"); - + if (options.show_validation) { + fprintf(stdout, " -c, --validation Enable or disable validation. Disabled by default. \n"); + } fprintf(stdout, " -h, --help print this help message\n"); fflush(stdout); } @@ -534,10 +536,14 @@ void print_header_one_sided (int rank, enum WINDOW win, enum SYNC sync) 'M' == options.dst ? "MANAGED (M)" : ('D' == options.dst ? "DEVICE (D)" : "HOST (H)")); default: if (options.subtype == BW) { - fprintf(stdout, "%-*s%*s\n", 10, "# Size", FIELD_WIDTH, "Bandwidth (MB/s)"); + fprintf(stdout, "%-*s%*s", 10, "# Size", FIELD_WIDTH, "Bandwidth (MB/s)"); } else { - fprintf(stdout, "%-*s%*s\n", 10, "# Size", FIELD_WIDTH, "Latency (us)"); + fprintf(stdout, "%-*s%*s", 10, "# Size", FIELD_WIDTH, "Latency (us)"); + } + if (options.validate) { + fprintf(stdout, "%*s", FIELD_WIDTH, "Validation"); } + fprintf(stdout, "\n"); fflush(stdout); } } diff --git a/c/util/osu_util_mpi.h b/c/util/osu_util_mpi.h index 53ae48a..3584856 100644 --- a/c/util/osu_util_mpi.h +++ b/c/util/osu_util_mpi.h @@ -159,3 +159,14 @@ size_t omb_ddt_assign(MPI_Datatype *datatype, MPI_Datatype base_datatype, void omb_ddt_free(MPI_Datatype *datatype); size_t omb_ddt_get_size(size_t size); void omb_ddt_append_stats(size_t omb_ddt_transmit_size); + +int atomic_data_validation_setup(MPI_Datatype datatype, int jrank, void *buf, size_t buf_size); +int atomic_data_validation_check(MPI_Datatype datatype, MPI_Op op, int jrank, void *addr, void *res, size_t buf_size, _Bool check_addr, _Bool check_result, int *validation_error_flag); +int atomic_data_validation_print_summary(); +int is_mpi_op_allowed(MPI_Datatype dtype, MPI_Op op); +typedef enum type_of_enum { + OSC_TYPE_ATOMIC_TYPE, + OSC_TYPE_ATOMIC_OP +} type_of_enum; +char* osc_tostr(void *, type_of_enum); +