diff --git a/configure.ac b/configure.ac index 30b6098..379af10 100644 --- a/configure.ac +++ b/configure.ac @@ -126,7 +126,7 @@ if test "$ac_cv_prog_cc_c99" = "no" ; then AC_ERROR([C99 not supported by the compiler]) fi -AC_CHECK_HEADERS([execinfo.h string.h strings.h stdint.h inttypes.h unistd.h errno.h time.h sys/time.h]) +AC_CHECK_HEADERS([execinfo.h string.h strings.h stdint.h stdbool.h inttypes.h unistd.h errno.h time.h sys/time.h]) AC_TYPE_UINT8_T # asynchronous progress diff --git a/src/armci_internals.h b/src/armci_internals.h index 3dbfb81..405db46 100644 --- a/src/armci_internals.h +++ b/src/armci_internals.h @@ -105,6 +105,7 @@ typedef struct { int progress_usleep; /* Argument to usleep() to throttling polling */ #endif int use_win_allocate; /* Use win_allocate or win_create (or special memory...) */ + int msg_barrier_syncs; /* Call MPI_Win_sync in armci_msg_barrier */ int explicit_nb_progress; /* Poke the MPI progress engine at the end of nonblocking (NB) calls */ int use_alloc_shm; /* Pass alloc_shm info to win_allocate / alloc_mem */ int rma_atomicity; /* Use Accumulate and Get_accumulate for Put and Get */ @@ -230,4 +231,7 @@ void ARMCII_Buf_finish_write_vec(void **orig_bufs, void **new_bufs, int count, i int ARMCII_Buf_acc_is_scaled(int datatype, void *scale); void ARMCII_Buf_acc_scale(void *buf_in, void *buf_out, int size, int datatype, void *scale); +int ARMCII_Is_win_unified(MPI_Win win); +void ARMCII_Sync(void); + #endif /* HAVE_ARMCI_INTERNALS_H */ diff --git a/src/gmr-extras.c b/src/gmr-extras.c index 8ca2930..d180b0a 100644 --- a/src/gmr-extras.c +++ b/src/gmr-extras.c @@ -190,13 +190,18 @@ int gmr_flushall(gmr_t *mreg, int local_only) { * @param[in] mreg Memory region * @return 0 on success, non-zero on failure */ -int gmr_sync(gmr_t *mreg) { - int grp_me = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank); - +int gmr_sync(gmr_t *mreg) +{ +#if 0 + // what is the point of this? 
+ int grp_me = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank); ARMCII_Assert(grp_me >= 0); +#endif ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); - MPI_Win_sync(mreg->window); + if (!(mreg->unified)) { + MPI_Win_sync(mreg->window); + } return 0; } diff --git a/src/gmr.c b/src/gmr.c index 1d70e92..bdb2e8d 100644 --- a/src/gmr.c +++ b/src/gmr.c @@ -72,6 +72,7 @@ gmr_t *gmr_create(gmr_size_t local_size, void **base_ptrs, ARMCI_Group *group) { mreg->nslices = world_nproc; mreg->prev = NULL; mreg->next = NULL; + mreg->unified = false; /* Allocate my slice of the GMR */ alloc_slices[alloc_me].size = local_size; @@ -213,7 +214,8 @@ gmr_t *gmr_create(gmr_size_t local_size, void **base_ptrs, ARMCI_Group *group) { mreg->window); { - int unified = 0; +#if 0 + int unified = false; void *attr_ptr; int *attr_val; int attr_flag; @@ -224,23 +226,38 @@ gmr_t *gmr_create(gmr_size_t local_size, void **base_ptrs, ARMCI_Group *group) { if (world_me==0) { if ( (*attr_val)==MPI_WIN_SEPARATE ) { printf("MPI_WIN_MODEL = MPI_WIN_SEPARATE \n" ); - unified = 0; + unified = false; } else if ( (*attr_val)==MPI_WIN_UNIFIED ) { #ifdef DEBUG printf("MPI_WIN_MODEL = MPI_WIN_UNIFIED \n" ); #endif - unified = 1; + unified = true; } else { printf("MPI_WIN_MODEL = %d (not UNIFIED or SEPARATE) \n", *attr_val ); - unified = 0; + unified = false; } } } else { if (world_me==0) { printf("MPI_WIN_MODEL attribute missing \n"); } + unified = false; } - if (!unified && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD) ) { +#else + int unified = ARMCII_Is_win_unified(mreg->window); + int print = ARMCII_GLOBAL_STATE.verbose; + if (unified == 1) { + mreg->unified = true; + if (print) printf("MPI_WIN_MODEL = MPI_WIN_UNIFIED\n"); + } else if (unified == 0) { + mreg->unified = false; + if (print) printf("MPI_WIN_MODEL = MPI_WIN_SEPARATE\n"); + } else { + mreg->unified = false; + if (print) printf("MPI_WIN_MODEL not available\n"); + } 
+#endif + if (!(mreg->unified) && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD) ) { if (world_me==0) { printf("Please re-run with ARMCI_SHR_BUF_METHOD=COPY\n"); } diff --git a/src/gmr.h b/src/gmr.h index 2cbc90f..5a45317 100644 --- a/src/gmr.h +++ b/src/gmr.h @@ -10,6 +10,14 @@ #include #include +#ifdef HAVE_STDBOOL_H +#include <stdbool.h> +#else +typedef int bool; +#define false 0 +#define true 1 +#endif + typedef armci_size_t gmr_size_t; #define GMR_MPI_SIZE_T ARMCII_MPI_SIZE_T @@ -26,6 +34,7 @@ typedef struct gmr_s { struct gmr_s *next; gmr_slice_t *slices; /* Array of GMR slices for this allocation */ int nslices; + bool unified; /* separate/unified attribute of the window */ } gmr_t; extern gmr_t *gmr_list; diff --git a/src/init_finalize.c b/src/init_finalize.c index a48e40b..9ad7150 100644 --- a/src/init_finalize.c +++ b/src/init_finalize.c @@ -357,7 +357,10 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { } /* Use win_allocate or not, to work around MPI-3 RMA implementation bugs. */ - ARMCII_GLOBAL_STATE.use_win_allocate=ARMCII_Getenv_int("ARMCI_USE_WIN_ALLOCATE", 1); + ARMCII_GLOBAL_STATE.use_win_allocate = ARMCII_Getenv_bool("ARMCI_USE_WIN_ALLOCATE", 1); + + /* Do MPI_Win_sync in armci_msg_barrier */ + ARMCII_GLOBAL_STATE.msg_barrier_syncs = ARMCII_Getenv_bool("ARMCI_MSG_BARRIER_SYNCS", 0); /* Equivalent to ARMCI_Set_shm_limit - determines the size of: * - MPI_Win_allocate slab in the case of slab allocation @@ -538,6 +541,7 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { printf(" RMA_ATOMICITY = %s\n", ARMCII_GLOBAL_STATE.rma_atomicity ? "TRUE" : "FALSE"); printf(" NO_FLUSH_LOCAL = %s\n", ARMCII_GLOBAL_STATE.end_to_end_flush ? "TRUE" : "FALSE"); printf(" RMA_NOCHECK = %s\n", ARMCII_GLOBAL_STATE.rma_nocheck ? "TRUE" : "FALSE"); + printf(" MSG_BARRIER_SYNCS = %s\n", ARMCII_GLOBAL_STATE.msg_barrier_syncs ? 
"TRUE" : "FALSE"); /* MPI info set on window */ printf(" USE_ALLOC_SHM = %s\n", ARMCII_GLOBAL_STATE.use_alloc_shm ? "TRUE" : "FALSE"); diff --git a/src/internals.c b/src/internals.c index 48f44d6..f4893f3 100644 --- a/src/internals.c +++ b/src/internals.c @@ -56,19 +56,23 @@ int ARMCII_Translate_absolute_to_group(ARMCI_Group *group, int world_rank) { MPI_Group world_group, sub_group; ARMCII_Assert(world_rank >= 0 && world_rank < ARMCI_GROUP_WORLD.size); - +#if 0 + // this is redundant - the assert checks this if (!(0 <= world_rank && world_rank < ARMCI_GROUP_WORLD.size)) { ARMCII_Warning("world_rank (%d) is not in the range [0,ARMCI_GROUP_WORLD.size=%d)!\n", world_rank, ARMCI_GROUP_WORLD.size); } +#endif /* Check if group is the world group */ if (group->comm == ARMCI_GROUP_WORLD.comm) { group_rank = world_rank; + return group_rank; } /* Check for translation cache */ else if (group->grp_to_abs != NULL) { group_rank = group->abs_to_grp[world_rank]; + return group_rank; } else { /* Translate the rank */ diff --git a/src/message.c b/src/message.c index 5002237..2bcb5f5 100644 --- a/src/message.c +++ b/src/message.c @@ -108,6 +108,11 @@ void armci_msg_bcast_scope(int scope, void *buffer, int len, int root) { */ void parmci_msg_barrier(void) { MPI_Barrier(ARMCI_GROUP_WORLD.comm); + + if (ARMCII_GLOBAL_STATE.msg_barrier_syncs) { + ARMCII_Sync(); + MPI_Barrier(ARMCI_GROUP_WORLD.comm); + } } @@ -127,6 +132,11 @@ void parmci_msg_barrier(void) { */ void parmci_msg_group_barrier(ARMCI_Group *group) { MPI_Barrier(group->comm); + + if (ARMCII_GLOBAL_STATE.msg_barrier_syncs) { + ARMCII_Sync(); + MPI_Barrier(group->comm); + } } diff --git a/src/strided.c b/src/strided.c index cb2a68f..85370d7 100644 --- a/src/strided.c +++ b/src/strided.c @@ -137,11 +137,10 @@ int PARMCI_PutS(void *src_ptr, int src_stride_ar[/*stride_levels*/], } } else { - /* Jeff: WIN_UNIFIED should allow overlap to work but we - * do a memory barrier here to be safe. 
*/ gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank); - if (gmr_loc != NULL) + if (gmr_loc != NULL) { gmr_sync(gmr_loc); + } } /* NOGUARD: If src_buf hasn't been assigned to a copy, the strided source @@ -238,11 +237,10 @@ int PARMCI_GetS(void *src_ptr, int src_stride_ar[/*stride_levels*/], } } else { - /* Jeff: WIN_UNIFIED should allow overlap to work but we - * do a memory barrier here to be safe. */ gmr_loc = gmr_lookup(dst_ptr, ARMCI_GROUP_WORLD.rank); - if (gmr_loc != NULL) + if (gmr_loc != NULL) { gmr_sync(gmr_loc); + } } /* NOGUARD: If dst_buf hasn't been assigned to a copy, the strided source @@ -376,11 +374,10 @@ int PARMCI_AccS(int datatype, void *scale, } } else { - /* Jeff: WIN_UNIFIED should allow overlap to work but we - * do a memory barrier here to be safe. */ gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank); - if (gmr_loc != NULL) + if (gmr_loc != NULL) { gmr_sync(gmr_loc); + } } /* NOGUARD: If src_buf hasn't been assigned to a copy, the strided source diff --git a/src/strided_nb.c b/src/strided_nb.c index 2dfc7b6..7f4f748 100644 --- a/src/strided_nb.c +++ b/src/strided_nb.c @@ -66,11 +66,10 @@ int PARMCI_NbPutS(void *src_ptr, int src_stride_ar[/*stride_levels*/], } } else { - /* Jeff: WIN_UNIFIED should allow overlap to work but we - * do a memory barrier here to be safe. */ gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank); - if (gmr_loc != NULL) + if (gmr_loc != NULL) { gmr_sync(gmr_loc); + } } /* NOGUARD: If src_buf hasn't been assigned to a copy, the strided source @@ -174,11 +173,10 @@ int PARMCI_NbGetS(void *src_ptr, int src_stride_ar[/*stride_levels*/], } } else { - /* Jeff: WIN_UNIFIED should allow overlap to work but we - * do a memory barrier here to be safe. 
*/ gmr_loc = gmr_lookup(dst_ptr, ARMCI_GROUP_WORLD.rank); - if (gmr_loc != NULL) + if (gmr_loc != NULL) { gmr_sync(gmr_loc); + } } /* NOGUARD: If dst_buf hasn't been assigned to a copy, the strided source @@ -318,11 +316,10 @@ int PARMCI_NbAccS(int datatype, void *scale, } } else { - /* Jeff: WIN_UNIFIED should allow overlap to work but we - * do a memory barrier here to be safe. */ gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank); - if (gmr_loc != NULL) + if (gmr_loc != NULL) { gmr_sync(gmr_loc); + } } /* NOGUARD: If src_buf hasn't been assigned to a copy, the strided source diff --git a/src/util.c b/src/util.c index e99e963..f9b4089 100644 --- a/src/util.c +++ b/src/util.c @@ -37,15 +37,9 @@ void ARMCI_Error(const char *msg, int code) { * group!). */ void PARMCI_Barrier(void) { - gmr_t *cur_mreg = gmr_list; - PARMCI_AllFence(); MPI_Barrier(ARMCI_GROUP_WORLD.comm); - - while (cur_mreg) { - gmr_sync(cur_mreg); - cur_mreg = cur_mreg->next; - } + ARMCII_Sync(); } /* -- begin weak symbols block -- */ @@ -59,8 +53,6 @@ void PARMCI_Barrier(void) { /* -- end weak symbols block -- */ /** Wait for remote completion on one-sided operations targeting process proc. - * In MPI-2, this is a no-op since get/put/acc already guarantee remote - * completion. * * @param[in] proc Process to target */ @@ -85,8 +77,7 @@ void PARMCI_Fence(int proc) { #endif /* -- end weak symbols block -- */ -/** Wait for remote completion on all one-sided operations. In MPI-2, this is - * a no-op since get/put/acc already guarantee remote completion. +/** Wait for remote completion on all one-sided operations. */ void PARMCI_AllFence(void) { gmr_t *cur_mreg = gmr_list; @@ -122,7 +113,6 @@ int ARMCI_Uses_shm_grp(ARMCI_Group *group) { return 0; } - /** Copy local data. 
* * @param[in] src Source buffer @@ -228,3 +218,14 @@ int ARMCII_Is_win_unified(MPI_Win win) return -1; } } + +/** Sync all windows + */ +void ARMCII_Sync(void) { + gmr_t *cur_mreg = gmr_list; + + while (cur_mreg) { + gmr_sync(cur_mreg); + cur_mreg = cur_mreg->next; + } +}