Merge pull request #7120 from hzhou/2408_req_info
ch4/request: enhance progress debugging
Approved-by: Ken Raffenetti
Esse commit está contido em:
@@ -297,7 +297,7 @@ extern MPIR_Request MPIR_Request_direct[MPIR_REQUEST_PREALLOC];
|
||||
#ifdef MPICH_DEBUG_PROGRESS
|
||||
#define MPIR_REQUEST_SET_INFO(req, ...) \
|
||||
do { \
|
||||
MPL_snprintf((req)->info, 100, __VA_ARGS__); \
|
||||
MPL_snprintf_nowarn((req)->info, 100, __VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define MPIR_REQUEST_DEBUG(req) \
|
||||
@@ -308,7 +308,7 @@ extern MPIR_Request MPIR_Request_direct[MPIR_REQUEST_PREALLOC];
|
||||
} while (0)
|
||||
#else
|
||||
|
||||
#define MPIR_REQUEST_SET_INFO(req, info) do { } while (0)
|
||||
#define MPIR_REQUEST_SET_INFO(req, ...) do { } while (0)
|
||||
#define MPIR_REQUEST_DEBUG(req) do { } while (0)
|
||||
#endif
|
||||
|
||||
|
||||
@@ -69,6 +69,7 @@ cvars:
|
||||
#ifdef MPICH_DEBUG_PROGRESS
|
||||
#define PROGRESS_START \
|
||||
int iter = 0; \
|
||||
bool progress_timed_out = false; \
|
||||
MPL_time_t time_start; \
|
||||
if (MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT > 0) { \
|
||||
MPL_wtime(&time_start); \
|
||||
@@ -82,12 +83,14 @@ cvars:
|
||||
MPL_time_t time_cur; \
|
||||
MPL_wtime(&time_cur); \
|
||||
MPL_wtime_diff(&time_start, &time_cur, &time_diff); \
|
||||
if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT) { \
|
||||
if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT && !progress_timed_out) { \
|
||||
MPIR_Request_debug(); \
|
||||
MPL_backtrace_show(stdout); \
|
||||
} else { \
|
||||
iter = 0; \
|
||||
progress_timed_out = true; \
|
||||
} else if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT * 2) { \
|
||||
MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**timeout"); \
|
||||
} \
|
||||
iter = 0; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
@@ -193,6 +193,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf,
|
||||
}
|
||||
}
|
||||
|
||||
MPIR_REQUEST_SET_INFO(rreq, "MPIDI_OFI_do_irecv: source=%d, tag=%d, data_sz=%ld", rank, tag,
|
||||
data_sz);
|
||||
if (!dt_contig || force_gpu_pack) {
|
||||
if (MPIDI_OFI_ENABLE_PT2PT_NOPACK && !force_gpu_pack &&
|
||||
((data_sz < MPIDI_OFI_global.max_msg_size && !MPIDI_OFI_COMM(comm).enable_striping) ||
|
||||
|
||||
@@ -20,6 +20,34 @@ MPL_STATIC_INLINE_PREFIX uint64_t MPIDI_OFI_win_read_issued_cntr(MPIR_Win * win)
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef MPICH_DEBUG_PROGRESS
/*
 * Debug-only timeout tracking for a blocking progress loop.
 *
 * OFI_PROGRESS_START declares progress_timed_out and time_start directly in
 * the caller's scope (so it can appear only once per scope) and records the
 * start time when MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT is positive. time_start is
 * left unset otherwise; OFI_PROGRESS_CHECK reads it only under the same test.
 */
#define OFI_PROGRESS_START \
    bool progress_timed_out = false; \
    MPL_time_t time_start; \
    if (MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT > 0) { \
        MPL_wtime(&time_start); \
    }

/*
 * OFI_PROGRESS_CHECK(tcount, donecount)
 *   tcount    - number of operations issued, i.e. the value the completion
 *               counter is expected to reach
 *   donecount - value currently read from the completion counter
 *
 * Active only when MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT is positive. The first
 * time the elapsed time exceeds the timeout, both counters and a backtrace
 * are written to stdout and progress_timed_out is set. On a later call, once
 * the elapsed time exceeds twice the timeout, "**timeout" is raised through
 * MPIR_ERR_SETANDJUMP, which needs mpi_errno in the caller's scope
 * (presumably together with the caller's failure label - confirm against the
 * macro definition).
 *
 * The counters are widened to unsigned long long so the conversion
 * specifiers match the arguments whatever the width of long is.
 */
#define OFI_PROGRESS_CHECK(tcount, donecount) \
    if (MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT > 0) { \
        double time_diff = 0.0; \
        MPL_time_t time_cur; \
        MPL_wtime(&time_cur); \
        MPL_wtime_diff(&time_start, &time_cur, &time_diff); \
        if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT && !progress_timed_out) { \
            printf("MPIDI_OFI_win_do_progress: current cntr %llu, expect cntr %llu\n", \
                   (unsigned long long) (donecount), (unsigned long long) (tcount)); \
            MPL_backtrace_show(stdout); \
            progress_timed_out = true; \
        } else if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT * 2) { \
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**timeout"); \
        } \
    }

#else
/* Without MPICH_DEBUG_PROGRESS both macros expand to empty statements. */
#define OFI_PROGRESS_START do {} while (0)
#define OFI_PROGRESS_CHECK(tcount, donecount) do {} while (0)
#endif
|
||||
|
||||
/*
|
||||
* Blocking progress function to complete outstanding RMA operations on the input window.
|
||||
*/
|
||||
@@ -33,6 +61,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_win_do_progress(MPIR_Win * win, int vci)
|
||||
|
||||
MPIR_FUNC_ENTER;
|
||||
|
||||
OFI_PROGRESS_START;
|
||||
while (1) {
|
||||
tcount = MPIDI_OFI_win_read_issued_cntr(win);
|
||||
donecount = fi_cntr_read(MPIDI_OFI_WIN(win).cmpl_cntr);
|
||||
@@ -55,6 +84,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_win_do_progress(MPIR_Win * win, int vci)
|
||||
"**ofid_cntr_wait", "**ofid_cntr_wait %s %d %s %s",
|
||||
__SHORT_FILE__, __LINE__, __func__, fi_strerror(-ret));
|
||||
itercount = 0;
|
||||
OFI_PROGRESS_CHECK(tcount, donecount);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -405,23 +405,59 @@ MPL_STATIC_INLINE_PREFIX int MPIDIU_valid_group_rank(MPIR_Comm * comm, int rank,
|
||||
* blocking other progress (under global granularity).
|
||||
*/
|
||||
|
||||
#ifdef MPICH_DEBUG_PROGRESS
|
||||
/* Debug-build setup for a progress wait loop: declares iter,
 * progress_timed_out and time_start directly in the caller's scope and, when
 * MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT is positive, records the start time.
 * time_start is left unset when the cvar is not positive;
 * MPIDIU_PROGRESS_CHECK reads it only under the same cvar test. Because the
 * expansion consists of declarations, it can appear only once per scope. */
#define MPIDIU_PROGRESS_START \
|
||||
int iter = 0; \
|
||||
bool progress_timed_out = false; \
|
||||
MPL_time_t time_start; \
|
||||
if (MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT > 0) { \
|
||||
MPL_wtime(&time_start); \
|
||||
}
|
||||
|
||||
/* Per-iteration timeout check, active only when
 * MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT is positive. iter counts calls and the
 * clock is sampled only when it reaches 0xffff, after which iter restarts at
 * zero. The first sample past the timeout prints a backtrace to stdout and
 * sets progress_timed_out; a later sample past twice the timeout raises
 * "**timeout" through MPIR_ERR_SETANDJUMP (presumably leaving the loop via
 * the caller's error path - confirm against that macro). Relies on the
 * variables declared by MPIDIU_PROGRESS_START and on mpi_errno being in
 * scope. */
#define MPIDIU_PROGRESS_CHECK \
|
||||
if (MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT > 0) { \
|
||||
iter++; \
|
||||
if (iter == 0xffff) {\
|
||||
double time_diff = 0.0; \
|
||||
MPL_time_t time_cur; \
|
||||
MPL_wtime(&time_cur); \
|
||||
MPL_wtime_diff(&time_start, &time_cur, &time_diff); \
|
||||
if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT && !progress_timed_out) { \
|
||||
MPL_backtrace_show(stdout); \
|
||||
progress_timed_out = true; \
|
||||
} else if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT * 2) { \
|
||||
MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**timeout"); \
|
||||
} \
|
||||
iter = 0; \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
/* Without MPICH_DEBUG_PROGRESS both macros expand to empty statements. */
#define MPIDIU_PROGRESS_START do {} while (0)
|
||||
#define MPIDIU_PROGRESS_CHECK do {} while (0)
|
||||
#endif
|
||||
|
||||
/* declare to avoid header order dance */
|
||||
MPL_STATIC_INLINE_PREFIX int MPIDI_progress_test_vci(int vci);
|
||||
|
||||
/* Drive progress on `vci` for as long as `cond` holds (cond is tested before
 * every pass, so the body may run zero times). Each pass stores the result of
 * MPIDI_progress_test_vci(vci) in the caller's mpi_errno, hands it to
 * MPIR_ERR_CHECK, invokes MPID_THREAD_CS_YIELD for the GLOBAL and VCI
 * granularities, and then runs MPIDIU_PROGRESS_CHECK.
 * NOTE(review): the expansion starts with MPIDIU_PROGRESS_START, which in
 * MPICH_DEBUG_PROGRESS builds declares variables in the caller's scope, so
 * this macro is not a single statement and should be used at most once per
 * scope. */
#define MPIDIU_PROGRESS_WHILE(cond, vci) \
|
||||
MPIDIU_PROGRESS_START; \
|
||||
while (cond) { \
|
||||
mpi_errno = MPIDI_progress_test_vci(vci); \
|
||||
MPIR_ERR_CHECK(mpi_errno); \
|
||||
MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); \
|
||||
MPID_THREAD_CS_YIELD(VCI, MPIDI_VCI(vci).lock); \
|
||||
MPIDIU_PROGRESS_CHECK; \
|
||||
}
|
||||
|
||||
/* Drive progress on `vci` at least once, repeating while `cond` holds (cond
 * is tested after each pass). Each pass stores the result of
 * MPIDI_progress_test_vci(vci) in the caller's mpi_errno, hands it to
 * MPIR_ERR_CHECK, invokes MPID_THREAD_CS_YIELD for the GLOBAL and VCI
 * granularities, and then runs MPIDIU_PROGRESS_CHECK. The expansion ends
 * without a semicolon, so the caller supplies it.
 * NOTE(review): the expansion starts with MPIDIU_PROGRESS_START, which in
 * MPICH_DEBUG_PROGRESS builds declares variables in the caller's scope, so
 * this macro is not a single statement and should be used at most once per
 * scope. */
#define MPIDIU_PROGRESS_DO_WHILE(cond, vci) \
|
||||
MPIDIU_PROGRESS_START; \
|
||||
do { \
|
||||
mpi_errno = MPIDI_progress_test_vci(vci); \
|
||||
MPIR_ERR_CHECK(mpi_errno); \
|
||||
MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); \
|
||||
MPID_THREAD_CS_YIELD(VCI, MPIDI_VCI(vci).lock); \
|
||||
MPIDIU_PROGRESS_CHECK; \
|
||||
} while (cond)
|
||||
|
||||
#ifdef HAVE_ERROR_CHECKING
|
||||
|
||||
@@ -207,6 +207,8 @@ static int create_unexp_rreq(int rank, int tag, MPIR_Context_id_t context_id,
|
||||
|
||||
MPIR_Request *rreq = MPIDIG_request_create(MPIR_REQUEST_KIND__RECV, 2, local_vci, remote_vci);
|
||||
MPIR_ERR_CHKANDSTMT(rreq == NULL, mpi_errno, MPIX_ERR_NOREQ, goto fn_fail, "**nomemreq");
|
||||
MPIR_REQUEST_SET_INFO(rreq, "create_unexp_rreq: source=%d, tag=%d, data_sz=%ld", rank, tag,
|
||||
data_sz);
|
||||
|
||||
*req = rreq;
|
||||
|
||||
|
||||
@@ -285,6 +285,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_do_irecv(void *buf, MPI_Aint count, MPI_Data
|
||||
MPIR_Datatype_add_ref_if_not_builtin(datatype);
|
||||
MPIDIG_prepare_recv_req(rank, tag, context_id, buf, count, datatype, rreq);
|
||||
MPIDIG_enqueue_request(rreq, &MPIDI_global.per_vci[vci].posted_list, MPIDIG_PT2PT_POSTED);
|
||||
MPIR_REQUEST_SET_INFO(rreq, "MPIDIG_do_irecv: source=%d, tag=%d, count=%ld, datatype=%x",
|
||||
rank, tag, count, datatype);
|
||||
}
|
||||
|
||||
fn_exit:
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário