Merge pull request #7120 from hzhou/2408_req_info

ch4/request: enhance progress debugging

Approved-by: Ken Raffenetti
Esse commit está contido em:
Hui Zhou
2024-10-01 21:41:31 -05:00
commit de GitHub
7 arquivos alterados com 80 adições e 5 exclusões
+2 -2
Ver Arquivo
@@ -297,7 +297,7 @@ extern MPIR_Request MPIR_Request_direct[MPIR_REQUEST_PREALLOC];
#ifdef MPICH_DEBUG_PROGRESS
#define MPIR_REQUEST_SET_INFO(req, ...) \
do { \
MPL_snprintf((req)->info, 100, __VA_ARGS__); \
MPL_snprintf_nowarn((req)->info, 100, __VA_ARGS__); \
} while (0)
#define MPIR_REQUEST_DEBUG(req) \
@@ -308,7 +308,7 @@ extern MPIR_Request MPIR_Request_direct[MPIR_REQUEST_PREALLOC];
} while (0)
#else
#define MPIR_REQUEST_SET_INFO(req, info) do { } while (0)
#define MPIR_REQUEST_SET_INFO(req, ...) do { } while (0)
#define MPIR_REQUEST_DEBUG(req) do { } while (0)
#endif
+6 -3
Ver Arquivo
@@ -69,6 +69,7 @@ cvars:
#ifdef MPICH_DEBUG_PROGRESS
#define PROGRESS_START \
int iter = 0; \
bool progress_timed_out = false; \
MPL_time_t time_start; \
if (MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT > 0) { \
MPL_wtime(&time_start); \
@@ -82,12 +83,14 @@ cvars:
MPL_time_t time_cur; \
MPL_wtime(&time_cur); \
MPL_wtime_diff(&time_start, &time_cur, &time_diff); \
if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT) { \
if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT && !progress_timed_out) { \
MPIR_Request_debug(); \
MPL_backtrace_show(stdout); \
} else { \
iter = 0; \
progress_timed_out = true; \
} else if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT * 2) { \
MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**timeout"); \
} \
iter = 0; \
} \
}
+2
Ver Arquivo
@@ -193,6 +193,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf,
}
}
MPIR_REQUEST_SET_INFO(rreq, "MPIDI_OFI_do_irecv: source=%d, tag=%d, data_sz=%ld", rank, tag,
data_sz);
if (!dt_contig || force_gpu_pack) {
if (MPIDI_OFI_ENABLE_PT2PT_NOPACK && !force_gpu_pack &&
((data_sz < MPIDI_OFI_global.max_msg_size && !MPIDI_OFI_COMM(comm).enable_striping) ||
+30
Ver Arquivo
@@ -20,6 +20,34 @@ MPL_STATIC_INLINE_PREFIX uint64_t MPIDI_OFI_win_read_issued_cntr(MPIR_Win * win)
#endif
}
#ifdef MPICH_DEBUG_PROGRESS
/*
 * Timeout diagnostics for a blocking OFI progress loop.  These expand to real
 * code only when MPICH_DEBUG_PROGRESS is defined, and do work at run time only
 * when MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT is greater than zero.
 *
 * OFI_PROGRESS_START declares `progress_timed_out` and `time_start` in the
 * enclosing scope and records the start time.  Because it expands to
 * declarations it can appear at most once per scope.
 */
#define OFI_PROGRESS_START \
    bool progress_timed_out = false; \
    MPL_time_t time_start; \
    if (MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT > 0) { \
        MPL_wtime(&time_start); \
    }

/*
 * OFI_PROGRESS_CHECK(tcount, donecount)
 *   tcount    - counter value the loop is waiting for (issued operations)
 *   donecount - counter value reached so far (completed operations)
 *
 * The first time the elapsed time exceeds the timeout, prints both counters
 * and a backtrace to stdout.  Once the elapsed time exceeds twice the timeout,
 * raises "**timeout" through MPIR_ERR_SETANDJUMP, which needs `mpi_errno` in
 * scope (and presumably a failure label to jump to - confirm against that
 * macro's definition).
 *
 * The counters are cast to unsigned long long and printed with %llu so the
 * format matches the argument on every ABI; the issued counter is read through
 * a uint64_t-returning helper, for which %ld is not a matching specifier.
 * The completed count is reported as "current" and the issued count as
 * "expect".
 *
 * Wrapped in do { } while (0) so that it is a single statement, like the
 * disabled variant below.
 */
#define OFI_PROGRESS_CHECK(tcount, donecount) \
    do { \
        if (MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT > 0) { \
            double time_diff = 0.0; \
            MPL_time_t time_cur; \
            MPL_wtime(&time_cur); \
            MPL_wtime_diff(&time_start, &time_cur, &time_diff); \
            if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT && !progress_timed_out) { \
                printf("MPIDI_OFI_win_do_progress: current cntr %llu, expect cntr %llu\n", \
                       (unsigned long long) (donecount), (unsigned long long) (tcount)); \
                MPL_backtrace_show(stdout); \
                progress_timed_out = true; \
            } else if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT * 2) { \
                MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**timeout"); \
            } \
        } \
    } while (0)
#else
/* Diagnostics disabled: both macros are empty statements. */
#define OFI_PROGRESS_START do {} while (0)
#define OFI_PROGRESS_CHECK(tcount, donecount) do {} while (0)
#endif
/*
* Blocking progress function to complete outstanding RMA operations on the input window.
*/
@@ -33,6 +61,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_win_do_progress(MPIR_Win * win, int vci)
MPIR_FUNC_ENTER;
OFI_PROGRESS_START;
while (1) {
tcount = MPIDI_OFI_win_read_issued_cntr(win);
donecount = fi_cntr_read(MPIDI_OFI_WIN(win).cmpl_cntr);
@@ -55,6 +84,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_win_do_progress(MPIR_Win * win, int vci)
"**ofid_cntr_wait", "**ofid_cntr_wait %s %d %s %s",
__SHORT_FILE__, __LINE__, __func__, fi_strerror(-ret));
itercount = 0;
OFI_PROGRESS_CHECK(tcount, donecount);
}
}
+36
Ver Arquivo
@@ -405,23 +405,59 @@ MPL_STATIC_INLINE_PREFIX int MPIDIU_valid_group_rank(MPIR_Comm * comm, int rank,
* blocking other progress (under global granularity).
*/
#ifdef MPICH_DEBUG_PROGRESS
/*
 * Debug-only timeout tracking for the MPIDIU progress loops.
 *
 * MPIDIU_PROGRESS_START declares the bookkeeping variables (`iter`,
 * `progress_timed_out`, `time_start`) in the enclosing scope and, when
 * MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT is positive, records the start time.
 */
#define MPIDIU_PROGRESS_START \
    MPL_time_t time_start; \
    bool progress_timed_out = false; \
    int iter = 0; \
    if (MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT > 0) { \
        MPL_wtime(&time_start); \
    }

/*
 * MPIDIU_PROGRESS_CHECK counts invocations and looks at the clock only on
 * every 0xffff-th call.  The first time the elapsed time is past the timeout
 * it prints a backtrace to stdout; once the elapsed time is past twice the
 * timeout it raises "**timeout" through MPIR_ERR_SETANDJUMP (which uses
 * `mpi_errno` from the enclosing scope).
 */
#define MPIDIU_PROGRESS_CHECK \
    if (MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT > 0) { \
        if (++iter == 0xffff) { \
            MPL_time_t now; \
            double elapsed = 0.0; \
            MPL_wtime(&now); \
            MPL_wtime_diff(&time_start, &now, &elapsed); \
            if (!progress_timed_out && elapsed > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT) { \
                MPL_backtrace_show(stdout); \
                progress_timed_out = true; \
            } else if (elapsed > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT * 2) { \
                MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**timeout"); \
            } \
            iter = 0; \
        } \
    }
#else
/* Without MPICH_DEBUG_PROGRESS both hooks are empty statements. */
#define MPIDIU_PROGRESS_START do { } while (0)
#define MPIDIU_PROGRESS_CHECK do { } while (0)
#endif
/* declare to avoid header order dance */
MPL_STATIC_INLINE_PREFIX int MPIDI_progress_test_vci(int vci);
/*
 * MPIDIU_PROGRESS_WHILE(cond, vci)
 *   Calls MPIDI_progress_test_vci(vci) for as long as `cond` evaluates true,
 *   checking `cond` before each iteration.  After every progress call the
 *   result is stored in `mpi_errno` (which must be in scope) and passed to
 *   MPIR_ERR_CHECK, both critical sections are yielded, and
 *   MPIDIU_PROGRESS_CHECK runs.
 *
 * The whole expansion is wrapped in do { } while (0): it behaves as a single
 * statement (safe under an unbraced if/else, terminated by the caller's
 * semicolon) and the variables declared by MPIDIU_PROGRESS_START stay local
 * to each use, so the macro can be used more than once in one scope.
 */
#define MPIDIU_PROGRESS_WHILE(cond, vci) \
    do { \
        MPIDIU_PROGRESS_START; \
        while (cond) { \
            mpi_errno = MPIDI_progress_test_vci(vci); \
            MPIR_ERR_CHECK(mpi_errno); \
            MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); \
            MPID_THREAD_CS_YIELD(VCI, MPIDI_VCI(vci).lock); \
            MPIDIU_PROGRESS_CHECK; \
        } \
    } while (0)

/*
 * MPIDIU_PROGRESS_DO_WHILE(cond, vci)
 *   Same as MPIDIU_PROGRESS_WHILE, except that progress is made at least once
 *   because `cond` is evaluated after each iteration instead of before it.
 */
#define MPIDIU_PROGRESS_DO_WHILE(cond, vci) \
    do { \
        MPIDIU_PROGRESS_START; \
        do { \
            mpi_errno = MPIDI_progress_test_vci(vci); \
            MPIR_ERR_CHECK(mpi_errno); \
            MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); \
            MPID_THREAD_CS_YIELD(VCI, MPIDI_VCI(vci).lock); \
            MPIDIU_PROGRESS_CHECK; \
        } while (cond); \
    } while (0)
#ifdef HAVE_ERROR_CHECKING
+2
Ver Arquivo
@@ -207,6 +207,8 @@ static int create_unexp_rreq(int rank, int tag, MPIR_Context_id_t context_id,
MPIR_Request *rreq = MPIDIG_request_create(MPIR_REQUEST_KIND__RECV, 2, local_vci, remote_vci);
MPIR_ERR_CHKANDSTMT(rreq == NULL, mpi_errno, MPIX_ERR_NOREQ, goto fn_fail, "**nomemreq");
MPIR_REQUEST_SET_INFO(rreq, "create_unexp_rreq: source=%d, tag=%d, data_sz=%ld", rank, tag,
data_sz);
*req = rreq;
+2
Ver Arquivo
@@ -285,6 +285,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_do_irecv(void *buf, MPI_Aint count, MPI_Data
MPIR_Datatype_add_ref_if_not_builtin(datatype);
MPIDIG_prepare_recv_req(rank, tag, context_id, buf, count, datatype, rreq);
MPIDIG_enqueue_request(rreq, &MPIDI_global.per_vci[vci].posted_list, MPIDIG_PT2PT_POSTED);
MPIR_REQUEST_SET_INFO(rreq, "MPIDIG_do_irecv: source=%d, tag=%d, count=%ld, datatype=%x",
rank, tag, count, datatype);
}
fn_exit: