request: abort on progress timeout

Since some launchers hold console output, this commit makes the process
abort on timeout to make debugging a progress hang a bit easier. We delay
the abort until after the stack backtrace is first dumped, so that other
processes can also dump their progress backtraces before they are killed.
Esse commit está contido em:
Hui Zhou
2024-08-28 15:25:24 -05:00
commit 9f3bbf33a2
+6 -3
Ver Arquivo
@@ -69,6 +69,7 @@ cvars:
#ifdef MPICH_DEBUG_PROGRESS
#define PROGRESS_START \
int iter = 0; \
bool progress_timed_out = false; \
MPL_time_t time_start; \
if (MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT > 0) { \
MPL_wtime(&time_start); \
@@ -82,12 +83,14 @@ cvars:
MPL_time_t time_cur; \
MPL_wtime(&time_cur); \
MPL_wtime_diff(&time_start, &time_cur, &time_diff); \
if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT) { \
if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT && !progress_timed_out) { \
MPIR_Request_debug(); \
MPL_backtrace_show(stdout); \
} else { \
iter = 0; \
progress_timed_out = true; \
} else if (time_diff > MPIR_CVAR_DEBUG_PROGRESS_TIMEOUT * 2) { \
MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**timeout"); \
} \
iter = 0; \
} \
}