Merge pull request #6907 from hzhou/2311_bench

test: add p2p benchmark code

Approved-by: Ken Raffenetti
Esse commit está contido em:
Hui Zhou
2024-10-02 16:59:17 -05:00
commit de GitHub
15 arquivos alterados com 310 adições e 2 exclusões
+3
Ver Arquivo
@@ -14,3 +14,6 @@
[submodule "modules/yaksa"]
path = modules/yaksa
url = https://github.com/pmodels/yaksa
[submodule "modules/mydef_boot"]
path = modules/mydef_boot
url = https://github.com/pmodels/mydef_boot
+13 -2
Ver Arquivo
@@ -65,6 +65,7 @@ do_hydra=yes
do_romio=yes
do_pmi=yes
do_doc=no
do_mydef=yes
yaksa_depth=
@@ -536,6 +537,14 @@ fn_json_gen() {
echo "done"
}
# Generate the MyDef-based benchmark sources under test/mpi/bench.
#
# Prepends the bundled modules/mydef_boot/bin directory to PATH (presumably
# where mydef_page lives -- confirm against the submodule) and points PERL5LIB
# and MYDEFLIB at the submodule's library directories. Note that PERL5LIB is
# replaced, not extended. The bench autogen.sh runs in a subshell so the
# caller's working directory is left untouched.
fn_mydef() {
    # Quote every expansion: some /bin/sh implementations word-split the
    # arguments of `export NAME=$VAR`, which breaks on paths with spaces.
    MYDEF_BOOT="$PWD/modules/mydef_boot"
    export PATH="$MYDEF_BOOT/bin:$PATH"
    export PERL5LIB="$MYDEF_BOOT/lib/perl5"
    export MYDEFLIB="$MYDEF_BOOT/lib/MyDef"
    (cd test/mpi/bench && ./autogen.sh)
}
# internal
_patch_libtool() {
_file=$1
@@ -731,9 +740,9 @@ EOF
echo ">= $ver"
else
echo "bad autoconf installation"
echo "--- autoreconf diagnositcs ---"
echo "--- autoreconf diagnostics ---"
$(cat autoreconf.err)
echo "--- autoreconf diagnositcs ---"
echo "--- autoreconf diagnostics ---"
cat <<EOF
You either do not have autoconf in your path or it is too old (version
$ver or higher required). You may be able to use
@@ -1102,3 +1111,5 @@ fn_build_configure
fn_ch4_api
fn_json_gen
fn_mydef
+1
Submodule modules/mydef_boot added at ea2d685248
+3
Ver Arquivo
@@ -0,0 +1,3 @@
/*.c
/p2p_bw
/p2p_latency
+17
Ver Arquivo
@@ -0,0 +1,17 @@
##
## Copyright (C) by Argonne National Laboratory
## See COPYRIGHT in top-level directory
##
include $(top_srcdir)/Makefile_single.mtest
LDADD += -lm
## for all programs that are just built from the single corresponding source
## file, we don't need per-target _SOURCES rules, automake will infer them
## correctly
noinst_PROGRAMS = \
p2p_latency \
p2p_bw
.def.c:
mydef_page $<
Arquivo executável
+3
Ver Arquivo
@@ -0,0 +1,3 @@
# Translate every MyDef source (*.def) in the current directory with mydef_page.
for a in *.def ; do
    # If nothing matches, the shell leaves the literal string "*.def" in $a;
    # skip it instead of handing a non-existent file to mydef_page.
    [ -f "$a" ] || continue
    # Quote the name so files containing whitespace are passed as one argument.
    mydef_page "$a"
done
+3
Ver Arquivo
@@ -0,0 +1,3 @@
module: c
CC: mpicc
run: mpirun -n 2
+107
Ver Arquivo
@@ -0,0 +1,107 @@
/*
* bench_frame : boilerplate for mpi program
* measure(iter) : measures `tf_dur` for $(iter) iterations
* run_stat(N, var) : run N measurements and obtain (avg, std) in sum1, sum2
* warm_up(iter, dur): repeat until measurements (iter, dur) stabilize
* report_latency(msgsize, MULTIPLICITY) : print a line of latency result
*/
# bench_frame: page frame that wraps a benchmark page in a complete MPI main().
# Sequence: include headers, initialize MPI (or MTest), query rank/size,
# optionally enforce MIN_PROCS, allocate `buf` of MAX_BUFSIZE bytes, print a
# header on rank 0, expand the page's `main` code, then finalize.
subcode: bench_frame
$include stdio
$include stdlib
# HAS_MTEST selects the MPICH test-harness header; otherwise plain MPI is used.
$(if:HAS_MTEST)
$include mpitest.h
$(else)
$include mpi
$function main
# Initialization mirrors the header choice above; both variants pass NULL, NULL.
$(if:HAS_MTEST)
MTest_Init(NULL, NULL);
$(else)
MPI_Init(NULL, NULL);
# Rank and size in MPI_COMM_WORLD.
$my grank, gsize: int
MPI_Comm_rank(MPI_COMM_WORLD, &grank);
MPI_Comm_size(MPI_COMM_WORLD, &gsize);
# Only emitted when the page defines MIN_PROCS: bail out with exit code 1 if
# too few processes were launched.
# NOTE(review): this path returns without calling a finalize routine -- confirm
# that is intended.
$(if:MIN_PROCS)
$if gsize < $(MIN_PROCS)
printf("! Test $(_pagename) requires $(MIN_PROCS) processes !\n");
return 1
MPI_Comm comm = MPI_COMM_WORLD;
# Message buffer shared by all measurements, allocated either through the
# mtest_malloc subcode or with plain malloc.
$my void *buf
$(if:HAS_MTEST)
$call mtest_malloc, MAX_BUFSIZE
$(else)
buf = malloc(MAX_BUFSIZE)
# Allocation failure is reported and ends the program with exit code 1.
$if !buf
printf("! Failed to allocate buffer (size=%d)\n", MAX_BUFSIZE)
return 1
# Rank 0 prints the test banner; report_header is a hook supplied by an
# included macro file (presumably optional because of the `@` -- TODO confirm).
$if grank == 0
printf("TEST $(_pagename):\n")
$call @report_header
# Expand the benchmark body; presumably the code written under the page header
# -- TODO confirm.
$call main
$if grank == 0
printf("\n")
# Shut down with the routine matching the initialization above.
$(if:HAS_MTEST)
MTest_Finalize(0);
$(else)
MPI_Finalize();
macros:
use_double: 1
#----------------------------------------
# _autoload: declarations applied when this macro file is included.
# Registers MPI_Comm as the type associated with the `comm` name prefix
# (presumably so `comm` parameters need no explicit type -- TODO confirm).
subcode: _autoload
$register_prefix(comm) MPI_Comm
# foreach_size: expand BLOCK once per message size 0, 1, 2, 4, ... while the
# size is below $(MAX_MSG). Inside BLOCK the macro MSG_SIZE names the loop
# variable `size`.
subcode: foreach_size
# The step expression maps 0 -> 1 and doubles every other value.
$for int size = 0; size < $(MAX_MSG); size = (size==0)?1:size*2
$(set:MSG_SIZE=size)
BLOCK
# measure(iter): time $(iter) repetitions of BLOCK with MPI_Wtime().
# Leaves the total elapsed time for all repetitions (not per-iteration) in
# tf_dur, as the difference of two MPI_Wtime() readings.
subcode: measure(iter)
tf_start = MPI_Wtime()
$for 0:$(iter)
BLOCK
tf_dur = MPI_Wtime() - tf_start
# run_stat(N, var): expand BLOCK $(N) times and collect statistics of $(var).
#   N   - number of measurements to take
#   var - expression sampled after each expansion of BLOCK
# On exit sum1 holds the mean of $(var) and sum2 holds its (population)
# standard deviation, computed as sqrt(E[x^2] - E[x]^2).
subcode: run_stat(N, var)
    $my double sum1=0, double sum2=0
    $for 0:$(N)
        BLOCK
        sum1 += $(var)
        sum2 += $(var) * $(var)
    sum1 /= $(N)
    sum2 /= $(N)
    # When the samples are nearly identical, E[x^2] - E[x]^2 can round to a
    # tiny negative number; clamp it so sqrt() never yields NaN for sigma.
    sum2 -= sum1 * sum1
    $if sum2 < 0
        sum2 = 0
    sum2 = sqrt(sum2)
# warm_up(iter, dur): repeat BLOCK until the measured duration stabilizes.
#   iter - variable holding the iteration count; reset to 2 here and adjusted
#          by this subcode
#   dur  - variable that BLOCK is expected to fill with the measured duration
# The loop ends once $(dur) has been observed larger than last_dur 10 times
# (num_best reaches 10).
subcode: warm_up(iter, dur)
# MIN_ITER is a macro, so the expression is substituted at each use; it scales
# $(iter) by 0.001 / $(dur) (presumably targeting ~1 ms per measurement, with
# $(dur) in seconds -- TODO confirm).
$(set:MIN_ITER=(int) ($(iter) * 0.001 / $(dur)))
$(iter) = 2
$my double last_dur = 1.0
$my int num_best = 0
$while num_best < 10
BLOCK
# Too few iterations for the last measurement: raise the count, restart the
# stability counter and measure again.
$if $(iter) < $(MIN_ITER)
$(iter) = $(MIN_ITER)
num_best = 0
continue
# check that $(dur) is no longer monotonically decreasing
$if $(dur) > last_dur
num_best++
last_dur = $(dur)
# header_latency: print the column titles (msgsize, latency, sigma, bandwidth)
# for the result lines produced by report_latency.
subcode: header_latency
printf("%12s %10s(us) %6s(us) %12s(MB/s)\n", "msgsize", "latency", "sigma", "bandwidth")
# report_latency(MSGSIZE, MULTIPLICITY): print one result line.
#   MSGSIZE      - message size, printed with %d
#   MULTIPLICITY - divisor applied to the statistics (messages per iteration)
# Reads sum1 and sum2, which run_stat leaves as mean and standard deviation.
subcode: report_latency(MSGSIZE, MULTIPLICITY)
$my tf_latency, tf_sigma, tf_bw
# Scale by 1e6 to match the (us) columns of header_latency (presumably the
# inputs are seconds from MPI_Wtime -- TODO confirm).
tf_latency = sum1 / ($(MULTIPLICITY)) * 1e6
tf_sigma = sum2 / ($(MULTIPLICITY)) * 1e6
# Size divided by latency in microseconds; reported under the MB/s column.
tf_bw = $(MSGSIZE) / tf_latency
printf("%12d %10.3f %6.3f %12.3f\n", $(MSGSIZE), tf_latency, tf_sigma, tf_bw)
+79
Ver Arquivo
@@ -0,0 +1,79 @@
/*
* Defines following functions:
* bench_p2p
* bench_send, bench_warmup
* bench_recv
*
* For each measurement -
* First sender tells receiver the `iter` parameter. `iter = 0` means to quit.
* For each iteration runs `send_side` and `recv_side` assuming the measurement on sender side represents a latency measurement.
*
* Caller page defines -
* subcode: send_side, recv_side
* macro:
* MULTIPLICITY: divisor for each measurement
*/
macros:
MIN_PROCS: 2
MAX_BUFSIZE: 5000000 # 5 MB
# _autoload: declarations applied when this macro file is included.
subcode: _autoload
# Associate C types with the parameter names used by the fncode signatures in
# this file (presumably what lets them omit explicit types -- TODO confirm).
$register_name(src) int
$register_name(dst) int
$register_name(buf) void *
$register_name(size) int
# TAG is used for the benchmark messages, SYNC_TAG for the iteration-count
# handshake between sender and receiver.
$define TAG 0
$define SYNC_TAG 100
# Buffer size in bytes; same value as the MAX_BUFSIZE macro of this file.
$define MAX_BUFSIZE 5000000
# Number of measurements bench_p2p passes to run_stat.
$define NUM_REPEAT 20
# report_header: hook called by bench_frame; prints the latency table header.
subcode: report_header
$call header_latency
# bench_p2p: run one point-to-point measurement for a given message size.
#   comm     - communicator used for all traffic
#   src, dst - ranks (in comm) acting as sender and receiver
#   buf      - message buffer
#   size     - message size passed on to the send/recv helpers
# The src rank warms up, takes NUM_REPEAT measurements, prints one result line
# and tells the receiver to stop. The dst rank serves requests in bench_recv.
# No branch is shown here for any other rank.
fncode: bench_p2p(comm, src, dst, buf, size)
int rank;
MPI_Comm_rank(comm, &rank)
# Default to one message per iteration when the page sets no MULTIPLICITY.
$(if:!MULTIPLICITY)
$(set:MULTIPLICITY=1)
$if rank == src
# bench_warmup returns the iteration count to use for the measurements.
iter = bench_warmup(comm, dst, buf, size)
# Each sample is the per-iteration time: total duration divided by iter.
&call run_stat, NUM_REPEAT, tf_latency
tf_latency = bench_send(iter, comm, dst, buf, size)
tf_latency /= iter
$call report_latency, size, $(MULTIPLICITY)
$call send_stop
$elif rank == dst
bench_recv(comm, src, buf, size)
# send_stop: tell the receiver to leave bench_recv by sending iter == 0 on
# SYNC_TAG.
subcode: send_stop
iter = 0;
MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm)
#----------------------------------------
# bench_send: sender half of one measurement.
#   iter - number of repetitions of the page's send_side code to time
# Returns tf_dur, the total elapsed time that `measure` records for all
# repetitions.
fncode: bench_send(int iter, comm, dst, buf, size)
# synchronize with receiver
MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm);
# send_side is supplied by the page that includes this file.
&call measure, iter
$call @send_side
return tf_dur
# bench_recv: receiver half of the benchmark.
# Loops forever: receives an iteration count from src on SYNC_TAG, runs the
# page's recv_side code that many times, and returns once a count of 0 arrives.
fncode: bench_recv(comm, src, buf, size)
$while 1
int iter;
# synchronize with sender
MPI_Recv(&iter, 1, MPI_INT, src, SYNC_TAG, comm, MPI_STATUS_IGNORE);
$if iter == 0
# time to quit
break
# recv_side is supplied by the page that includes this file.
$for i=0:iter
$call @recv_side
# bench_warmup: pick the iteration count for the sender-side measurements.
# Uses warm_up with bench_send as the measured block, and returns the value
# `iter` holds when warm_up ends.
fncode: bench_warmup(comm, dst, buf, size): int
&call warm_up, iter, tf_dur
tf_dur = bench_send(iter, comm, dst, buf, size)
return iter
+14
Ver Arquivo
@@ -0,0 +1,14 @@
macros:
HAS_MTEST: 1
# mtest_malloc(size): allocate `buf` through the MTest helpers.
#   size - number of bytes to allocate
# Parses the command line into an MTest argument list. Rank 0 (send_rank) and
# rank 1 (recv_rank) each read their own memory type ("sendmem"/"recvmem") and
# device ("senddev"/"recvdev", default 0) and allocate `buf` accordingly.
# No allocation is shown here for any other rank.
subcode: mtest_malloc(size)
# Relies on argc/argv being in scope where this subcode is expanded.
MTestArgList *head = MTestArgListCreate(argc, argv)
int send_rank = 0, recv_rank = 1;
# Expanded twice, once with a=send and once with a=recv.
$(for:a in send,recv)
$if grank == $(a)_rank
$my mtest_mem_type_e $(a)_memtype, int $(a)_device
$(a)_memtype = MTestArgListGetMemType(head, "$(a)mem")
$(a)_device = MTestArgListGetInt_with_default(head, "$(a)dev", 0)
MTestMalloc($(size), $(a)_memtype, NULL, &buf, $(a)_device)
MTestPrintfMsg(1, "Allocating buffer: memtype=%s, device=%d, size=%d\n", MTest_memtype_name($(a)_memtype), $(a)_device, $(size))
MTestArgListDestroy(head)
+27
Ver Arquivo
@@ -0,0 +1,27 @@
include: macros/bench_frame.def
include: macros/bench_p2p.def
include: macros/mtest.def
subcode: _autoload
$define WINDOW_SIZE 64
# Page p2p_bw: bandwidth benchmark built on the bench_frame frame.
page: p2p_bw, bench_frame
# Each timed iteration sends WINDOW_SIZE messages, so results are divided by it.
MULTIPLICITY: WINDOW_SIZE
# Argument triple (buffer, count, datatype) reused by the MPI calls below.
data: buf, size, MPI_CHAR
# Measure message sizes 1, 2, 4, ... below MAX_BUFSIZE between ranks 0 and 1.
$for int size = 1; size < MAX_BUFSIZE; size *= 2
bench_p2p(comm, 0, 1, buf, size)
# send_side: one timed iteration on the sender.
# Posts WINDOW_SIZE nonblocking sends of the same buffer, waits for all of
# them, then blocks on a zero-count message from the receiver.
subcode: send_side
$my MPI_Request reqs[WINDOW_SIZE]
$for j=0:WINDOW_SIZE
MPI_Isend($(data), dst, TAG, comm, &reqs[j])
MPI_Waitall(WINDOW_SIZE, reqs, MPI_STATUSES_IGNORE)
# NOTE(review): MPI_DATATYPE_NULL with count 0 -- confirm every targeted MPI
# implementation accepts it.
MPI_Recv(NULL, 0, MPI_DATATYPE_NULL, dst, TAG, comm, MPI_STATUS_IGNORE)
# recv_side: one iteration on the receiver, mirroring send_side.
# Posts WINDOW_SIZE nonblocking receives into the same buffer, waits for all
# of them, then sends a zero-count message back to the sender.
subcode: recv_side
$my MPI_Request reqs[WINDOW_SIZE]
$for j=0:WINDOW_SIZE
MPI_Irecv($(data), src, TAG, comm, &reqs[j])
MPI_Waitall(WINDOW_SIZE, reqs, MPI_STATUSES_IGNORE)
# NOTE(review): MPI_DATATYPE_NULL with count 0 -- confirm every targeted MPI
# implementation accepts it.
MPI_Send(NULL, 0, MPI_DATATYPE_NULL, src, TAG, comm)
+19
Ver Arquivo
@@ -0,0 +1,19 @@
include: macros/bench_frame.def
include: macros/bench_p2p.def
include: macros/mtest.def
# Page p2p_latency: ping-pong latency benchmark built on the bench_frame frame.
page: p2p_latency, bench_frame
# One timed iteration is a round trip (two messages), so results are halved.
MULTIPLICITY: 2
# Argument triple (buffer, count, datatype) reused by the MPI calls below.
data: buf, size, MPI_CHAR
# Measure the zero-byte case first, then sizes 1, 2, 4, ... below MAX_BUFSIZE.
bench_p2p(comm, 0, 1, buf, 0)
$for int size = 1; size < MAX_BUFSIZE; size *= 2
bench_p2p(comm, 0, 1, buf, size)
# send_side: one ping-pong on the sender -- send the message to dst, then wait
# for the message coming back from dst.
subcode: send_side
MPI_Send($(data), dst, TAG, comm);
MPI_Recv($(data), dst, TAG, comm, MPI_STATUS_IGNORE);
# recv_side: one ping-pong on the receiver -- receive the message from src,
# then send a message of the same size back to src.
subcode: recv_side
MPI_Recv($(data), src, TAG, comm, MPI_STATUS_IGNORE);
MPI_Send($(data), src, TAG, comm);
+2
Ver Arquivo
@@ -0,0 +1,2 @@
p2p_latency 2 resultTest=TestBench
p2p_bw 2 resultTest=TestBench
+1
Ver Arquivo
@@ -1904,5 +1904,6 @@ AC_OUTPUT(maint/testmerge \
impls/mpich/ulfm/Makefile \
impls/mpich/info/Makefile \
impls/mpich/info/testlist \
bench/Makefile \
)
+18
Ver Arquivo
@@ -934,6 +934,8 @@ sub get_resultTest {
return \&TestStatusNoErrors;
} elsif ($resultTest eq "TestErrFatal") {
return \&TestErrFatal;
} elsif ($resultTest eq "TestBench") {
return \&TestBench;
} else {
die "resultTest $resultTest not defined!\n";
}
@@ -1112,6 +1114,22 @@ sub TestErrFatal {
return ($found_error, $inline);
}
# TestBench - result checker for benchmark programs.
#
# Copies every line of the program's output to STDOUT without inspecting it.
# Only the result of close() decides pass or fail: 0 means success, non-zero
# means failure.
#
# Arguments:
#   $MPIOUT      - read handle on the program's output
#   $programname - program name, passed on to expect_status_zero on failure
#
# Returns the list ($found_error, $inline):
#   $found_error - 1 if close() reported a failure, 0 otherwise
#   $inline      - always the empty string; benchmark output is not captured
sub TestBench {
    my ($MPIOUT, $programname) = @_;

    # Start from defined values so callers never receive undef on success.
    my $found_error = 0;
    my $inline      = "";

    # Use a lexical line variable so the caller's $_ is not clobbered.
    while (my $line = <$MPIOUT>) {
        print STDOUT $line;
    }

    # close() returns false on failure; for a piped command that includes a
    # non-zero exit status, which is then available in $?.
    if (!close($MPIOUT)) {
        expect_status_zero($programname, $?);
        $found_error = 1;
    }

    return ($found_error, $inline);
}
# ----------------------------------------------------------------------------
# Output routines:
# OpenOutputs - Open report files and print initial lines