examples: Add cudapi example code

Add a port of cpi.c to CUDA. Use a GPU kernel to compute the partial
areas at each process, then sum them with a final MPI_Reduce from device
memory into CPU memory. This is intended to be used as a smoke test for
functioning GPU support.
Esse commit está contido em:
Ken Raffenetti
2023-01-24 16:03:49 -06:00
commit de Ken Raffenetti
commit b78242cc38
4 arquivos alterados com 115 adições e 0 exclusões
+3
Ver Arquivo
@@ -329,6 +329,9 @@ Makefile.am-stamp
/examples/spawn_merge_child2
/examples/spawn_merge_parent
# /examples/cuda/
/examples/cuda/cudapi
# /examples/cxx/
/examples/cxx/cxxpi
+9
Ver Arquivo
@@ -3820,6 +3820,15 @@ AC_ARG_ENABLE(checkpointing,
fi ],
)
# NVCC can be used to compile examples/cuda/cudapi.cu.
# A value of NVCC already supplied by the user is respected. Otherwise
# search for nvcc, looking first in the bin directory of the CUDA
# installation named by with_cuda when one was given. NVCC is set to
# nvcc_not_found when no nvcc executable is located.
if test -z "$NVCC" ; then
    # Separate test invocations joined with && are used because
    # "test ... -a ..." is not portable across shells. A value of "yes"
    # names no directory, so it falls back to the plain PATH search rather
    # than prepending the meaningless relative path yes/bin.
    if test -n "${with_cuda}" && test "$with_cuda" != "no" && test "$with_cuda" != "yes" ; then
        AC_PATH_PROG([NVCC], [nvcc], [nvcc_not_found], [$with_cuda/bin:$PATH])
    else
        AC_PATH_PROG([NVCC], [nvcc], [nvcc_not_found])
    fi
fi
# Update the cache first with the results of the previous configure steps
# We don't use the subdir cache because ensuring that the cache is consistent
# with the way in which configure wishes to use it is very difficult and
+17
Ver Arquivo
@@ -16,3 +16,20 @@ noinst_PROGRAMS = cpi
# Additional example programs, listed in EXTRA_PROGRAMS.
EXTRA_PROGRAMS = pmandel pmandel_spawn pmandel_service pmandel_spaserv \
pmandel_fence hellow icpi ircpi parent child srtest \
spawn_merge_parent spawn_merge_child1 spawn_merge_child2
# CUDA
#
# when compiling cuda/cudapi.cu with NVCC, mpi.h will pull in mpicxx.h
# because __cplusplus is defined (NVCC is a C++ compiler)
AM_CPPFLAGS += -I$(top_builddir)/src/binding/cxx
# How to compile .cu files: a suffix rule that runs $(NVCC) directly on the
# source and passes only $(AM_CPPFLAGS) for preprocessor/include options.
.cu.o:
$(NVCC) -arch=native $(AM_CPPFLAGS) -c -o $@ $<
# The CUDA example is appended to EXTRA_PROGRAMS alongside the examples above.
EXTRA_PROGRAMS += cuda/cudapi
cuda_cudapi_SOURCES = cuda/cudapi.cu
cuda_cudapi_LDADD = ../lib/lib@MPILIBNAME@.la -lm
# Custom link command: the object produced by the .cu.o rule is linked through
# libtool using $(CC) (--tag=CC), not $(NVCC).
cuda_cudapi_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=link $(CC) $(AM_CFLAGS) $(CFLAGS) \
$(AM_LDFLAGS) $(LDFLAGS) -o $@
+86
Ver Arquivo
@@ -0,0 +1,86 @@
/*
* Copyright (C) by Argonne National Laboratory
* See COPYRIGHT in top-level directory
*/
#include "mpi.h"
#include <stdio.h>
#include <math.h>
#define THREADS_PER_BLOCK 256
/*
 * Integrand of the quadrature: f(a) = 4 / (1 + a^2).
 *
 * Declared __host__ __device__ so the same definition can be called from
 * the kernel and from host code (for example to compute a CPU reference
 * value when validating the GPU result).
 *
 * a: point at which to evaluate the integrand.
 * Returns the value of the integrand at a.
 */
__host__ __device__ double f(double a)
{
    return (4.0 / (1.0 + a * a));
}
/*
 * Kernel: accumulate this process's share of the midpoint-rule sum.
 *
 * Indexing uses only the .x components of blockDim/blockIdx/threadIdx, so a
 * one-dimensional launch is expected. Each thread handles at most one
 * rectangle; threads whose 1-based rectangle index exceeds n contribute
 * nothing but still reach both barriers.
 *
 * n:      total number of rectangles.
 * h:      width of one rectangle.
 * stride: offset added to the thread's global index to obtain the
 *         rectangle index it evaluates.
 * sum:    accumulator that each block atomically adds its scaled partial
 *         sum to; the kernel only adds to *sum and never resets it.
 *
 * Uses atomicAdd on double, which requires compute capability 6.0+.
 */
__global__ void do_sum(int n, double h, int stride, double *sum) {
    /* per-block accumulator shared by all threads of the block */
    __shared__ double block_total;

    const bool is_leader = (threadIdx.x == 0);
    /* 1-based index of the rectangle assigned to this thread */
    const int rect = 1 + (blockDim.x * blockIdx.x + threadIdx.x) + stride;

    /* one thread clears the accumulator before anyone adds to it */
    if (is_leader)
        block_total = 0.0;
    __syncthreads();

    /* evaluate the integrand at the rectangle midpoint, if in range */
    if (rect <= n) {
        const double midpoint = h * ((double) rect - 0.5);
        atomicAdd(&block_total, f(midpoint));
    }

    /* wait until every contribution has landed, then publish it */
    __syncthreads();
    if (is_leader)
        atomicAdd(sum, block_total * h);
}
/*
 * Abort the whole MPI job with a diagnostic if a CUDA runtime call failed.
 *
 * err:  status returned by the CUDA runtime call.
 * what: short description of the operation, used in the error message.
 */
static void check_cuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        fflush(stderr);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
}

/*
 * Approximate pi with the midpoint rule. Each MPI process computes its
 * share of the rectangles on the GPU, and the partial sums are combined
 * on rank 0 with MPI_Reduce, reading the send buffer from device memory.
 *
 * Returns 0 on success; a CUDA failure aborts the job via MPI_Abort.
 */
int main(int argc, char *argv[])
{
    int n, myid, numprocs;
    double PI25DT = 3.141592653589793238462643;
    double pi = 0.0, h;
    double *sum = NULL;
    double startwtime = 0.0, endwtime;
    int namelen;
    char processor_name[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Get_processor_name(processor_name, &namelen);

    fprintf(stdout, "Process %d of %d is on %s\n", myid, numprocs, processor_name);
    fflush(stdout);

    check_cuda(cudaMalloc((void **) &sum, sizeof(double)), "cudaMalloc");
    /* cudaMalloc does not clear the allocation and the kernel only adds to
     * *sum, so zero it first (all-zero bytes represent 0.0) */
    check_cuda(cudaMemset(sum, 0, sizeof(double)), "cudaMemset");

    n = 10000;
    if (myid == 0)
        startwtime = MPI_Wtime();

    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    h = 1.0 / (double) n;

    /* blocks per process: ceil(n / (THREADS_PER_BLOCK * numprocs)), so that
     * all processes together cover rectangles 1..n */
    int blocks = (n + (THREADS_PER_BLOCK * numprocs - 1)) / (THREADS_PER_BLOCK * numprocs);
    /* index offset of the first rectangle handled by this process */
    int stride = blocks * THREADS_PER_BLOCK * myid;

    /* compute partial sum using the GPU */
    do_sum<<<blocks, THREADS_PER_BLOCK>>>(n, h, stride, sum);
    /* a bad launch configuration is only reported through cudaGetLastError */
    check_cuda(cudaGetLastError(), "do_sum launch");
    /* wait for the kernel so the device buffer is complete before MPI reads it */
    check_cuda(cudaDeviceSynchronize(), "do_sum execution");

    /* the send buffer is device memory; this relies on GPU support in MPI */
    MPI_Reduce(sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (myid == 0) {
        endwtime = MPI_Wtime();
        printf("pi is approximately %.16f, Error is %.16f\n", pi, fabs(pi - PI25DT));
        printf("wall clock time = %f\n", endwtime - startwtime);
        fflush(stdout);
    }

    check_cuda(cudaFree(sum), "cudaFree");
    MPI_Finalize();
    return 0;
}