examples: Add cudapi example code

Add a port of cpi.c to CUDA. Use a GPU kernel to compute the partial areas at each process, then sum them with a final MPI_Reduce from device memory into CPU memory. This is intended to be used as smoke test for functioning GPU support.
2023-01-24 16:03:49 -06:00
commit b78242cc38
@@ -329,6 +329,9 @@ Makefile.am-stamp
 /examples/spawn_merge_child2
 /examples/spawn_merge_parent

+# /examples/cuda/
+/examples/cuda/cudapi
+
 # /examples/cxx/
 /examples/cxx/cxxpi

@@ -3820,6 +3820,15 @@ AC_ARG_ENABLE(checkpointing,
      fi ],
 )

+# NVCC can be used to compile examples/cuda/cudapi.cu
+if test -z "$NVCC" ; then
+    if test -n "${with_cuda}" -a "$with_cuda" != "no" ; then
+        AC_PATH_PROG([NVCC], [nvcc], [nvcc_not_found], [$with_cuda/bin:$PATH])
+    else
+        AC_PATH_PROG([NVCC], [nvcc], [nvcc_not_found])
+    fi
+fi
+
 # Update the cache first with the results of the previous configure steps
 # We don't use the subdir cache because ensuring that the cache is consistent
 # with the way in which configure wishes to use it is very difficult and
@@ -16,3 +16,20 @@ noinst_PROGRAMS = cpi
 EXTRA_PROGRAMS = pmandel pmandel_spawn pmandel_service pmandel_spaserv    \
                 pmandel_fence hellow icpi ircpi parent child srtest \
                 spawn_merge_parent spawn_merge_child1 spawn_merge_child2
+
+# CUDA
+#
+# when compiling cuda/cudapi.cu with NVCC, mpi.h will pull in mpicxx.h
+# because __cplusplus is defined (NVCC is a C++ compiler)
+AM_CPPFLAGS += -I$(top_builddir)/src/binding/cxx
+
+# How to compile .cu files
+.cu.o:
+	$(NVCC) -arch=native $(AM_CPPFLAGS) -c -o $@ $<
+
+EXTRA_PROGRAMS += cuda/cudapi
+cuda_cudapi_SOURCES = cuda/cudapi.cu
+cuda_cudapi_LDADD = ../lib/lib@MPILIBNAME@.la -lm
+cuda_cudapi_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+       $(LIBTOOLFLAGS) --mode=link $(CC) $(AM_CFLAGS) $(CFLAGS) \
+       $(AM_LDFLAGS) $(LDFLAGS) -o $@
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ *     See COPYRIGHT in top-level directory
+ */
+
+#include "mpi.h"
+#include <stdio.h>
+#include <math.h>
+
+#define THREADS_PER_BLOCK 256
+
+__device__ double f(double a)
+{
+    return (4.0 / (1.0 + a * a));
+}
+
+__global__ void do_sum(int n, double h, int stride, double *sum) {
+    int idx = 1 + (blockDim.x * blockIdx.x + threadIdx.x) + stride;
+    __shared__ double block_sum;
+
+    if (threadIdx.x == 0) {
+        block_sum = 0.0;
+    }
+    __syncthreads();
+
+    /* compute rectangles and add to block sum */
+    if (idx <= n) {
+        double x = h * ((double) idx - 0.5);
+        atomicAdd(&block_sum, f(x));
+    }
+
+    /* add block sum to total */
+    __syncthreads();
+    if (threadIdx.x == 0) {
+        atomicAdd(sum, block_sum * h);
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    int n, myid, numprocs;
+    double PI25DT = 3.141592653589793238462643;
+    double pi, h;
+    double *sum;
+    double startwtime = 0.0, endwtime;
+    int namelen;
+    char processor_name[MPI_MAX_PROCESSOR_NAME];
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+    MPI_Get_processor_name(processor_name, &namelen);
+
+    fprintf(stdout, "Process %d of %d is on %s\n", myid, numprocs, processor_name);
+    fflush(stdout);
+
+    cudaMalloc((void **)&sum, sizeof(double));
+
+    n = 10000;
+    if (myid == 0)
+        startwtime = MPI_Wtime();
+
+    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+    h = 1.0 / (double) n;
+    int blocks = (n + (THREADS_PER_BLOCK * numprocs - 1)) / (THREADS_PER_BLOCK * numprocs);
+    int stride = blocks * THREADS_PER_BLOCK * myid;
+
+    /* compute partial sum using the GPU */
+    do_sum<<<blocks, THREADS_PER_BLOCK>>>(n, h, stride, sum);
+    cudaDeviceSynchronize();
+
+    MPI_Reduce(sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (myid == 0) {
+        endwtime = MPI_Wtime();
+        printf("pi is approximately %.16f, Error is %.16f\n", pi, fabs(pi - PI25DT));
+        printf("wall clock time = %f\n", endwtime - startwtime);
+        fflush(stdout);
+    }
+
+    cudaFree(sum);
+
+    MPI_Finalize();
+    return 0;
+}