diff --git a/.gitignore b/.gitignore index 644806f3b..fc58b0185 100644 --- a/.gitignore +++ b/.gitignore @@ -329,6 +329,9 @@ Makefile.am-stamp /examples/spawn_merge_child2 /examples/spawn_merge_parent +# /examples/cuda/ +/examples/cuda/cudapi + # /examples/cxx/ /examples/cxx/cxxpi diff --git a/configure.ac b/configure.ac index 9563ad203..b0ad8f72d 100644 --- a/configure.ac +++ b/configure.ac @@ -3820,6 +3820,15 @@ AC_ARG_ENABLE(checkpointing, fi ], ) +# NVCC can be used to compile examples/cuda/cudapi.cu +if test -z "$NVCC" ; then + if test -n "${with_cuda}" -a "$with_cuda" != "no" ; then + AC_PATH_PROG([NVCC], [nvcc], [nvcc_not_found], [$with_cuda/bin:$PATH]) + else + AC_PATH_PROG([NVCC], [nvcc], [nvcc_not_found]) + fi +fi + # Update the cache first with the results of the previous configure steps # We don't use the subdir cache because ensuring that the cache is consistent # with the way in which configure wishes to use it is very difficult and diff --git a/examples/Makefile.am b/examples/Makefile.am index 5c576eeb9..8124fab7a 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -16,3 +16,20 @@ noinst_PROGRAMS = cpi EXTRA_PROGRAMS = pmandel pmandel_spawn pmandel_service pmandel_spaserv \ pmandel_fence hellow icpi ircpi parent child srtest \ spawn_merge_parent spawn_merge_child1 spawn_merge_child2 + +# CUDA +# +# when compiling cuda/cudapi.cu with NVCC, mpi.h will pull in mpicxx.h +# because __cplusplus is defined (NVCC is a C++ compiler) +AM_CPPFLAGS += -I$(top_builddir)/src/binding/cxx + +# How to compile .cu files +.cu.o: + $(NVCC) -arch=native $(AM_CPPFLAGS) -c -o $@ $< + +EXTRA_PROGRAMS += cuda/cudapi +cuda_cudapi_SOURCES = cuda/cudapi.cu +cuda_cudapi_LDADD = ../lib/lib@MPILIBNAME@.la -lm +cuda_cudapi_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CC) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ diff --git a/examples/cuda/cudapi.cu b/examples/cuda/cudapi.cu new file mode 100644 index 000000000..81d7aead1 --- /dev/null +++ b/examples/cuda/cudapi.cu @@ -0,0 +1,86 @@ +/* + * Copyright (C) by Argonne National Laboratory + * See COPYRIGHT in top-level directory + */ + +#include "mpi.h" +#include +#include + +#define THREADS_PER_BLOCK 256 + +__device__ double f(double a) +{ + return (4.0 / (1.0 + a * a)); +} + +__global__ void do_sum(int n, double h, int stride, double *sum) { + int idx = 1 + (blockDim.x * blockIdx.x + threadIdx.x) + stride; + __shared__ double block_sum; + + if (threadIdx.x == 0) { + block_sum = 0.0; + } + __syncthreads(); + + /* compute rectangles and add to block sum */ + if (idx <= n) { + double x = h * ((double) idx - 0.5); + atomicAdd(&block_sum, f(x)); + } + + /* add block sum to total */ + __syncthreads(); + if (threadIdx.x == 0) { + atomicAdd(sum, block_sum * h); + } +} + +int main(int argc, char *argv[]) +{ + int n, myid, numprocs; + double PI25DT = 3.141592653589793238462643; + double pi, h; + double *sum; + double startwtime = 0.0, endwtime; + int namelen; + char processor_name[MPI_MAX_PROCESSOR_NAME]; + + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &numprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Get_processor_name(processor_name, &namelen); + + fprintf(stdout, "Process %d of %d is on %s\n", myid, numprocs, processor_name); + fflush(stdout); + + cudaMalloc((void **)&sum, sizeof(double)); + + n = 10000; + if (myid == 0) + startwtime = MPI_Wtime(); + + MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD); + + h = 1.0 / (double) n; + int blocks = (n + (THREADS_PER_BLOCK * numprocs - 1)) / (THREADS_PER_BLOCK * numprocs); + int stride = blocks * THREADS_PER_BLOCK * myid; + + /* compute partial sum using the GPU */ + do_sum<<>>(n, h, stride, sum); + cudaDeviceSynchronize(); + + MPI_Reduce(sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (myid == 0) { + endwtime = MPI_Wtime(); + printf("pi is approximately %.16f, Error is %.16f\n", pi, fabs(pi - PI25DT)); + printf("wall clock time = %f\n", endwtime - startwtime); + fflush(stdout); + } + + cudaFree(sum); + + MPI_Finalize(); + return 0; +}