examples: Add cudapi example code
Add a port of cpi.c to CUDA. Use a GPU kernel to compute the partial areas at each process, then sum them with a final MPI_Reduce from device memory into CPU memory. This is intended to be used as smoke test for functioning GPU support.
Esse commit está contido em:
@@ -329,6 +329,9 @@ Makefile.am-stamp
|
||||
/examples/spawn_merge_child2
|
||||
/examples/spawn_merge_parent
|
||||
|
||||
# /examples/cuda/
|
||||
/examples/cuda/cudapi
|
||||
|
||||
# /examples/cxx/
|
||||
/examples/cxx/cxxpi
|
||||
|
||||
|
||||
@@ -3820,6 +3820,15 @@ AC_ARG_ENABLE(checkpointing,
|
||||
fi ],
|
||||
)
|
||||
|
||||
# NVCC can be used to compile examples/cuda/cudapi.cu
|
||||
if test -z "$NVCC" ; then
|
||||
if test -n "${with_cuda}" -a "$with_cuda" != "no" ; then
|
||||
AC_PATH_PROG([NVCC], [nvcc], [nvcc_not_found], [$with_cuda/bin:$PATH])
|
||||
else
|
||||
AC_PATH_PROG([NVCC], [nvcc], [nvcc_not_found])
|
||||
fi
|
||||
fi
|
||||
|
||||
# Update the cache first with the results of the previous configure steps
|
||||
# We don't use the subdir cache because ensuring that the cache is consistent
|
||||
# with the way in which configure wishes to use it is very difficult and
|
||||
|
||||
@@ -16,3 +16,20 @@ noinst_PROGRAMS = cpi
|
||||
EXTRA_PROGRAMS = pmandel pmandel_spawn pmandel_service pmandel_spaserv \
|
||||
pmandel_fence hellow icpi ircpi parent child srtest \
|
||||
spawn_merge_parent spawn_merge_child1 spawn_merge_child2
|
||||
|
||||
# CUDA
|
||||
#
|
||||
# when compiling cuda/cudapi.cu with NVCC, mpi.h will pull in mpicxx.h
|
||||
# because __cplusplus is defined (NVCC is a C++ compiler)
|
||||
AM_CPPFLAGS += -I$(top_builddir)/src/binding/cxx
|
||||
|
||||
# How to compile .cu files
|
||||
.cu.o:
|
||||
$(NVCC) -arch=native $(AM_CPPFLAGS) -c -o $@ $<
|
||||
|
||||
EXTRA_PROGRAMS += cuda/cudapi
|
||||
cuda_cudapi_SOURCES = cuda/cudapi.cu
|
||||
cuda_cudapi_LDADD = ../lib/lib@MPILIBNAME@.la -lm
|
||||
cuda_cudapi_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=link $(CC) $(AM_CFLAGS) $(CFLAGS) \
|
||||
$(AM_LDFLAGS) $(LDFLAGS) -o $@
|
||||
|
||||
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Copyright (C) by Argonne National Laboratory
|
||||
* See COPYRIGHT in top-level directory
|
||||
*/
|
||||
|
||||
#include "mpi.h"
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#define THREADS_PER_BLOCK 256
|
||||
|
||||
__device__ double f(double a)
|
||||
{
|
||||
return (4.0 / (1.0 + a * a));
|
||||
}
|
||||
|
||||
__global__ void do_sum(int n, double h, int stride, double *sum) {
|
||||
int idx = 1 + (blockDim.x * blockIdx.x + threadIdx.x) + stride;
|
||||
__shared__ double block_sum;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
block_sum = 0.0;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
/* compute rectangles and add to block sum */
|
||||
if (idx <= n) {
|
||||
double x = h * ((double) idx - 0.5);
|
||||
atomicAdd(&block_sum, f(x));
|
||||
}
|
||||
|
||||
/* add block sum to total */
|
||||
__syncthreads();
|
||||
if (threadIdx.x == 0) {
|
||||
atomicAdd(sum, block_sum * h);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int n, myid, numprocs;
|
||||
double PI25DT = 3.141592653589793238462643;
|
||||
double pi, h;
|
||||
double *sum;
|
||||
double startwtime = 0.0, endwtime;
|
||||
int namelen;
|
||||
char processor_name[MPI_MAX_PROCESSOR_NAME];
|
||||
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
|
||||
MPI_Get_processor_name(processor_name, &namelen);
|
||||
|
||||
fprintf(stdout, "Process %d of %d is on %s\n", myid, numprocs, processor_name);
|
||||
fflush(stdout);
|
||||
|
||||
cudaMalloc((void **)&sum, sizeof(double));
|
||||
|
||||
n = 10000;
|
||||
if (myid == 0)
|
||||
startwtime = MPI_Wtime();
|
||||
|
||||
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
|
||||
|
||||
h = 1.0 / (double) n;
|
||||
int blocks = (n + (THREADS_PER_BLOCK * numprocs - 1)) / (THREADS_PER_BLOCK * numprocs);
|
||||
int stride = blocks * THREADS_PER_BLOCK * myid;
|
||||
|
||||
/* compute partial sum using the GPU */
|
||||
do_sum<<<blocks, THREADS_PER_BLOCK>>>(n, h, stride, sum);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
MPI_Reduce(sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
|
||||
|
||||
if (myid == 0) {
|
||||
endwtime = MPI_Wtime();
|
||||
printf("pi is approximately %.16f, Error is %.16f\n", pi, fabs(pi - PI25DT));
|
||||
printf("wall clock time = %f\n", endwtime - startwtime);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
cudaFree(sum);
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
||||
Referência em uma Nova Issue
Bloquear um usuário