diff --git a/.gitignore b/.gitignore
index 644806f3b..fc58b0185 100644
--- a/.gitignore
+++ b/.gitignore
@@ -329,6 +329,9 @@ Makefile.am-stamp
 /examples/spawn_merge_child2
 /examples/spawn_merge_parent
 
+# /examples/cuda/
+/examples/cuda/cudapi
+
 # /examples/cxx/
 /examples/cxx/cxxpi
 
diff --git a/configure.ac b/configure.ac
index 9563ad203..b0ad8f72d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3820,6 +3820,15 @@ AC_ARG_ENABLE(checkpointing,
       fi ],
 )
 
+# NVCC can be used to compile examples/cuda/cudapi.cu
+if test -z "$NVCC" ; then
+    if test -n "${with_cuda}" -a "$with_cuda" != "no" ; then
+        AC_PATH_PROG([NVCC], [nvcc], [nvcc_not_found], [$with_cuda/bin:$PATH])
+    else
+        AC_PATH_PROG([NVCC], [nvcc], [nvcc_not_found])
+    fi
+fi
+
 # Update the cache first with the results of the previous configure steps
 # We don't use the subdir cache because ensuring that the cache is consistent
 # with the way in which configure wishes to use it is very difficult and
diff --git a/examples/Makefile.am b/examples/Makefile.am
index 5c576eeb9..8124fab7a 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -16,3 +16,20 @@ noinst_PROGRAMS = cpi
 EXTRA_PROGRAMS = pmandel pmandel_spawn pmandel_service pmandel_spaserv    \
                  pmandel_fence hellow icpi ircpi parent child srtest \
                  spawn_merge_parent spawn_merge_child1 spawn_merge_child2
+
+# CUDA
+#
+# when compiling cuda/cudapi.cu with NVCC, mpi.h will pull in mpicxx.h
+# because __cplusplus is defined (NVCC is a C++ compiler)
+AM_CPPFLAGS += -I$(top_builddir)/src/binding/cxx
+
+# How to compile .cu files
+.cu.o:
+	$(NVCC) -arch=native $(AM_CPPFLAGS) -c -o $@ $<
+
+EXTRA_PROGRAMS += cuda/cudapi
+cuda_cudapi_SOURCES = cuda/cudapi.cu
+cuda_cudapi_LDADD = ../lib/lib@MPILIBNAME@.la -lm
+cuda_cudapi_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+       $(LIBTOOLFLAGS) --mode=link $(CC) $(AM_CFLAGS) $(CFLAGS) \
+       $(AM_LDFLAGS) $(LDFLAGS) -o $@
diff --git a/examples/cuda/cudapi.cu b/examples/cuda/cudapi.cu
new file mode 100644
index 000000000..81d7aead1
--- /dev/null
+++ b/examples/cuda/cudapi.cu
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ *     See COPYRIGHT in top-level directory
+ */
+
+#include "mpi.h"
+#include <stdio.h>
+#include <math.h>
+
+#define THREADS_PER_BLOCK 256
+
+__device__ double f(double a)
+{
+    return (4.0 / (1.0 + a * a));
+}
+
+__global__ void do_sum(int n, double h, int stride, double *sum) {
+    int idx = 1 + (blockDim.x * blockIdx.x + threadIdx.x) + stride;
+    __shared__ double block_sum;
+
+    if (threadIdx.x == 0) {
+        block_sum = 0.0;
+    }
+    __syncthreads();
+
+    /* compute rectangles and add to block sum */
+    if (idx <= n) {
+        double x = h * ((double) idx - 0.5);
+        atomicAdd(&block_sum, f(x));
+    }
+
+    /* add block sum to total */
+    __syncthreads();
+    if (threadIdx.x == 0) {
+        atomicAdd(sum, block_sum * h);
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    int n, myid, numprocs;
+    double PI25DT = 3.141592653589793238462643;
+    double pi, h;
+    double *sum;
+    double startwtime = 0.0, endwtime;
+    int namelen;
+    char processor_name[MPI_MAX_PROCESSOR_NAME];
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
+    MPI_Get_processor_name(processor_name, &namelen);
+
+    fprintf(stdout, "Process %d of %d is on %s\n", myid, numprocs, processor_name);
+    fflush(stdout);
+
+    cudaMalloc((void **)&sum, sizeof(double));
+
+    n = 10000;
+    if (myid == 0)
+        startwtime = MPI_Wtime();
+
+    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+    h = 1.0 / (double) n;
+    int blocks = (n + (THREADS_PER_BLOCK * numprocs - 1)) / (THREADS_PER_BLOCK * numprocs);
+    int stride = blocks * THREADS_PER_BLOCK * myid;
+
+    /* compute partial sum using the GPU */
+    do_sum<<<blocks, THREADS_PER_BLOCK>>>(n, h, stride, sum);
+    cudaDeviceSynchronize();
+
+    MPI_Reduce(sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    if (myid == 0) {
+        endwtime = MPI_Wtime();
+        printf("pi is approximately %.16f, Error is %.16f\n", pi, fabs(pi - PI25DT));
+        printf("wall clock time = %f\n", endwtime - startwtime);
+        fflush(stdout);
+    }
+
+    cudaFree(sum);
+
+    MPI_Finalize();
+    return 0;
+}