diff --git a/CUDA/cudampipi.cu b/CUDA/cudampipi.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a72189b703afeab44110320ffd6b2d63818e4175
--- /dev/null
+++ b/CUDA/cudampipi.cu
@@ -0,0 +1,82 @@
+//
+// cudampipi.cu
+// Neil Gershenfeld 10/14/20
+// calculation of pi by a CUDA+MPI sum
+// assumes one GPU per MPI rank
+// pi = 3.14159265358979323846 
+//
+#include <iostream>
+#include <cstdint>
+#include <mpi.h>
+//
+using namespace std;
+//
+uint64_t blocks = 1024;
+uint64_t threads = 1024;
+uint64_t npts = 1000000;
+uint64_t nthreads = blocks*threads;
+int nloop = 10;
+//
+__global__ void init(double *arr,uint64_t npts,uint64_t nthreads,int rank) {
+   uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
+   uint64_t start = npts*index+npts*nthreads*rank+1;
+   uint64_t end = npts*(index+1)+npts*nthreads*rank+1;
+   arr[index] = 0;
+   for (uint64_t j = start; j < end; ++j)
+      arr[index] += 0.5/((j-0.75)*(j-0.25));
+   }
+//
+__global__ void reduce_sum(double *arr,uint64_t len) {
+   uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
+   if (index < len)
+      arr[index] += arr[index+len];
+   }
+//
+void reduce(double *arr) {
+   uint64_t len = nthreads >> 1;
+   while (1) {
+      reduce_sum<<<blocks,threads>>>(arr,len);
+      len = len >> 1;
+      if (len == 0)
+         return;
+      }
+   }
+//
+int main(int argc, char** argv) {
+   int rank,nranks;
+   MPI_Init(&argc,&argv);
+   MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+   MPI_Comm_size(MPI_COMM_WORLD,&nranks);
+   double *arr,result,pi;
+   cudaMalloc(&arr,nthreads*sizeof(double));
+   if (rank == 0) {
+      for (int i = 0; i < nloop; ++i) {
+         MPI_Barrier(MPI_COMM_WORLD);
+	      double tstart = MPI_Wtime();
+         init<<<blocks,threads>>>(arr,npts,nthreads,rank);
+         reduce(arr);
+         cudaDeviceSynchronize();
+         cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost);
+         MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
+	      double tend = MPI_Wtime();
+         double dt = tend-tstart;
+         double gflops = npts*nthreads*nranks*5.0/dt/1e9;
+         printf("npts = %ld, nthreads = %ld, nranks = %d, pi = %lf\n",npts,nthreads,nranks,pi);
+         printf("time = %f, estimated GFlops = %f\n",dt,gflops);
+         }
+      }
+   else {
+      for (int i = 0; i < nloop; ++i) {
+         MPI_Barrier(MPI_COMM_WORLD);
+         init<<<blocks,threads>>>(arr,npts,nthreads,rank);
+         reduce(arr);
+         cudaDeviceSynchronize();
+         cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost);
+         MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
+         }
+      }
+   cudaFree(arr);
+   MPI_Finalize();
+   return 0;
+   }
+
diff --git a/README.md b/README.md
index 9cde361e1d9e723ddb0133019b69b33c0ebf3173..613137b737a5fc3af5d76ca362e31899c9724e01 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@
 
 |estimated GFlops|code|description|system|date|
 |---|---|---|---|---|
+|2,493,988|[cudampipi.cu](CUDA/cudampipi.cu)|C++, CUDA+MPI, 256 nodes, 1536 ranks<br>nvcc -arch=sm_70 -std=c++11|Oak Ridge OLCF Summit<br>IBM POWER9+NVIDA V100|October 15, 2020|
 |88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019|
 |12,589|[cudapit.cu](CUDA/cudapit.cu)|C++, CUDA, 8 GPUs, 5120 cores/GPU|NVIDIA V100|March 3, 2020|
 |11,083|[mpithreadpi.cpp](hybrid/mpithreadpi.cpp)|C++, MPI+threads, 128 nodes, 64 cores/node, 4 threads/core<br>CC mpithreadpi.cpp -o mpithreadpi -O3 -ffast-math -std=c++11|Argonne ALCF Theta<br>Cray XC40|Mar 8, 2020|