diff --git a/CUDA/cudampipi.cu b/CUDA/cudampipi.cu new file mode 100644 index 0000000000000000000000000000000000000000..a72189b703afeab44110320ffd6b2d63818e4175 --- /dev/null +++ b/CUDA/cudampipi.cu @@ -0,0 +1,82 @@ +// +// cudampipi.cu +// Neil Gershenfeld 10/14/20 +// calculation of pi by a CUDA+MPI sum +// assumes one GPU per MPI rank +// pi = 3.14159265358979323846 +// +#include <iostream> +#include <cstdint> +#include <mpi.h> +// +using namespace std; +// +uint64_t blocks = 1024; +uint64_t threads = 1024; +uint64_t npts = 1000000; +uint64_t nthreads = blocks*threads; +int nloop = 10; +// +__global__ void init(double *arr,uint64_t npts,uint64_t nthreads,int rank) { + uint64_t index = blockIdx.x*blockDim.x+threadIdx.x; + uint64_t start = npts*index+npts*nthreads*rank+1; + uint64_t end = npts*(index+1)+npts*nthreads*rank+1; + arr[index] = 0; + for (uint64_t j = start; j < end; ++j) + arr[index] += 0.5/((j-0.75)*(j-0.25)); + } +// +__global__ void reduce_sum(double *arr,uint64_t len) { + uint64_t index = blockIdx.x*blockDim.x+threadIdx.x; + if (index < len) + arr[index] += arr[index+len]; + } +// +void reduce(double *arr) { + uint64_t len = nthreads >> 1; + while (1) { + reduce_sum<<<blocks,threads>>>(arr,len); + len = len >> 1; + if (len == 0) + return; + } + } +// +int main(int argc, char** argv) { + int rank,nranks; + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + MPI_Comm_size(MPI_COMM_WORLD,&nranks); + double *arr,result,pi; + cudaMalloc(&arr,nthreads*sizeof(double)); + if (rank == 0) { + for (int i = 0; i < nloop; ++i) { + MPI_Barrier(MPI_COMM_WORLD); + double tstart = MPI_Wtime(); + init<<<blocks,threads>>>(arr,npts,nthreads,rank); + reduce(arr); + cudaDeviceSynchronize(); + cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost); + MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + double tend = MPI_Wtime(); + double dt = tend-tstart; + double gflops = npts*nthreads*nranks*5.0/dt/1e9; + printf("npts = %ld, nthreads = %ld, nranks = %d, pi = %lf\n",npts,nthreads,nranks,pi); + printf("time = %f, estimated GFlops = %f\n",dt,gflops); + } + } + else { + for (int i = 0; i < nloop; ++i) { + MPI_Barrier(MPI_COMM_WORLD); + init<<<blocks,threads>>>(arr,npts,nthreads,rank); + reduce(arr); + cudaDeviceSynchronize(); + cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost); + MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + } + } + cudaFree(arr); + MPI_Finalize(); + return 0; + } + diff --git a/README.md b/README.md index 9cde361e1d9e723ddb0133019b69b33c0ebf3173..613137b737a5fc3af5d76ca362e31899c9724e01 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ |estimated GFlops|code|description|system|date| |---|---|---|---|---| +|2,493,988|[cudampipi.cu](CUDA/cudampipi.cu)|C++, CUDA+MPI, 256 nodes, 1536 ranks<br>nvcc -arch=sm_70 -std=c++11|Oak Ridge OLCF Summit<br>IBM POWER9+NVIDA V100|October 15, 2020| |88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019| |12,589|[cudapit.cu](CUDA/cudapit.cu)|C++, CUDA, 8 GPUs, 5120 cores/GPU|NVIDIA V100|March 3, 2020| |11,083|[mpithreadpi.cpp](hybrid/mpithreadpi.cpp)|C++, MPI+threads, 128 nodes, 64 cores/node, 4 threads/core<br>CC mpithreadpi.cpp -o mpithreadpi -O3 -ffast-math -std=c++11|Argonne ALCF Theta<br>Cray XC40|Mar 8, 2020|