    //
    // cudampipi.cu
    // Neil Gershenfeld 10/14/20
    // calculation of pi by a CUDA+MPI sum
    // assumes one GPU per MPI rank
    // pi = 3.14159265358979323846 
    //
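     // the per-term expression pairs consecutive Leibniz-series terms:
     //   0.5/((j-0.75)*(j-0.25)) = 4/(4j-3) - 4/(4j-1)
     // so the sum over j = 1,2,3,... converges to pi
     //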
     #include <iostream>
     #include <cstdint>
     #include <cstdio>
     #include <cinttypes>
     #include <mpi.h>
    //
    using namespace std;
    //
    uint64_t blocks = 1024;
    uint64_t threads = 1024;
    uint64_t npts = 1000000;
    uint64_t nthreads = blocks*threads;
    int nloop = 10;
    //
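     // init: each thread accumulates npts consecutive terms of the series;
     // the offset npts*nthreads*rank keeps MPI ranks on disjoint index ranges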
    __global__ void init(double *arr,uint64_t npts,uint64_t nthreads,int rank) {
       uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
       uint64_t start = npts*index+npts*nthreads*rank+1;
       uint64_t end = npts*(index+1)+npts*nthreads*rank+1;
       arr[index] = 0;
       for (uint64_t j = start; j < end; ++j)
          arr[index] += 0.5/((j-0.75)*(j-0.25));
       }
    //
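     // reduce_sum: one tree-reduction step; each of the first len partial
     // sums absorbs the partial sum len positions above it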
    __global__ void reduce_sum(double *arr,uint64_t len) {
       uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
       if (index < len)
          arr[index] += arr[index+len];
       }
    //
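     // reduce: repeatedly halve len until arr[0] holds this rank's total;
     // launches on the default stream execute in order, so no explicit
     // synchronization is needed between steps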
    void reduce(double *arr) {
       uint64_t len = nthreads >> 1;
       while (1) {
          reduce_sum<<<blocks,threads>>>(arr,len);
          len = len >> 1;
          if (len == 0)
             return;
          }
       }
    //
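     // main: each rank computes and reduces its partial sum on its GPU,
     // copies arr[0] to the host, and MPI_Reduce adds the per-rank partials
     // into pi on rank 0; rank 0 times each pass and reports estimated
     // GFlops, counting 5 floating-point operations per series term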
    int main(int argc, char** argv) {
       int rank,nranks;
       MPI_Init(&argc,&argv);
       MPI_Comm_rank(MPI_COMM_WORLD,&rank);
       MPI_Comm_size(MPI_COMM_WORLD,&nranks);
       double *arr,result,pi;
       cudaMalloc(&arr,nthreads*sizeof(double));
       if (rank == 0) {
          for (int i = 0; i < nloop; ++i) {
             MPI_Barrier(MPI_COMM_WORLD);
              double tstart = MPI_Wtime();
             init<<<blocks,threads>>>(arr,npts,nthreads,rank);
             reduce(arr);
             cudaDeviceSynchronize();
              cudaMemcpy(&result,arr,sizeof(double),cudaMemcpyDeviceToHost);
             MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
              double tend = MPI_Wtime();
             double dt = tend-tstart;
             double gflops = npts*nthreads*nranks*5.0/dt/1e9;
             printf("npts = %ld, nthreads = %ld, nranks = %d, pi = %lf\n",npts,nthreads,nranks,pi);
             printf("time = %f, estimated GFlops = %f\n",dt,gflops);
             }
          }
       else {
          for (int i = 0; i < nloop; ++i) {
             MPI_Barrier(MPI_COMM_WORLD);
             init<<<blocks,threads>>>(arr,npts,nthreads,rank);
             reduce(arr);
             cudaDeviceSynchronize();
              cudaMemcpy(&result,arr,sizeof(double),cudaMemcpyDeviceToHost);
             MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
             }
          }
       cudaFree(arr);
       MPI_Finalize();
       return 0;
       }
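     //
     // build/run sketch -- the toolchain, flags, and rank count below are
     // assumptions, not part of the original:
     //   nvcc -O3 -ccbin=mpicxx cudampipi.cu -o cudampipi
     //   mpirun -np 2 ./cudampipi
     //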