Skip to content
Snippets Groups Projects
cudapi.cu 1.83 KiB
Newer Older
  • Learn to ignore specific revisions
  • Neil Gershenfeld's avatar
    wip
    Neil Gershenfeld committed
    //
    // cudapi.cu
    // Neil Gershenfeld 3/1/20
    // calculation of pi by a CUDA sum
    // pi = 3.14159265358979323846 
    //
    #include <iostream>
    #include <chrono>
    
    Neil Gershenfeld's avatar
    Neil Gershenfeld committed
    #include <string>
    
    Neil Gershenfeld's avatar
    Neil Gershenfeld committed
    using namespace std;
    
    Neil Gershenfeld's avatar
    wip
    Neil Gershenfeld committed
    // launch geometry and problem size shared by the kernels and main
    uint64_t blocks{1024};           // blocks per kernel launch
    uint64_t threads{1024};          // threads per block
    uint64_t nloop{1000000};         // series terms summed by each thread
    uint64_t npts{blocks*threads};   // total threads, one partial sum each
    
    Neil Gershenfeld's avatar
    Neil Gershenfeld committed
    //
    // cudaCheck: fetch (and clear) the most recent CUDA runtime error and,
    // if there is one, print "<msg>: <error string>" to stderr
    // execution continues either way
    //
    void cudaCheck(string msg) {
       const cudaError status = cudaGetLastError();
       if (status == cudaSuccess)
          return;
       cerr << msg << ": " << cudaGetErrorString(status) << endl;
       }
    
    Neil Gershenfeld's avatar
    wip
    Neil Gershenfeld committed
    //
    // init: each thread sums nloop consecutive terms 0.5/((j-0.75)*(j-0.25))
    // of the pi series and stores its partial sum in arr[i], where i is the
    // thread's global index; thread i covers j = nloop*i+1 ... nloop*(i+1)
    //
    // arr must hold one element per launched thread (there is no bounds check)
    //
    __global__ void init(double *arr,uint64_t nloop) {
       // widen before multiplying so the index cannot wrap at 32 bits
       uint64_t i = (uint64_t)blockIdx.x*blockDim.x+threadIdx.x;
       uint64_t start = nloop*i+1;
       uint64_t end = nloop*(i+1)+1;
       // accumulate locally and store once instead of updating arr[i] every iteration
       double sum = 0;
       for (uint64_t j = start; j < end; ++j)
          sum += 0.5/((j-0.75)*(j-0.25));
       arr[i] = sum;
       }
    //
    // reduce_sum: one pairwise reduction step
    // every thread whose global index i is below len adds arr[i+len] into
    // arr[i]; threads with i >= len do nothing
    // arr must hold at least 2*len elements
    //
    __global__ void reduce_sum(double *arr,uint64_t len) {
       // widen before multiplying so the index cannot wrap at 32 bits
       uint64_t i = (uint64_t)blockIdx.x*blockDim.x+threadIdx.x;
       if (i < len)
          arr[i] += arr[i+len];
       }
    
    Neil Gershenfeld's avatar
    Neil Gershenfeld committed
    //
    // reduce: sum the npts elements of the device array arr into arr[0]
    // by repeated halving with reduce_sum (the other elements are overwritten
    // with intermediate sums)
    //
    // each pass launches only as many blocks as the current half-length
    // needs, and an odd element count is handled by first folding the last
    // element into arr[0], so npts need not be a power of two
    // launches are only queued here; the caller must synchronize before
    // reading the result
    //
    void reduce(double *arr) {
       uint64_t len = npts;
       while (len > 1) {
          if (len & 1) {
             // a single thread (i = 0) adds arr[len-1] into arr[0]
             reduce_sum<<<1,1>>>(arr,len-1);
             cudaCheck("reduce_sum");
             --len;
             }
          len = len >> 1;
          // enough blocks to cover len threads; reduce_sum ignores the excess
          uint64_t grid = (len+threads-1)/threads;
          reduce_sum<<<grid,threads>>>(arr,len);
          cudaCheck("reduce_sum");
          }
       }
    
    Neil Gershenfeld's avatar
    wip
    Neil Gershenfeld committed
    //
    // main: allocate one partial sum per thread on the device, fill them
    // with init, fold them into element 0 with reduce, then copy that element
    // back and print it with the elapsed time and an MFlops estimate
    //
    // returns 0 on success, 1 as soon as a checked CUDA call reports an error
    //
    int main(void) {
       double pi = 0;
       double *darr = nullptr;
       // report any pending CUDA error through cudaCheck and tell the caller
       // whether there was one (cudaCheck itself clears the error)
       auto failed = [](const char *what) {
          bool bad = (cudaPeekAtLastError() != cudaSuccess);
          cudaCheck(what);
          return bad;
          };
       // release the device buffer and produce the failure exit code
       auto bail = [&darr]() {
          cudaFree(darr);
          return 1;
          };
       cudaMalloc(&darr,npts*sizeof(double));
       if (failed("cudaMalloc"))
          return 1; // allocation failed: do not launch kernels on an invalid pointer
       auto tstart = std::chrono::high_resolution_clock::now();
       init<<<blocks,threads>>>(darr,nloop);
       if (failed("init"))
          return bail();
       reduce(darr);
       // kernel launches are asynchronous; wait so the timing covers the work
       cudaDeviceSynchronize();
       if (failed("cudaDeviceSynchronize"))
          return bail();
       auto tend = std::chrono::high_resolution_clock::now();
       auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
       // operations per microsecond, counting 5 floating-point operations per term
       auto mflops = npts*nloop*5.0/dt;
       cudaMemcpy(&pi,darr,sizeof(double),cudaMemcpyDeviceToHost);
       if (failed("cudaMemcpy"))
          return bail();
       // uint64_t is printed through unsigned long long so the format is portable
       printf("npts = %llu, nloop = %llu, pi = %lf\n",
          (unsigned long long)npts,(unsigned long long)nloop,pi);
       printf("time = %f, estimated MFlops = %f\n",1e-6*dt,mflops);
       cudaFree(darr);
       return 0;
       }