Skip to content
Snippets Groups Projects
pi.cu 2.23 KiB
Newer Older
  • Learn to ignore specific revisions
  • /*
    * pi.cu
    * Thras Karydis  2/4/2017
    * use CUDA to evaluate pi by summation
    */
    
    #include <stdio.h>
    #include <cuda.h>
    #include <sys/time.h>
    
    #define NLOOP 1
    #define NPTS 1000000000  // Number of points (one billion)
    #define NUM_THREAD  256  // Number of threads per block (max is 1024)
    #define NUM_BLOCK  (NPTS + NUM_THREAD - 1) / NUM_THREAD // Number of blocks per grid
    
    int tid;
    double pi = 0;
    
    // Kernel that executes on the GPU
    __global__ void compute_pi(float *sum) {
    
            unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;  // Sequential thread index across the blocks
            if (idx < NPTS)
            sum[idx] = 0.5/((idx-0.75)*(idx-0.25));
    }
    
    // Main routine that executes on the host
    int main(void) {
    
            unsigned long int start_time,end_time;
            struct timeval start,end;
            double flops;
    
            int blocksPerGrid = (NPTS + NUM_THREAD - 1) / NUM_THREAD; // Compute number of blocks needed
            float *sumHost, *sumDev;  // Pointer to host & device arrays
    
            size_t size = blocksPerGrid*NUM_THREAD*sizeof(float);  //Array memory size
            sumHost = (float *)malloc(size);  //  Allocate array on host
            cudaMalloc((void **) &sumDev, size);  // Allocate array on device
            // Initialize array in device to 0
            cudaMemset(sumDev, 0, size);
    
            gettimeofday(&start, NULL);
    
            // Do calculation on device
            compute_pi <<<blocksPerGrid, NUM_THREAD>>> (sumDev); // call CUDA kernel
            // Retrieve result from device and store it in host array
            cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost);
            // Reduce
            for(tid=1; tid<NPTS; tid++)
                    pi += sumHost[tid];
    
            gettimeofday(&end, NULL);
    
            // Print results
            start_time = start.tv_sec * 1e6 + start.tv_usec;
            end_time = end.tv_sec * 1e6 + end.tv_usec;
            flops = NLOOP*(NPTS*(5.0/(end_time-start_time)));
            printf("processes = %d: %d blocks and %d threads per block\n", blocksPerGrid*NUM_THREAD, blocksPerGrid, NUM_THREAD);
            printf("NPTS = %d, NLOOP = %d, pi = %f\n", NPTS, NLOOP, pi/NLOOP);
            printf("time = %f, estimated GFlops = %f\n",(end_time-start_time)/1.0e9,flops);
    
            // Cleanup
            free(sumHost);
            cudaFree(sumDev);
    
            return 0;
    }