diff --git a/CUDA/cudapit.cu b/CUDA/cudapit.cu
new file mode 100755
index 0000000000000000000000000000000000000000..e168bf0ebf1c817fdb98e1ef4a65fb7052cf5b2b
--- /dev/null
+++ b/CUDA/cudapit.cu
@@ -0,0 +1,110 @@
+//
+// cudapit.cu
+// Neil Gershenfeld 3/1/20
+// calculation of pi by a CUDA multi-GPU thread sum
+// pi = 3.14159265358979323846
+//
+#include <iostream>
+#include <chrono>
+#include <thread>
+#include <vector>
+#include <cstdint>
+#include <cstdlib>
+uint64_t blocks = 1024;
+uint64_t threads = 1024;
+uint64_t nloop = 1000000;
+uint64_t npts = blocks*threads;
+std::vector<double> results;
+//
+// check: report a failed CUDA runtime call and abort
+//    err: status returned by the call, what: label printed with the error
+//
+static void check(cudaError_t err,const char *what) {
+   if (err != cudaSuccess) {
+      std::cerr << what << ": " << cudaGetErrorString(err) << '\n';
+      std::abort();
+      }
+   }
+//
+// init: thread i sums nloop consecutive series terms into arr[i]
+//    arr: device array of npts partial sums
+//    nloop: terms per thread, npts: partial sums per GPU
+//    index: GPU number, offsets the terms past those of lower-numbered GPUs
+//
+__global__ void init(double *arr,uint64_t nloop,uint64_t npts,int index) {
+   uint64_t i = (uint64_t)blockIdx.x*blockDim.x+threadIdx.x;
+   if (i >= npts)
+      return; // launch may cover more threads than partial sums
+   uint64_t start = nloop*i+npts*nloop*index+1;
+   uint64_t end = start+nloop;
+   double acc = 0; // accumulate locally, store to arr once
+   for (uint64_t j = start; j < end; ++j)
+      acc += 0.5/((j-0.75)*(j-0.25));
+   arr[i] = acc;
+   }
+//
+// reduce_sum: one halving pass, adds arr[i+len] into arr[i] for i < len
+//
+__global__ void reduce_sum(double *arr,uint64_t len) {
+   uint64_t i = (uint64_t)blockIdx.x*blockDim.x+threadIdx.x;
+   if (i < len)
+      arr[i] += arr[i+len];
+   }
+//
+// reduce: sum arr[0..npts-1] into arr[0] by repeated halving
+//    assumes npts is a power of two; other sizes drop elements
+//
+void reduce(double *arr) {
+   for (uint64_t len = npts >> 1; len > 0; len >>= 1) {
+      uint64_t nblocks = (len+threads-1)/threads; // only len threads have work
+      reduce_sum<<<nblocks,threads>>>(arr,len);
+      }
+   }
+//
+// sum: worker run by one host thread per GPU
+//    selects GPU index, sums its share of the series, stores it in results[index]
+//    results must already hold an element for index
+//
+void sum(int index) {
+   check(cudaSetDevice(index),"cudaSetDevice");
+   double total = 0,*darr = nullptr;
+   check(cudaMalloc(&darr,npts*sizeof(double)),"cudaMalloc");
+   init<<<blocks,threads>>>(darr,nloop,npts,index);
+   check(cudaGetLastError(),"init launch");
+   reduce(darr);
+   check(cudaGetLastError(),"reduce_sum launch");
+   check(cudaDeviceSynchronize(),"cudaDeviceSynchronize");
+   check(cudaMemcpy(&total,darr,sizeof(double),cudaMemcpyDeviceToHost),"cudaMemcpy");
+   results[index] = total;
+   check(cudaFree(darr),"cudaFree");
+   }
+//
+// main: run one worker thread per GPU, add their results, report pi and timing
+//
+int main(void) {
+   int ngpus = 0;
+   check(cudaGetDeviceCount(&ngpus),"cudaGetDeviceCount");
+   if (ngpus < 1) {
+      std::cerr << "no CUDA devices found\n";
+      return 1;
+      }
+   // size results before any worker starts; growing it while workers
+   // store into it could move the storage out from under them
+   results.assign(ngpus,0.0);
+   std::vector<std::thread> workers; // replaces a non-standard variable-length array
+   workers.reserve(ngpus);
+   double pi = 0;
+   auto tstart = std::chrono::high_resolution_clock::now();
+   for (int i = 0; i < ngpus; ++i)
+      workers.emplace_back(sum,i);
+   for (int i = 0; i < ngpus; ++i) {
+      workers[i].join();
+      pi += results[i];
+      }
+   auto tend = std::chrono::high_resolution_clock::now();
+   auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
+   auto gflops = npts*nloop*ngpus*5.0/dt/1e3;
+   std::cout << "npts: " << npts << " nloop: " << nloop << " ngpus: " << ngpus << " pi: " << pi << '\n';
+   std::cout << "time: " << 1e-6*dt << " estimated GFlops: " << gflops << '\n';
+   return 0;
+   }
diff --git a/README.md b/README.md
index a4ce0b087d155aff1d262730057ab5eb8b73783a..b3850a10ef2e931997e535d90a7a755a543761aa 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,7 @@
 |estimated GFlops|code|description|system|date|
 |---|---|---|---|---|
 |88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019|
+|5,738.|[cudapit.cu](CUDA/cudapit.cu)|C++, CUDA, 4 GPUs, 5120 cores/GPU|NVIDIA V100|March 1, 2020|
 |2,117|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 10 nodes, 96 cores/node<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|
 |2,102|[mpipi2.py](Python/mpipi2.py)|Python, Numba, MPI<br>10 nodes, 96 cores/node|Intel 2x Xeon Platinum 8175M|Feb 6, 2020|
 |1,635|[cudapi.cu](CUDA/cudapi.cu)|C++, CUDA, 5120 cores|NVIDIA V100|March 1, 2020|