Skip to content
Snippets Groups Projects
Commit 7e08688e authored by Neil Gershenfeld's avatar Neil Gershenfeld
Browse files

wip

parent c28495fc
No related branches found
No related tags found
No related merge requests found
Pipeline #5102 passed
//
// cudapit.cu
// Neil Gershenfeld 3/1/20
// calculation of pi by a CUDA multi-GPU thread sum
// pi = 3.14159265358979323846
//
#include <iostream>
#include <chrono>
#include <thread>
#include <vector>
#include <cstdint>
uint64_t blocks = 1024;
uint64_t threads = 1024;
uint64_t nloop = 1000000;
uint64_t npts = blocks*threads;
std::vector<double> results;
__global__ void init(double *arr,uint64_t nloop,uint64_t npts,int index) {
uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
uint64_t start = nloop*i+npts*nloop*index+1;
uint64_t end = nloop*(i+1)+npts*nloop*index+1;
arr[i] = 0;
for (uint64_t j = start; j < end; ++j)
arr[i] += 0.5/((j-0.75)*(j-0.25));
}
__global__ void reduce_sum(double *arr,uint64_t len) {
uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
if (i < len)
arr[i] += arr[i+len];
}
void reduce(double *arr) {
uint64_t len = npts >> 1;
while (1) {
reduce_sum<<<blocks,threads>>>(arr,len);
len = len >> 1;
if (len == 0)
return;
}
}
void sum(int index) {
cudaSetDevice(index);
double harr[1],*darr;
cudaMalloc(&darr,npts*sizeof(double));
init<<<blocks,threads>>>(darr,nloop,npts,index);
reduce(darr);
cudaDeviceSynchronize();
cudaMemcpy(harr,darr,8,cudaMemcpyDeviceToHost);
results[index] = harr[0];
cudaFree(darr);
}
int main(void) {
int ngpus;
cudaGetDeviceCount(&ngpus);
std::thread threads[ngpus];
double pi = 0;
auto tstart = std::chrono::high_resolution_clock::now();
for (int i = 0; i < ngpus; ++i) {
results.push_back(0);
threads[i] = std::thread(sum,i);
}
for (int i = 0; i < ngpus; ++i) {
threads[i].join();
pi += results[i];
}
auto tend = std::chrono::high_resolution_clock::now();
auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
auto gflops = npts*nloop*ngpus*5.0/dt/1e3;
std::cout << "npts: " << npts << " nloop: " << nloop << " ngpus: " << ngpus << " pi: " << pi << '\n';
std::cout << "time: " << 1e-6*dt << " estimated GFlops: " << gflops << '\n';
return 0;
}
......@@ -4,6 +4,7 @@
|estimated GFlops|code|description|system|date|
|---|---|---|---|---|
|88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019|
|5,738.|[cudapit.cu](CUDA/cudapit.cu)|C++, CUDA, 4 GPUs, 5120 cores/GPU|NVIDIA V100|March 1, 2020|
|2,117|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 10 nodes, 96 cores/node<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|
|2,102|[mpipi2.py](Python/mpipi2.py)|Python, Numba, MPI<br>10 nodes, 96 cores/node|Intel 2x Xeon Platinum 8175M|Feb 6, 2020|
|1,635|[cudapi.cu](CUDA/cudapi.cu)|C++, CUDA, 5120 cores|NVIDIA V100|March 1, 2020|
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment