//
// cudampipi.cu
// Neil Gershenfeld 10/14/20
// calculation of pi by a CUDA+MPI sum
// assumes one GPU per MPI rank
// pi = 3.14159265358979323846
//
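// a possible build/run sketch (compiler wrapper, launcher, and rank count
// are assumptions, not part of the original source):
//   nvcc -ccbin mpicxx -o cudampipi cudampipi.cu
//   mpirun -np 2 ./cudampipi
// one visible GPU per MPI rank is assumed; on a multi-GPU node a device
// could be selected per rank, e.g. (also an assumption):
//   int ndev; cudaGetDeviceCount(&ndev); cudaSetDevice(rank%ndev);
//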
#include <iostream>
#include <cstdio>
#include <cstdint>
#include <mpi.h>
//
using namespace std;
//
uint64_t blocks = 1024;              // CUDA blocks per kernel launch
uint64_t threads = 1024;             // threads per block
uint64_t npts = 1000000;             // series terms summed by each thread
uint64_t nthreads = blocks*threads;  // total GPU threads per rank
int nloop = 10;                      // number of timed repetitions
//
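// worked identity behind the kernel below (checked by partial fractions;
// not part of the original comments):
//   0.5/((j-0.75)*(j-0.25)) = 4/(4j-3) - 4/(4j-1)
// so summing over j = 1,2,3,... gives 4*(1 - 1/3 + 1/5 - 1/7 + ...) = pi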
__global__ void init(double *arr,uint64_t npts,uint64_t nthreads,int rank) {
  // each thread sums its own contiguous block of npts terms, offset so
  // that every MPI rank covers a distinct range of the series
  uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
  uint64_t start = npts*index+npts*nthreads*rank+1;
  uint64_t end = npts*(index+1)+npts*nthreads*rank+1;
  arr[index] = 0;
  for (uint64_t j = start; j < end; ++j)
    arr[index] += 0.5/((j-0.75)*(j-0.25));
}
//
// one halving step of a pairwise tree reduction: fold the upper half of
// the partial sums into the lower half
__global__ void reduce_sum(double *arr,uint64_t len) {
  uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
  if (index < len)
    arr[index] += arr[index+len];
}
//
// tree-reduce the nthreads partial sums down to arr[0]; launches on the
// same (default) stream execute in order, so no sync is needed between steps
void reduce(double *arr) {
  uint64_t len = nthreads >> 1;
  while (1) {
    reduce_sum<<<blocks,threads>>>(arr,len);
    len = len >> 1;
    if (len == 0)
      return;
  }
}
//
int main(int argc, char** argv) {
  int rank,nranks;
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&nranks);
  double *arr,result,pi;
  cudaMalloc(&arr,nthreads*sizeof(double)); // one partial sum per GPU thread
  if (rank == 0) {
    // root rank: time each iteration and report the reduced result
    for (int i = 0; i < nloop; ++i) {
      MPI_Barrier(MPI_COMM_WORLD);
      double tstart = MPI_Wtime();
      init<<<blocks,threads>>>(arr,npts,nthreads,rank);
      reduce(arr);
      cudaDeviceSynchronize();
      cudaMemcpy(&result,arr,sizeof(double),cudaMemcpyDeviceToHost); // arr[0] holds this rank's sum
      MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
      double tend = MPI_Wtime();
      double dt = tend-tstart;
      double gflops = npts*nthreads*nranks*5.0/dt/1e9; // ~5 flops per series term
      printf("npts = %lu, nthreads = %lu, nranks = %d, pi = %lf\n",
        (unsigned long)npts,(unsigned long)nthreads,nranks,pi);
      printf("time = %f, estimated GFlops = %f\n",dt,gflops);
    }
  }
  else {
    // other ranks: compute their partial sums and contribute to the reduction
    for (int i = 0; i < nloop; ++i) {
      MPI_Barrier(MPI_COMM_WORLD);
      init<<<blocks,threads>>>(arr,npts,nthreads,rank);
      reduce(arr);
      cudaDeviceSynchronize();
      cudaMemcpy(&result,arr,sizeof(double),cudaMemcpyDeviceToHost);
      MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
    }
  }
  cudaFree(arr);
  MPI_Finalize();
  return 0;
}