Newer
Older
# pi = 3.14159265358979323846
#
from numba import cuda
import numpy as np
import time
#
# problem size
#
# Threads per block used in the kernel launch configurations below (1024).
block_size = 1 << 10
@cuda.jit
def CUDA_sum(arr,len):
    """Fold one pass of a pairwise reduction: arr[i] += arr[i + len] for i < len.

    arr -- device array, updated in place; must hold at least 2*len elements
    len -- number of active threads, also the offset of the element folded in

    Threads whose global index is len or greater do nothing, so the launch
    grid may be larger than len.
    """
    tid = cuda.grid(1)
    if tid >= len:
        return
    # Reads come from [len, 2*len) and writes go to [0, len), so threads in the
    # same pass never touch each other's elements.
    arr[tid] = arr[tid] + arr[tid + len]
#
def CUDA_reduce(arr,NPTS):
    """Sum the first NPTS elements of device array arr into arr[0], in place.

    Each pass launches CUDA_sum to add the upper half of the active range onto
    the lower half, then halves the range, until a single element remains.

    arr  -- device array with at least NPTS elements; its contents are overwritten
    NPTS -- number of elements to reduce. The halving scheme only yields the
            full sum when NPTS is a power of two: on an odd-length pass the
            trailing element is not folded in.

    Uses the module-level launch parameters grid_size and block_size
    (NOTE(review): grid_size is not defined in the visible part of the file --
    confirm it covers at least NPTS/2 threads).
    """
    half = NPTS >> 1
    # Stop once the stride reaches zero; this also makes NPTS < 2 a no-op
    # instead of looping forever.
    while half > 0:
        CUDA_sum[grid_size,block_size](arr,half)
        half >>= 1
@cuda.jit
def CUDA_result(arr,result):
    """Copy arr[0] into result[0] on the device.

    arr    -- device array whose first element is read
    result -- device array whose first element is written

    Only the thread with global index 0 performs the copy; every other
    thread in the launch returns immediately.
    """
    if cuda.grid(1) != 0:
        return
    result[0] = arr[0]
#
# One-element float32 device buffer. CUDA_result stores arr[0] in it so that a
# single value can be copied back to the host.
result = cuda.device_array(1,np.float32)
#arr = cuda.device_array(NPTS,np.int64) # for testing reduction
#result = cuda.device_array(1,np.int64) # for testing reduction
# NOTE(review): the three timing reports below read start_time, NPTS and pi,
# none of which are assigned in this part of the file. Presumably each report
# closes a timed section whose body is not shown here -- confirm before editing.
end_time = time.time()
# Rate in millions of operations per second; the factor 4.0 is presumably the
# assumed operation count per point for this section -- TODO confirm.
mflops = NPTS*4.0/(1.0e6*(end_time-start_time))
print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
#
end_time = time.time()
# Same estimate with 1 operation per point (presumably the reduction alone).
mflops = NPTS*1.0/(1.0e6*(end_time-start_time))
print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
#
end_time = time.time()
# Same estimate with 5 operations per point (presumably the 4 + 1 above combined).
mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
print(" NPTS = %d, pi = %f"%(NPTS,pi))
print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
#
# CUDA kernel reduction
#
# Kernel launches return to the host before the GPU has finished, so every
# timed region below waits for the device (cuda.synchronize() or a blocking
# copy_to_host()) before reading the clock.
#
init[grid_size,block_size](arr)
cuda.synchronize()  # keep init's run time out of the reduction-only timing
start_time = time.time()
CUDA_reduce(arr,NPTS)
cuda.synchronize()
end_time = time.time()
mflops = NPTS*1.0/(1.0e6*(end_time-start_time))
print("CUDA kernel reduction:")
print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
#
# both with CUDA kernel reduction
#
start_time = time.time()
init[grid_size,block_size](arr)
CUDA_reduce(arr,NPTS)
cuda.synchronize()
# Take a fresh end_time: reusing the one from the previous section would give
# a negative elapsed time.
end_time = time.time()
mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
print("both with CUDA kernel reduction:")
print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
#
# both with CUDA kernel reduction and transfer
#
start_time = time.time()
init[grid_size,block_size](arr)
CUDA_reduce(arr,NPTS)
# A kernel call needs an explicit launch configuration; one thread is enough
# because only global thread 0 does the copy.
CUDA_result[1,1](arr,result)
# copy_to_host() blocks until the preceding kernels have finished, so no extra
# synchronize is needed before reading the clock.
pi = result.copy_to_host()
end_time = time.time()
mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
print("both with CUDA kernel reduction and transfer:")
print(" NPTS = %d, pi = %f"%(NPTS,pi[0]))
print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))