diff --git a/Python/numbapig.py b/Python/numbapig.py index f8a562e66edc598d9cd7c6e83f3c8d28eb634d11..71c8eb8a00e394e92b8bae309fadb6d222edfd49 100644 --- a/Python/numbapig.py +++ b/Python/numbapig.py @@ -36,6 +36,7 @@ def CUDA_reduce(arr,NPTS): len = NPTS >> 1 while (1): CUDA_sum[grid_size,block_size](arr,len) + cuda.synchronize() len = len >> 1 if (len == 0): return @@ -64,6 +65,7 @@ CUDA_result(arr,result) # start_time = time.time() init[grid_size,block_size](arr) +cuda.synchronize() end_time = time.time() mflops = NPTS*4.0/(1.0e6*(end_time-start_time)) print("CUDA kernel array calculation:") @@ -83,6 +85,7 @@ print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops)) # start_time = time.time() init[grid_size,block_size](arr) +cuda.synchronize() pi = Numba_reduce(arr) end_time = time.time() mflops = NPTS*5.0/(1.0e6*(end_time-start_time)) @@ -104,8 +107,10 @@ print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops)) # start_time = time.time() init[grid_size,block_size](arr) +cuda.synchronize() CUDA_reduce(arr,NPTS) CUDA_result(arr,result) +cuda.synchronize() end_time = time.time() pi = result.copy_to_host() mflops = NPTS*5.0/(1.0e6*(end_time-start_time))