diff --git a/Python/numbapig.py b/Python/numbapig.py
index f8a562e66edc598d9cd7c6e83f3c8d28eb634d11..71c8eb8a00e394e92b8bae309fadb6d222edfd49 100644
--- a/Python/numbapig.py
+++ b/Python/numbapig.py
@@ -36,6 +36,7 @@ def CUDA_reduce(arr,NPTS):
    len = NPTS >> 1
    while (1):
       CUDA_sum[grid_size,block_size](arr,len)
+      cuda.synchronize()  # wait for this pass before launching the next (halved) one
       len = len >> 1
       if (len == 0):
          return
@@ -64,6 +65,7 @@ CUDA_result(arr,result)
 #
 start_time = time.time()
 init[grid_size,block_size](arr)
+cuda.synchronize()  # wait for init so the timing covers the kernel's execution
 end_time = time.time()
 mflops = NPTS*4.0/(1.0e6*(end_time-start_time))
 print("CUDA kernel array calculation:")
@@ -83,6 +85,7 @@ print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
 #
 start_time = time.time()
 init[grid_size,block_size](arr)
+cuda.synchronize()  # wait for init to finish before Numba_reduce(arr) runs
 pi = Numba_reduce(arr)
 end_time = time.time()
 mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
@@ -104,8 +107,10 @@ print("   time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
 #
 start_time = time.time()
 init[grid_size,block_size](arr)
+cuda.synchronize()  # wait for init to finish before the reduction kernels start
 CUDA_reduce(arr,NPTS)
 CUDA_result(arr,result)
+cuda.synchronize()  # wait for the reduce/result work so end_time includes it
 end_time = time.time()
 pi = result.copy_to_host()
 mflops = NPTS*5.0/(1.0e6*(end_time-start_time))