diff --git a/CUDA/cudapi.cu b/CUDA/cudapi.cu
index b755b845b2f2d0466db1e9087724f86058ee0d0d..987094532503e7331a816f68489893d4df045249 100644
--- a/CUDA/cudapi.cu
+++ b/CUDA/cudapi.cu
@@ -35,7 +35,7 @@ void reduce(double *arr) {
    uint64_t len = npts >> 1;
    while (1) {
       reduce_sum<<<blocks,threads>>>(arr,len);
-      cudaCheck("reduce");
+      cudaCheck("reduce_sum");
       len = len >> 1;
       if (len == 0)
          return;
@@ -50,10 +50,12 @@ int main(void) {
    cudaCheck("init");
    reduce(darr);
    cudaDeviceSynchronize();
+   cudaCheck("cudaDeviceSynchronize");
    auto tend = std::chrono::high_resolution_clock::now();        
 	auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
    auto mflops = npts*nloop*5.0/dt;
    cudaMemcpy(harr,darr,8,cudaMemcpyDeviceToHost);
+   cudaCheck("cudaMemcpy");
    printf("npts = %ld, nloop = %ld, pi = %lf\n",npts,nloop,harr[0]);
    printf("time = %f, estimated MFlops = %f\n",1e-6*dt,mflops);
    cudaFree(darr);