diff --git a/CUDA/cudapi.cu b/CUDA/cudapi.cu index b755b845b2f2d0466db1e9087724f86058ee0d0d..987094532503e7331a816f68489893d4df045249 100644 --- a/CUDA/cudapi.cu +++ b/CUDA/cudapi.cu @@ -35,7 +35,7 @@ void reduce(double *arr) { uint64_t len = npts >> 1; while (1) { reduce_sum<<<blocks,threads>>>(arr,len); - cudaCheck("reduce"); + cudaCheck("reduce_sum"); len = len >> 1; if (len == 0) return; @@ -50,10 +50,12 @@ int main(void) { cudaCheck("init"); reduce(darr); cudaDeviceSynchronize(); + cudaCheck("cudaDeviceSynchronize"); auto tend = std::chrono::high_resolution_clock::now(); auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count(); auto mflops = npts*nloop*5.0/dt; cudaMemcpy(harr,darr,8,cudaMemcpyDeviceToHost); + cudaCheck("cudaMemcpy"); printf("npts = %ld, nloop = %ld, pi = %lf\n",npts,nloop,harr[0]); printf("time = %f, estimated MFlops = %f\n",1e-6*dt,mflops); cudaFree(darr);