diff --git a/CUDA/cudapi.cu b/CUDA/cudapi.cu index 73276a9ae9651416ff20c4e0ce677dbc2d925f15..2586924fe8b629ab7f9ba8b44e961b18ee6ca969 100755 --- a/CUDA/cudapi.cu +++ b/CUDA/cudapi.cu @@ -7,10 +7,27 @@ #include <iostream> #include <chrono> #include <cstdint> +#include <string> uint64_t blocks = 1024; uint64_t threads = 1024; uint64_t nloop = 1000000; uint64_t npts = blocks*threads; +void cudaCheck(string msg) { + cudaError err; + err = cudaGetLastError(); + if (cudaSuccess != err) + cerr << msg << ": " << cudaGetErrorString(err) << endl; + } +void reduce(double *arr) { + uint64_t len = npts >> 1; + while (1) { + reduce_sum<<<blocks,threads>>>(arr,len); + cudaCheck("reduce"); + len = len >> 1; + if (len == 0) + return; + } + } __global__ void init(double *arr,uint64_t nloop) { uint64_t i = blockIdx.x*blockDim.x+threadIdx.x; uint64_t start = nloop*i+1; @@ -24,22 +41,6 @@ __global__ void reduce_sum(double *arr,uint64_t len) { if (i < len) arr[i] += arr[i+len]; } -void reduce(double *arr) { - uint64_t len = npts >> 1; - while (1) { - reduce_sum<<<blocks,threads>>>(arr,len); - cudaCheck("reduce"); - len = len >> 1; - if (len == 0) - return; - } - } -void cudaCheck(string msg) { - cudaError err; - err = cudaGetLastError(); - if (cudaSuccess != err) - cerr << msg << ": " << cudaGetErrorString(err) << endl; - } int main(void) { double harr[1],*darr; cudaMalloc(&darr,npts*sizeof(double));