diff --git a/psrdada_cpp/effelsberg/edd/eddfft.cuh b/psrdada_cpp/effelsberg/edd/eddfft.cuh index bc18907f7aaffe0fc0f05a88a5973ec2bcfe319a..94dc6229f7429af6ce25fb08784275e393b51db8 100644 --- a/psrdada_cpp/effelsberg/edd/eddfft.cuh +++ b/psrdada_cpp/effelsberg/edd/eddfft.cuh @@ -6,6 +6,8 @@ #include "thrust/host_vector.h" #include "cufft.h" +#define NTHREADS_UNPACK 512 + namespace psrdada_cpp { namespace effelsberg { namespace edd { @@ -15,7 +17,7 @@ namespace kernels { void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ out, int n); __global__ - void detect_and_accumulate(cufftComplex* __restrict__ in, float* __restrict__ out, int nchans, int nsamps, int naccumulate) + void detect_and_accumulate(cufftComplex* __restrict__ in, float* __restrict__ out, int nchans, int nsamps, int naccumulate); } //kernels diff --git a/psrdada_cpp/effelsberg/edd/src/eddfft.cu b/psrdada_cpp/effelsberg/edd/src/eddfft.cu index 0c4ab129c50dcca26fa3d305ec5cd2dc9f0f5bad..4d8ddfba21ce8be73b4ae7cea8bc1ef83e16151b 100644 --- a/psrdada_cpp/effelsberg/edd/src/eddfft.cu +++ b/psrdada_cpp/effelsberg/edd/src/eddfft.cu @@ -39,8 +39,8 @@ void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ * Note: This kernels will not work with more than 512 threads. */ - __shared__ volatile float tmp_out[NTHREADS * 16]; - __shared__ volatile uint64_t tmp_in[NTHREADS * 3]; + __shared__ volatile float tmp_out[NTHREADS_UNPACK * 16]; + __shared__ volatile uint64_t tmp_in[NTHREADS_UNPACK * 3]; int block_idx = blockIdx.x; uint64_t val; @@ -51,10 +51,10 @@ void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ { //Read to shared memeory - int block_read_start = block_idx * NTHREADS * 3; + int block_read_start = block_idx * NTHREADS_UNPACK * 3; tmp_in[threadIdx.x] = in[block_read_start + threadIdx.x]; - tmp_in[NTHREADS + threadIdx.x] = in[block_read_start + NTHREADS + threadIdx.x]; - tmp_in[NTHREADS * 2 + threadIdx.x] = in[block_read_start + NTHREADS * 2 + threadIdx.x]; + tmp_in[NTHREADS_UNPACK + threadIdx.x] = in[block_read_start + NTHREADS_UNPACK + threadIdx.x]; + tmp_in[NTHREADS_UNPACK * 2 + threadIdx.x] = in[block_read_start + NTHREADS_UNPACK * 2 + threadIdx.x]; __syncthreads(); @@ -83,9 +83,9 @@ void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ __syncthreads(); - int block_write_start = block_idx * NTHREADS * 16; + int block_write_start = block_idx * NTHREADS_UNPACK * 16; - for (int ii = threadIdx.x; ii < 16 * NTHREADS; ii+=blockDim.x) + for (int ii = threadIdx.x; ii < 16 * NTHREADS_UNPACK; ii+=blockDim.x) { out[block_write_start+ii] = tmp_out[ii]; }