diff --git a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
index 6302d105d6dd02c35460e3130648b4ee3322effa..8393c6815144278fde4250c08b0c68b0b940044f 100644
--- a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
+++ b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
@@ -16,7 +16,7 @@ __global__
 void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out,
     int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
 {
-  // grid stride loop over output array to keep 
+  // grid stride loop over output array to keep
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
   {
     double sum = 0.0f;
@@ -30,36 +30,52 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
       double y = tmp.y * tmp.y;
       sum += x + y;
     }
-    size_t toff = out_offset * nchans + currentOutputSpectra * nchans *stride;
+    size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
     out[toff + i] += (int8_t) ((sum - offset)/scale);
+    // no atomic add for int8, thus no optimized version here. Atomic add can be
+    // implemented using an int32 atomicAdd and bit shifting, but this needs more effort.
   }
 }
 
-
 template <typename T>
 __global__
 void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
     int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
 {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
+  // Split the naccumulate input spectra of every output bin into nb partial
+  // blocks of bs samples; independent threads reduce the partial blocks and
+  // combine them via atomicAdd.
+  const int nb = naccumulate / blockDim.x + 1;
+  const int bs = blockDim.x;
+  const int number_of_spectra = nsamps / (nchans * naccumulate);
+
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate * nb); i += blockDim.x * gridDim.x)
   {
-    double sum = 0;
-    size_t currentOutputSpectra = i / nchans;
-    size_t currentChannel = i % nchans;
+    const size_t bn = i / nchans / number_of_spectra;
+    const size_t currentOutputSpectra = (i / nchans) % number_of_spectra;
+    const size_t currentChannel = i % nchans;
 
-    for (size_t j = 0; j < naccumulate; j++)
+    double sum = 0;
+    for (size_t k = 0; k < bs; k++)
     {
+      size_t j = k + bn * bs;
+      if (j >= naccumulate)
+        break;
+
       float2 tmp = in[ j * nchans + currentOutputSpectra * nchans * naccumulate + currentChannel];
       double x = tmp.x * tmp.x;
       double y = tmp.y * tmp.y;
       sum += x + y;
     }
     size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
-    out[i + toff] += sum;
+
+    // Subtract the offset only once per output bin (in the bn == 0 partial
+    // block); the other partial blocks contribute plain power sums.
+    const float block_offset = (bn == 0) ? offset : 0.0f;
+    atomicAdd(&out[toff + currentChannel], ((sum - block_offset)/scale));
   }
 }
 
 } // namespace kernels
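
Reviewer note on the comment added to the int8_t kernel: CUDA indeed has no native atomicAdd for 8-bit integers, and a plain 32-bit atomicAdd on the containing word is not enough, because a carry out of the target byte would corrupt its neighbours. Below is a minimal sketch of the byte-wise atomic add the comment alludes to, built on a 32-bit atomicCAS loop; the helper name atomicAddInt8 is hypothetical and not part of this patch.

#include <stdint.h>

__device__ void atomicAddInt8(int8_t* address, int8_t val)
{
  // Locate the 4-byte aligned word holding the target byte, and the byte's
  // bit position inside that word.
  size_t addr = reinterpret_cast<size_t>(address);
  unsigned int* base = reinterpret_cast<unsigned int*>(addr & ~size_t(3));
  unsigned int shift = (addr & 3) * 8;
  unsigned int mask = 0xffu << shift;

  unsigned int old = *base;
  unsigned int assumed;
  do
  {
    assumed = old;
    // Extract the byte, add with int8 wrap-around, and splice the result
    // back into the word without touching the neighbouring bytes.
    int8_t current = static_cast<int8_t>((assumed & mask) >> shift);
    unsigned int updated = (assumed & ~mask)
        | (static_cast<unsigned int>(static_cast<uint8_t>(current + val)) << shift);
    old = atomicCAS(base, assumed, updated);
  } while (assumed != old);
}

Under contention the loop retries until the word is published consistently, which is the "more effort" the in-code comment mentions compared to the float path.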
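Reviewer note on the reworked float kernel: the patch replaces the one-thread-per-output-bin reduction with a blocked one, so the grid now covers nsamps * nchans / naccumulate * nb work items instead of nsamps * nchans / naccumulate, which exposes more parallelism when naccumulate is large relative to the number of output bins. A sketch of how a launch might look; the wrapper function, the thread count of 1024, and the fully qualified namespace (inferred from the "} // namespace kernels" comment) are assumptions, and the real launch parameters live in the surrounding DetectorAccumulator class.

#include <cuda_runtime.h>
#include "psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh"

// Hypothetical wrapper, for illustration only.
void launch_detect_and_accumulate(float2 const* d_in, float* d_out,
    int nchans, int nsamps, int naccumulate, float scale, float offset,
    int stride, int out_offset, cudaStream_t stream)
{
  const int block = 1024;
  // One work item per (partial block, output spectrum, channel) triple,
  // matching the kernel's grid-stride loop bound.
  const size_t nb = naccumulate / block + 1;
  const size_t work_items = static_cast<size_t>(nsamps) * nchans / naccumulate * nb;
  const int grid = static_cast<int>((work_items + block - 1) / block);

  // The kernel merges partial sums with atomicAdd, so d_out must already hold
  // valid running sums (zeros for a fresh integration) before the launch.
  // T is not deducible from the argument list, hence the explicit <float>.
  psrdada_cpp::effelsberg::edd::kernels::detect_and_accumulate<float>
      <<<grid, block, 0, stream>>>(d_in, d_out, nchans, nsamps, naccumulate,
                                   scale, offset, stride, out_offset);
}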