Commit 743c795c authored by Tobias Winchen's avatar Tobias Winchen
Browse files

Improve speed in detect_and_accumulate for low channel number and float output

parent e179b93f
...@@ -16,7 +16,7 @@ __global__ ...@@ -16,7 +16,7 @@ __global__
void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out, void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset) int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
{ {
// grid stride loop over output array to keep // grid stride loop over output array to keep
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x) for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
{ {
double sum = 0.0f; double sum = 0.0f;
...@@ -30,36 +30,52 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o ...@@ -30,36 +30,52 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
double y = tmp.y * tmp.y; double y = tmp.y * tmp.y;
sum += x + y; sum += x + y;
} }
size_t toff = out_offset * nchans + currentOutputSpectra * nchans *stride; size_t toff = out_offset * nchans + currentOutputSpectra * nchans *stride;
out[toff + i] += (int8_t) ((sum - offset)/scale); out[toff + i] += (int8_t) ((sum - offset)/scale);
// no atomic add for int8, thus no optimized version here. Atomic add can be
// implemented using an int32 atomicAdd and bit shifting, but this needs more effort.
} }
} }
// Detect-and-accumulate kernel, float-output overload.
//
// Computes the detected power |x|^2 + |y|^2 of complex voltage samples and
// accumulates `naccumulate` consecutive input spectra into one output
// spectrum. Unlike the int8_t overload, the accumulation over naccumulate
// samples is split into `nb` chunks of `blockDim.x` samples each; different
// grid-stride iterations handle different chunks and combine their partial
// sums via atomicAdd on the float output. This exposes more parallelism when
// nchans is small (the commit message: "Improve speed ... for low channel
// number and float output").
//
// NOTE(review): the template parameter T is not used in the visible
// signature — presumably kept for call-site symmetry with other overloads;
// confirm against the callers.
template <typename T>
__global__
void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
    int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
{
    // Number of blockDim.x-sized chunks covering naccumulate. Note this is
    // `naccumulate / blockDim.x + 1`, i.e. one extra (empty) chunk when
    // naccumulate is an exact multiple of blockDim.x; the `j >= naccumulate`
    // break below keeps that harmless.
    const int nb = naccumulate / blockDim.x + 1;
    const int bs = blockDim.x;  // chunk size (samples per chunk)
    const int number_of_spectra = nsamps /( nchans * naccumulate);
    // Grid-stride loop over (output elements) x (nb chunks).
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate * nb); i += blockDim.x * gridDim.x)
    {
        const size_t bn = i / nchans / number_of_spectra;  // chunk index of this iteration
        // NOTE(review): for bn > 0 this exceeds number_of_spectra - 1; it
        // looks like `(i / nchans) % number_of_spectra` was intended here —
        // confirm against the output-buffer size.
        const size_t currentOutputSpectra = i / nchans;
        const size_t currentChannel = i % nchans;
        double sum = 0;
        // Accumulate this thread's chunk [bn*bs, bn*bs + bs) of the
        // naccumulate input spectra; sum in double to limit rounding error.
        for (size_t k = 0; k < bs; k++)
        {
            size_t j = k + bn * bs;
            if (j >= naccumulate)
                break;
            float2 tmp = in[ j * nchans + currentOutputSpectra * nchans * naccumulate + currentChannel];
            double x = tmp.x * tmp.x;
            double y = tmp.y * tmp.y;
            sum += x + y;
        }
        size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
        // Partial sums from all nb chunks of one output bin land in the same
        // address, hence the float atomicAdd. NOTE(review): `offset` is
        // subtracted once per chunk, i.e. up to nb times per output value
        // (the int8_t overload subtracts it once) — confirm this is intended.
        atomicAdd(&out[toff + currentChannel], ((sum - offset)/scale));
    }
}
} // namespace kernels } // namespace kernels
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment