Commit 743c795c authored by Tobias Winchen

Improve speed in detect_and_accumulate for low channel numbers and float output

For few channels the grid-stride loop over output elements exposes too little
parallelism to fill the GPU. For float output the accumulation per output value
is therefore split into chunks of blockDim.x input samples, each summed by its
own thread, and the partial results are combined with atomicAdd.

parent e179b93f
@@ -16,7 +16,7 @@ __global__
void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
{
    // grid stride loop over output array to keep all threads busy
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
{
        double sum = 0.0;
@@ -30,36 +30,52 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out,
double y = tmp.y * tmp.y;
sum += x + y;
}
        size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
out[toff + i] += (int8_t) ((sum - offset)/scale);
        // There is no atomicAdd for int8, thus no optimized version here. An atomic
        // add can be implemented using an int32 atomicAdd and bit shifting, but this
        // needs more effort.
}
}
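
// Aside (not part of this commit): a sketch of how the int8 atomicAdd mentioned
// above could be emulated with a 32-bit atomicCAS on the aligned word containing
// the target byte, assuming the output buffer is at least 4-byte aligned.
// Untested illustration only.
__device__ void atomicAddInt8(int8_t* address, int8_t val)
{
    // pointer to the 32-bit word containing the target byte
    unsigned int* base = (unsigned int*)((size_t)address & ~(size_t)3);
    const unsigned int shift = ((size_t)address & 3) * 8;
    unsigned int old = *base;
    unsigned int assumed;
    do {
        assumed = old;
        // extract the current byte, add, and splice the result back into the word
        int8_t current = (int8_t)((assumed >> shift) & 0xffu);
        unsigned int updated = (assumed & ~(0xffu << shift))
                             | (((unsigned int)(uint8_t)(current + val)) << shift);
        old = atomicCAS(base, assumed, updated);
    } while (assumed != old);
}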
template <typename T>
__global__
void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
{
    const int bs = blockDim.x;
    // ceil(naccumulate / bs) partial sums per output value
    const int nb = (naccumulate + bs - 1) / bs;
    const int number_of_spectra = nsamps / naccumulate;

    // grid stride loop over output array; the accumulation for every output
    // value is split into nb chunks of bs input samples, each summed by its
    // own thread, so that the GPU stays busy even for few channels.
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate * nb); i += blockDim.x * gridDim.x)
    {
        const size_t bn = i / nchans / number_of_spectra;   // chunk index in [0, nb)
        const size_t currentOutputSpectra = (i / nchans) % number_of_spectra;
        const size_t currentChannel = i % nchans;

        double sum = 0.0;
        for (size_t k = 0; k < bs; k++)
        {
            size_t j = k + bn * bs;
            if (j >= naccumulate)
                break;
            float2 tmp = in[j * nchans + currentOutputSpectra * nchans * naccumulate + currentChannel];
            double x = tmp.x * tmp.x;
            double y = tmp.y * tmp.y;
            sum += x + y;
        }
        size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
        // combine the partial sums with a float atomicAdd; note that offset is
        // subtracted from every partial sum, i.e. nb times per output value
        atomicAdd(&out[toff + currentChannel], ((sum - offset) / scale));
    }
}
}
} // namespace kernels
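
For reference, a minimal host-side sketch of how the optimized float-output kernel might be launched (the wrapper name, launch configuration, and namespace qualification are illustrative assumptions, not part of this commit). Since every output value now receives nb atomicAdd contributions, the output buffer must already hold the intended starting values, e.g. zeros for a fresh accumulation:

#include <cuda_runtime.h>

void launch_detect_and_accumulate_float(float2 const* d_in, float* d_out,
        int nchans, int nsamps, int naccumulate,
        float scale, float offset, int stride, int out_offset,
        cudaStream_t stream)
{
    const int blockSize = 256;  // bs in the kernel; also the chunk length
    const int gridSize = 128;   // grid-stride loop covers any remaining work
    kernels::detect_and_accumulate<float><<<gridSize, blockSize, 0, stream>>>(
        d_in, d_out, nchans, nsamps, naccumulate,
        scale, offset, stride, out_offset);
}

A fresh accumulation would be preceded by, e.g., cudaMemsetAsync(d_out, 0, output_bytes, stream) on the same stream.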