Commit de4c3c1c authored by Tobias Winchen's avatar Tobias Winchen
Browse files

Merge branch 'detect_accumualte_fix' into 'devel'

Detect accumulate fix

See merge request !14
parents c673c39a 6ab413cc
Pipeline #104503 passed with stages
in 12 minutes and 12 seconds
......@@ -38,38 +38,59 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
}
/**
* @brief Integrates output power from complex voltage input.
*
* @param in Pointer to input data of size nsamps.
* @param out Pointer to output data of size nsamps / (naccumualte * nchans) * nchans
* @param nchans Number of channels
* @param naccumulate Number of input spectra to integrate into one output spectrum
* @param scale Normalization constant for the utput spectrum
* @param offset Output is shifted by this value
* @param stride Stride of the output spectra. For 2, leave a gap of
* nchan between every output spetrum, for 1 no gap.
* @param out_offset
*
*
*/
template <typename T>
__global__
void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
size_t nchans, size_t nsamps, size_t naccumulate, float scale, float offset, int stride, int out_offset)
{
const int nb = naccumulate / blockDim.x + 1;
const int bs = blockDim.x;
const int number_of_spectra = nsamps /( nchans * naccumulate);
const int number_of_spectra = nsamps / (nchans * naccumulate);
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate * nb); i += blockDim.x * gridDim.x)
{
const size_t bn = i / nchans / number_of_spectra;
const size_t currentOutputSpectra = i / nchans;
const size_t currentChannel = i % nchans;
for (size_t current_spectrum =0; current_spectrum < number_of_spectra; current_spectrum++)
{ // I could just launch more blocks in a second dimension
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nchans * nb); i += blockDim.x * gridDim.x)
{
const size_t channel_number = i % nchans;
const size_t bn = i / nchans;
double sum = 0;
for (size_t k = 0; k < bs; k++)
{
size_t j = k + bn * bs;
if (j >= naccumulate)
break;
double sum = 0;
for (size_t k = 0; k < bs; k++)
{
int cidx = k + bn * bs;
float2 tmp = in[ j * nchans + currentOutputSpectra * nchans * naccumulate + currentChannel];
double x = tmp.x * tmp.x;
double y = tmp.y * tmp.y;
sum += x + y;
}
size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
if (cidx >= naccumulate)
break;
atomicAdd(&out[toff + currentChannel], ((sum - offset)/scale));
const float2 tmp = in[channel_number + cidx * nchans + current_spectrum * naccumulate * nchans];
double x = tmp.x * tmp.x;
double y = tmp.y * tmp.y;
sum += x + y;
}
size_t toff = out_offset * nchans + current_spectrum * nchans * stride;
atomicAdd(&out[toff + channel_number], ((sum - offset)/scale));
}
}
}
......
......@@ -99,6 +99,112 @@ TEST_F(DetectorAccumulatorTester, noise_test)
compare_against_host(gpu_output, host_output);
}
// parameters for the detector accumulator test
struct detect_accumulate_params
{
size_t nchan; // umber of channels
size_t nspectra; // number of output spectra
size_t naccumulate; // number of spectra to accumulate
};
class detect_and_accumulate32bit: public testing::TestWithParam<detect_accumulate_params> {
// Test that the kernel executes in certain parameter
// settings
};
TEST_P(detect_and_accumulate32bit, no_stride)
{
thrust::device_vector<float2> input;
thrust::device_vector<float> output;
detect_accumulate_params params = GetParam();
input.resize(params.nspectra * params.nchan * params.naccumulate);
output.resize(params.nspectra * params.nchan);
float2 v;
v.x = 1.0;
v.y = 1.0;
thrust::fill(input.begin(), input.end(), v);
thrust::fill(output.begin(), output.end(), 0.);
kernels::detect_and_accumulate<float> <<<1024, 1024>>>(
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),
params.nchan,
input.size(),
params.naccumulate,
1, 0., 1, 0);
cudaDeviceSynchronize();
thrust::host_vector<float> output_host = output;
for (size_t i =0; i < output.size(); i++)
{
ASSERT_FLOAT_EQ(output_host[i], (v.x * v.x + v.y * v.y) * params.naccumulate) << "i = " << i << " for nchan = " << params.nchan << ", nspectra = " << params.nspectra << ", naccumulate = " << params.naccumulate;
}
}
TEST_P(detect_and_accumulate32bit, w_stride)
{
thrust::device_vector<float2> input;
thrust::device_vector<float> output;
detect_accumulate_params params = GetParam();
input.resize(params.nspectra * params.nchan * params.naccumulate);
output.resize(params.nspectra * params.nchan * 2);
float2 v;
v.x = 1.0;
v.y = 1.0;
thrust::fill(input.begin(), input.end(), v);
thrust::fill(output.begin(), output.end(), 0.);
kernels::detect_and_accumulate<float> <<<1024, 1024>>>(
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),
params.nchan,
input.size(),
params.naccumulate,
1, 0., 2, 0);
cudaDeviceSynchronize();
thrust::host_vector<float> output_host = output;
for (size_t i =0; i < params.nchan * params.nspectra; i++)
{
size_t current_spectrum = i / params.nchan;
ASSERT_FLOAT_EQ(output_host[i + current_spectrum * params.nchan], (v.x * v.x + v.y * v.y) * params.naccumulate) << "i = " << i << " for nchan = " << params.nchan << ", nspectra = " << params.nspectra << ", naccumulate = " << params.naccumulate;
}
}
INSTANTIATE_TEST_CASE_P (DetectorAccumulatorTester,
detect_and_accumulate32bit,
testing::Values(
// nchan; nspectra; naccumulate;
detect_accumulate_params({1024, 1, 128}),
detect_accumulate_params({1024, 1, 1024}),
detect_accumulate_params({1024, 1, 2048}),
detect_accumulate_params({1024, 1, 16384}),
detect_accumulate_params({1024, 16, 1024}),
detect_accumulate_params({8388608, 2, 16}),
detect_accumulate_params({8388608, 1, 32})
)
);
} //namespace test
} //namespace edd
} //namespace meerkat
......
......@@ -146,6 +146,7 @@ TEST(GatedSpectrometer, stokes_accumulate)
testStokesAccumulateParam(8 * 1024 * 1024 + 1, 32);
testStokesAccumulateParam(1024 + 1, 5);
testStokesAccumulateParam(1024 + 1, 64);
testStokesAccumulateParam(1024, 65536);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment