diff --git a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh index 8393c6815144278fde4250c08b0c68b0b940044f..985bfaffb03508e56ba58399e18694cc1ee9ac24 100644 --- a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh +++ b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh @@ -38,38 +38,59 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o } + +/** + * @brief Integrates output power from complex voltage input. + * + * @param in Pointer to input data of size nsamps. + * @param out Pointer to output data of size nsamps / (naccumualte * nchans) * nchans + * @param nchans Number of channels + * @param naccumulate Number of input spectra to integrate into one output spectrum + * @param scale Normalization constant for the utput spectrum + * @param offset Output is shifted by this value + * @param stride Stride of the output spectra. For 2, leave a gap of + * nchan between every output spetrum, for 1 no gap. + * @param out_offset + * + * + */ template <typename T> __global__ void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out, - int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset) + size_t nchans, size_t nsamps, size_t naccumulate, float scale, float offset, int stride, int out_offset) { const int nb = naccumulate / blockDim.x + 1; const int bs = blockDim.x; - const int number_of_spectra = nsamps /( nchans * naccumulate); + const int number_of_spectra = nsamps / (nchans * naccumulate); - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate * nb); i += blockDim.x * gridDim.x) - { - const size_t bn = i / nchans / number_of_spectra; - const size_t currentOutputSpectra = i / nchans; - const size_t currentChannel = i % nchans; + for (size_t current_spectrum =0; current_spectrum < number_of_spectra; current_spectrum++) + { // I could just launch more blocks in a second dimension + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nchans * nb); i += blockDim.x * gridDim.x) + { + const size_t channel_number = i % nchans; + const size_t bn = i / nchans; - double sum = 0; - for (size_t k = 0; k < bs; k++) - { - size_t j = k + bn * bs; - if (j >= naccumulate) - break; + double sum = 0; + for (size_t k = 0; k < bs; k++) + { + int cidx = k + bn * bs; - float2 tmp = in[ j * nchans + currentOutputSpectra * nchans * naccumulate + currentChannel]; - double x = tmp.x * tmp.x; - double y = tmp.y * tmp.y; - sum += x + y; - } - size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride; + if (cidx >= naccumulate) + break; - atomicAdd(&out[toff + currentChannel], ((sum - offset)/scale)); + const float2 tmp = in[channel_number + cidx * nchans + current_spectrum * naccumulate * nchans]; + + double x = tmp.x * tmp.x; + double y = tmp.y * tmp.y; + sum += x + y; + } + size_t toff = out_offset * nchans + current_spectrum * nchans * stride; + + atomicAdd(&out[toff + channel_number], ((sum - offset)/scale)); + } } + } diff --git a/psrdada_cpp/effelsberg/edd/test/src/DetectorAccumulatorTester.cu b/psrdada_cpp/effelsberg/edd/test/src/DetectorAccumulatorTester.cu index fdcacff20a361028cb758a81326e0bf79d33ba02..7fbd556e63c45e41fc4da0b75d715c5cdc4d929e 100644 --- a/psrdada_cpp/effelsberg/edd/test/src/DetectorAccumulatorTester.cu +++ b/psrdada_cpp/effelsberg/edd/test/src/DetectorAccumulatorTester.cu @@ -99,6 +99,112 @@ TEST_F(DetectorAccumulatorTester, noise_test) compare_against_host(gpu_output, host_output); } + +// parameters for the detector accumulator test +struct detect_accumulate_params +{ + size_t nchan; // umber of channels + size_t nspectra; // number of output spectra + size_t naccumulate; // number of spectra to accumulate +}; + + +class detect_and_accumulate32bit: public testing::TestWithParam<detect_accumulate_params> { +// Test that the kernel executes in certain parameter +// settings +}; + + +TEST_P(detect_and_accumulate32bit, no_stride) +{ + + thrust::device_vector<float2> input; + thrust::device_vector<float> output; + + detect_accumulate_params params = GetParam(); + + input.resize(params.nspectra * params.nchan * params.naccumulate); + output.resize(params.nspectra * params.nchan); + float2 v; + v.x = 1.0; + v.y = 1.0; + thrust::fill(input.begin(), input.end(), v); + thrust::fill(output.begin(), output.end(), 0.); + + kernels::detect_and_accumulate<float> <<<1024, 1024>>>( + thrust::raw_pointer_cast(input.data()), + thrust::raw_pointer_cast(output.data()), + params.nchan, + input.size(), + params.naccumulate, + 1, 0., 1, 0); + + cudaDeviceSynchronize(); + + thrust::host_vector<float> output_host = output; + + for (size_t i =0; i < output.size(); i++) + { + ASSERT_FLOAT_EQ(output_host[i], (v.x * v.x + v.y * v.y) * params.naccumulate) << "i = " << i << " for nchan = " << params.nchan << ", nspectra = " << params.nspectra << ", naccumulate = " << params.naccumulate; + } +} + +TEST_P(detect_and_accumulate32bit, w_stride) +{ + + thrust::device_vector<float2> input; + thrust::device_vector<float> output; + + detect_accumulate_params params = GetParam(); + + input.resize(params.nspectra * params.nchan * params.naccumulate); + output.resize(params.nspectra * params.nchan * 2); + float2 v; + v.x = 1.0; + v.y = 1.0; + thrust::fill(input.begin(), input.end(), v); + thrust::fill(output.begin(), output.end(), 0.); + + kernels::detect_and_accumulate<float> <<<1024, 1024>>>( + thrust::raw_pointer_cast(input.data()), + thrust::raw_pointer_cast(output.data()), + params.nchan, + input.size(), + params.naccumulate, + 1, 0., 2, 0); + + cudaDeviceSynchronize(); + + thrust::host_vector<float> output_host = output; + + for (size_t i =0; i < params.nchan * params.nspectra; i++) + { + size_t current_spectrum = i / params.nchan; + ASSERT_FLOAT_EQ(output_host[i + current_spectrum * params.nchan], (v.x * v.x + v.y * v.y) * params.naccumulate) << "i = " << i << " for nchan = " << params.nchan << ", nspectra = " << params.nspectra << ", naccumulate = " << params.naccumulate; + } +} + + + + + + +INSTANTIATE_TEST_CASE_P (DetectorAccumulatorTester, + detect_and_accumulate32bit, + testing::Values( + // nchan; nspectra; naccumulate; + detect_accumulate_params({1024, 1, 128}), + detect_accumulate_params({1024, 1, 1024}), + detect_accumulate_params({1024, 1, 2048}), + detect_accumulate_params({1024, 1, 16384}), + detect_accumulate_params({1024, 16, 1024}), + detect_accumulate_params({8388608, 2, 16}), + detect_accumulate_params({8388608, 1, 32}) + ) + ); + + + } //namespace test } //namespace edd } //namespace meerkat diff --git a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu index 52b6f0d5e4b700041fc2852cc04eb85b1d31b563..db68c4ece82b084c51dc55c16f1cba7ab18268e9 100644 --- a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu +++ b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu @@ -146,6 +146,7 @@ TEST(GatedSpectrometer, stokes_accumulate) testStokesAccumulateParam(8 * 1024 * 1024 + 1, 32); testStokesAccumulateParam(1024 + 1, 5); testStokesAccumulateParam(1024 + 1, 64); + testStokesAccumulateParam(1024, 65536); }