From d22f63f563050aca96c3b7239f1486d51de11a81 Mon Sep 17 00:00:00 2001 From: Tobias Winchen <tobias.winchen@rwth-aachen.de> Date: Fri, 8 May 2020 11:50:34 +0200 Subject: [PATCH] Fixed mem transfer compute overlap --- .../effelsberg/edd/GatedSpectrometer.cuh | 10 ++- .../edd/detail/GatedSpectrometer.cu | 77 ++++++++++++++----- .../edd/test/src/GatedSpectrometerTest.cu | 35 ++++++--- 3 files changed, 88 insertions(+), 34 deletions(-) diff --git a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh index 847106d0..02a64e60 100644 --- a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh +++ b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh @@ -46,6 +46,12 @@ struct PolarizationData thrust::device_vector<UnpackedVoltageType> _baseLineG0; /// Baseline in gate 1 state thrust::device_vector<UnpackedVoltageType> _baseLineG1; + + /// Baseline in gate 0 state after update + thrust::device_vector<UnpackedVoltageType> _baseLineG0_update; + /// Baseline in gate 1 state after update + thrust::device_vector<UnpackedVoltageType> _baseLineG1_update; + /// Channelized voltage in gate 0 state thrust::device_vector<ChannelisedVoltageType> _channelised_voltage_G0; /// Channelized voltage in gate 1 state @@ -241,8 +247,8 @@ private: __global__ void gating(float *G0, float *G1, const int64_t *sideChannelData, size_t N, size_t heapSize, size_t bitpos, size_t noOfSideChannels, size_t selectedSideChannel, - const float baseLineG0, - const float baseLineG1, + const float* __restrict__ _baseLineG0, + const float* __restrict__ _baseLineG1, float* __restrict__ baseLineNG0, float* __restrict__ baseLineNG1, uint64_cu* stats_G0, diff --git a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu index 23e1647b..679b7fc2 100644 --- a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu +++ b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu @@ -51,8 +51,8 @@ __global__ void gating(float* __restrict__ G0, const uint64_t* __restrict__ sideChannelData, size_t N, size_t heapSize, size_t bitpos, size_t noOfSideChannels, size_t selectedSideChannel, - const float baseLineG0, - const float baseLineG1, + const float* __restrict__ _baseLineG0, + const float* __restrict__ _baseLineG1, float* __restrict__ baseLineNG0, float* __restrict__ baseLineNG1, uint64_cu* stats_G0, uint64_cu* stats_G1) { @@ -60,6 +60,9 @@ __global__ void gating(float* __restrict__ G0, uint32_t _G0stats = 0; uint32_t _G1stats = 0; + const float baseLineG0 = _baseLineG0[0]; + const float baseLineG1 = _baseLineG1[0]; + float baselineUpdateG0 = 0; float baselineUpdateG1 = 0; @@ -115,6 +118,36 @@ __global__ void gating(float* __restrict__ G0, +// Updates the baselines of the gates for the polarization set for the next +// block +// only few output blocks per input block thus execution on only one thread. +// Important is that the execution is async on the GPU. +__global__ void update_baselines(float* __restrict__ baseLineG0, + float* __restrict__ baseLineG1, + float* __restrict__ baseLineNG0, + float* __restrict__ baseLineNG1, + uint64_cu* stats_G0, uint64_cu* stats_G1, + size_t N) +{ + size_t NG0 = 0; + size_t NG1 = 0; + + for (size_t i =0; i < N; i++) + { + NG0 += stats_G0[i]; + NG1 += stats_G1[i]; + } + + baseLineG0[0] = baseLineNG0[0] / NG0; + baseLineG1[0] = baseLineNG1[0] / NG1; + baseLineNG0[0] = 0; + baseLineNG1[0] = 0; +} + + + + + template <class HandlerType> GatedSpectrometer<HandlerType>::GatedSpectrometer( const DadaBufferLayout &dadaBufferLayout, @@ -194,10 +227,14 @@ GatedSpectrometer<HandlerType>::GatedSpectrometer( _unpacked_voltage_G0.resize(_nsamps_per_buffer); _unpacked_voltage_G1.resize(_nsamps_per_buffer); - polarization0._baseLineG0.resize(1); - polarization0._baseLineG1.resize(1); - polarization1._baseLineG0.resize(1); - polarization1._baseLineG1.resize(1); + polarization0._baseLineG0.resize(1); + polarization0._baseLineG0_update.resize(1); + polarization0._baseLineG1.resize(1); + polarization0._baseLineG1_update.resize(1); + polarization1._baseLineG0.resize(1); + polarization1._baseLineG0_update.resize(1); + polarization1._baseLineG1.resize(1); + polarization1._baseLineG1_update.resize(1); BOOST_LOG_TRIVIAL(debug) << " Unpacked voltages size (in samples): " << _unpacked_voltage_G0.size(); @@ -286,14 +323,7 @@ void GatedSpectrometer<HandlerType>::gated_fft( throw std::runtime_error("Unsupported number of bits"); } - // Get baseline from previous block - float previous_baseLineG0 = data._baseLineG0[0]; - float previous_baseLineG1 = data._baseLineG1[0]; - - uint64_t NG0 = 0; - uint64_t NG1 = 0; - -// Loop over outputblocks, for case of multiple output blocks per input block + // Loop over outputblocks, for case of multiple output blocks per input block int step = data._sideChannelData.b().size() / _noOfBitSetsIn_G0.size(); for (size_t i = 0; i < _noOfBitSetsIn_G0.size(); i++) @@ -307,19 +337,26 @@ void GatedSpectrometer<HandlerType>::gated_fft( _selectedBit, _dadaBufferLayout.getNSideChannels(), _selectedSideChannel, - previous_baseLineG0, previous_baseLineG1, thrust::raw_pointer_cast(data._baseLineG0.data()), thrust::raw_pointer_cast(data._baseLineG1.data()), + thrust::raw_pointer_cast(data._baseLineG0_update.data()), + thrust::raw_pointer_cast(data._baseLineG1_update.data()), thrust::raw_pointer_cast(_noOfBitSetsIn_G0.data() + i), thrust::raw_pointer_cast(_noOfBitSetsIn_G1.data() + i) ); - NG0 += _noOfBitSetsIn_G0[i]; - NG1 += _noOfBitSetsIn_G1[i]; } - data._baseLineG0[0] /= NG0; - data._baseLineG1[0] /= NG1; - BOOST_LOG_TRIVIAL(debug) << "Updating Baselines\n G0: " << previous_baseLineG0 << " -> " << data._baseLineG0[0] << ", " << previous_baseLineG1 << " -> " << data._baseLineG1[0] ; + // only few output blocks per input block thus execution on only one thread. + // Important is that the execution is async on the GPU. + update_baselines<<<1,1,0, _proc_stream>>>( + thrust::raw_pointer_cast(data._baseLineG0.data()), + thrust::raw_pointer_cast(data._baseLineG1.data()), + thrust::raw_pointer_cast(data._baseLineG0_update.data()), + thrust::raw_pointer_cast(data._baseLineG1_update.data()), + thrust::raw_pointer_cast(_noOfBitSetsIn_G0.data()), + thrust::raw_pointer_cast(_noOfBitSetsIn_G1.data()), + _noOfBitSetsIn_G0.size() + ); BOOST_LOG_TRIVIAL(debug) << "Performing FFT 1"; UnpackedVoltageType *_unpacked_voltage_ptr = diff --git a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu index 18aa9fea..495375dd 100644 --- a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu +++ b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu @@ -152,6 +152,8 @@ TEST(GatedSpectrometer, GatingKernel) thrust::device_vector<float> baseLineG0(1); thrust::device_vector<float> baseLineG1(1); + thrust::device_vector<float> baseLineG0_update(1); + thrust::device_vector<float> baseLineG1_update(1); thrust::fill(G0.begin(), G0.end(), 42); thrust::fill(G1.begin(), G1.end(), 23); thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 0); @@ -160,18 +162,22 @@ TEST(GatedSpectrometer, GatingKernel) { thrust::fill(_nG0.begin(), _nG0.end(), 0); thrust::fill(_nG1.begin(), _nG1.end(), 0); - baseLineG0[0] = 0.; - baseLineG1[0] = 0.; + baseLineG0[0] = -3; + baseLineG1[0] = -4; + baseLineG0_update[0] = 0; + baseLineG1_update[0] = 0; + const uint64_t *sideCD = (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data())); psrdada_cpp::effelsberg::edd::gating<<<1024 , 1024>>>( thrust::raw_pointer_cast(G0.data()), thrust::raw_pointer_cast(G1.data()), sideCD, - G0.size(), G0.size(), 0, 1, + G0.size(), blockSize, 0, 1, 0, - -3., -4, thrust::raw_pointer_cast(baseLineG0.data()), thrust::raw_pointer_cast(baseLineG1.data()), + thrust::raw_pointer_cast(baseLineG0_update.data()), + thrust::raw_pointer_cast(baseLineG1_update.data()), thrust::raw_pointer_cast(_nG0.data()), thrust::raw_pointer_cast(_nG1.data()) ); @@ -188,27 +194,31 @@ TEST(GatedSpectrometer, GatingKernel) EXPECT_EQ(_nG0[0], G0.size()); EXPECT_EQ(_nG1[0], 0u); - EXPECT_FLOAT_EQ(baseLineG0[0] / (_nG0[0] + 1E-127), 42.f); - EXPECT_FLOAT_EQ(baseLineG1[0] / (_nG1[0] + 1E-127), 0.f); + EXPECT_FLOAT_EQ(42.f, baseLineG0_update[0] / (_nG0[0] + 1E-121)); + EXPECT_FLOAT_EQ(0.f, baseLineG1_update[0] / (_nG1[0] + 1E-121)); } // everything to G1 // with baseline -5 { thrust::fill(_nG0.begin(), _nG0.end(), 0); thrust::fill(_nG1.begin(), _nG1.end(), 0); - baseLineG0[0] = 0.; - baseLineG1[0] = 0.; + baseLineG0[0] = 5.; + baseLineG1[0] = -2; + baseLineG0_update[0] = 0; + baseLineG1_update[0] = 0; + thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 1L); const uint64_t *sideCD = (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data())); psrdada_cpp::effelsberg::edd::gating<<<1024, 1024>>>( thrust::raw_pointer_cast(G0.data()), thrust::raw_pointer_cast(G1.data()), sideCD, - G0.size(), G0.size(), 0, 1, + G0.size(), blockSize, 0, 1, 0, - 5., -2., thrust::raw_pointer_cast(baseLineG0.data()), thrust::raw_pointer_cast(baseLineG1.data()), + thrust::raw_pointer_cast(baseLineG0_update.data()), + thrust::raw_pointer_cast(baseLineG1_update.data()), thrust::raw_pointer_cast(_nG0.data()), thrust::raw_pointer_cast(_nG1.data()) ); @@ -223,8 +233,9 @@ TEST(GatedSpectrometer, GatingKernel) EXPECT_EQ(_nG0[0], 0u); EXPECT_EQ(_nG1[0], G1.size()); - EXPECT_FLOAT_EQ(baseLineG0[0] / (_nG0[0] + 1E-127), 0.); - EXPECT_FLOAT_EQ(baseLineG1[0] / (_nG1[0] + 1E-127), 42.); + + EXPECT_FLOAT_EQ(0.f, baseLineG0_update[0] / (_nG0[0] + 1E-121)); + EXPECT_FLOAT_EQ(42.f, baseLineG1_update[0] / (_nG1[0] + 1E-121)); } } -- GitLab