From d0ef28a2b174534e998bfd37385c3e876493601a Mon Sep 17 00:00:00 2001 From: Tobias Winchen <tobias.winchen@rwth-aachen.de> Date: Fri, 17 May 2019 12:58:02 +0000 Subject: [PATCH] Count number of samples per spectra instead og ehaps --- .../effelsberg/edd/GatedSpectrometer.cuh | 5 +- .../edd/detail/GatedSpectrometer.cu | 50 +++++++++++-------- .../edd/test/src/GatedSpectrometerTest.cu | 28 +++++------ 3 files changed, 45 insertions(+), 38 deletions(-) diff --git a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh index 145bce51..44b3ffb4 100644 --- a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh +++ b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh @@ -89,7 +89,7 @@ public: private: void process(thrust::device_vector<RawVoltageType> const &digitiser_raw, - thrust::device_vector<int64_t> const &sideChannelData, + thrust::device_vector<uint64_t> const &sideChannelData, thrust::device_vector<IntegratedPowerType> &detected, thrust::device_vector<size_t> &noOfBitSet); @@ -103,6 +103,7 @@ private: std::size_t _batch; std::size_t _nsamps_per_output_spectra; std::size_t _nsamps_per_buffer; + std::size_t _nsamps_per_heap; HandlerType &_handler; cufftHandle _fft_plan; @@ -113,7 +114,7 @@ private: // Input data DoubleDeviceBuffer<RawVoltageType> _raw_voltage_db; - DoubleDeviceBuffer<int64_t> _sideChannelData_db; + DoubleDeviceBuffer<uint64_t> _sideChannelData_db; // Output data DoubleDeviceBuffer<IntegratedPowerType> _power_db; diff --git a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu index 0cfb1af5..4a4e0d51 100644 --- a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu +++ b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu @@ -8,6 +8,7 @@ #include <thrust/system/cuda/execution_policy.h> #include <iostream> +#include <iomanip> #include <cstring> #include <sstream> @@ -15,14 +16,14 @@ namespace psrdada_cpp { namespace effelsberg { namespace edd { -__global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const int64_t* __restrict__ sideChannelData, +__global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const uint64_t* __restrict__ sideChannelData, size_t N, size_t heapSize, size_t bitpos, size_t noOfSideChannels, size_t selectedSideChannel, const float* __restrict__ _baseLineN) { float baseLine = (*_baseLineN) / N; for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < N); i += blockDim.x * gridDim.x) { const float w = G0[i] - baseLine; - const int64_t sideChannelItem = + const uint64_t sideChannelItem = sideChannelData[((i / heapSize) * (noOfSideChannels)) + selectedSideChannel]; // Probably not optimal access as // same data is copied for several @@ -36,23 +37,22 @@ __global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const int } -__global__ void countBitSet(const int64_t *sideChannelData, size_t N, size_t +__global__ void countBitSet(const uint64_t *sideChannelData, size_t N, size_t bitpos, size_t noOfSideChannels, size_t selectedSideChannel, size_t *nBitsSet) { // really not optimized reduction, but here only trivial array sizes. - int i = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ unsigned int x[256]; + // run only in one block! + __shared__ size_t x[1024]; + size_t ls = 0; - if (i == 0) - nBitsSet[0] = 0; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < N); + i += blockDim.x * gridDim.x) { + ls += TEST_BIT(sideChannelData[i * noOfSideChannels + selectedSideChannel], bitpos); + } + x[threadIdx.x] = ls; - if (i * noOfSideChannels + selectedSideChannel < N) - x[threadIdx.x] = TEST_BIT(sideChannelData[i * noOfSideChannels + selectedSideChannel], bitpos); - else - x[threadIdx.x] = 0; __syncthreads(); - for(int s = blockDim.x / 2; s > 0; s = s / 2) { if (threadIdx.x < s) @@ -111,7 +111,7 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer( _selectedSideChannel(selectedSideChannel), _selectedBit(selectedBit), _fft_length(fft_length), _naccumulate(naccumulate), _nbits(nbits), _handler(handler), _fft_plan(0), - _call_count(0) { + _call_count(0), _nsamps_per_heap(4096) { // Sanity checks assert(((_nbits == 12) || (_nbits == 8))); @@ -191,8 +191,9 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer( BOOST_LOG_TRIVIAL(debug) << " Powers size: " << _power_db.size() / 2; _noOfBitSetsInSideChannel.resize( batch / (_naccumulate / nBlocks)); - thrust::fill(_noOfBitSetsInSideChannel.a().begin(), _noOfBitSetsInSideChannel.a().end(), 0.); - thrust::fill(_noOfBitSetsInSideChannel.b().begin(), _noOfBitSetsInSideChannel.b().end(), 0.); + thrust::fill(_noOfBitSetsInSideChannel.a().begin(), _noOfBitSetsInSideChannel.a().end(), 0L); + thrust::fill(_noOfBitSetsInSideChannel.b().begin(), _noOfBitSetsInSideChannel.b().end(), 0L); + BOOST_LOG_TRIVIAL(debug) << " Bit set counrer size: " << _noOfBitSetsInSideChannel.size(); // on the host both power are stored in the same data buffer together with // the number of bit sets @@ -253,7 +254,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::init(RawBytes &block) template <class HandlerType, typename IntegratedPowerType> void GatedSpectrometer<HandlerType, IntegratedPowerType>::process( thrust::device_vector<RawVoltageType> const &digitiser_raw, - thrust::device_vector<int64_t> const &sideChannelData, + thrust::device_vector<uint64_t> const &sideChannelData, thrust::device_vector<IntegratedPowerType> &detected, thrust::device_vector<size_t> &noOfBitSet) { BOOST_LOG_TRIVIAL(debug) << "Unpacking raw voltages"; switch (_nbits) { @@ -280,11 +281,12 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process( for (size_t i = 0; i < _noOfBitSetsInSideChannel.size(); i++) { // ToDo: Should be in one kernel call - countBitSet<<<(sideChannelData.size()+255)/256, 256, 0, - _proc_stream>>>(thrust::raw_pointer_cast(sideChannelData.data() + i * sideChannelData.size() / _noOfBitSetsInSideChannel.size() ), + countBitSet<<<1, 1024, 0, _proc_stream>>>(thrust::raw_pointer_cast(sideChannelData.data() + i * sideChannelData.size() / _noOfBitSetsInSideChannel.size() ), sideChannelData.size() / _noOfBitSetsInSideChannel.size(), _selectedBit, _dadaBufferLayout.getNSideChannels(), _selectedBit, thrust::raw_pointer_cast(noOfBitSet.data() + i)); + + CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream)); } BOOST_LOG_TRIVIAL(debug) << "Performing FFT 1"; @@ -337,6 +339,8 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b static_cast<void *>(_sideChannelData_db.a_ptr()), static_cast<void *>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap()), _dadaBufferLayout.sizeOfSideChannelData(), cudaMemcpyHostToDevice, _h2d_stream)); + BOOST_LOG_TRIVIAL(debug) << "First side channel item: 0x" << std::setw(12) << std::setfill('0') << std::hex << (reinterpret_cast<uint64_t*>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap()))[0] << std::dec; + if (_call_count == 1) { return false; @@ -354,7 +358,7 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b _noOfBitSetsInSideChannel.swap(); // move to specific stream! thrust::fill(thrust::cuda::par.on(_proc_stream),_power_db.a().begin(), _power_db.a().end(), 0.); - thrust::fill(thrust::cuda::par.on(_proc_stream), _noOfBitSetsInSideChannel.a().begin(), _noOfBitSetsInSideChannel.a().end(), 0.); + thrust::fill(thrust::cuda::par.on(_proc_stream), _noOfBitSetsInSideChannel.a().begin(), _noOfBitSetsInSideChannel.a().end(), 0L); } process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db.a(), _noOfBitSetsInSideChannel.a()); @@ -379,10 +383,11 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b cudaMemcpyDeviceToHost, _d2h_stream)); // copy noOf bit set data CUDA_ERROR_CHECK( - cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 2 * _nchans ), + cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType)), static_cast<void *>(_noOfBitSetsInSideChannel.b_ptr() + i ), 1 * sizeof(size_t), cudaMemcpyDeviceToHost, _d2h_stream)); + BOOST_LOG_TRIVIAL(info) << " TOBR NOF BITS SET: " << _noOfBitSetsInSideChannel.b()[i]; } BOOST_LOG_TRIVIAL(debug) << "Copy Data back to host"; @@ -398,10 +403,11 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b size_t memOffset = 2 * i * (_nchans * sizeof(IntegratedPowerType) + sizeof(size_t)); size_t* on_values = reinterpret_cast<size_t*> (_host_power_db.b_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType)); + *on_values *= _nsamps_per_heap; size_t* off_values = reinterpret_cast<size_t*> (_host_power_db.b_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType) + sizeof(size_t)); - *off_values = _dadaBufferLayout.getNHeaps() - (*on_values); + *off_values = _nsamps_per_output_spectra - (*on_values); - BOOST_LOG_TRIVIAL(info) << " " << i << ": No of bit set in side channel: " << *on_values << " / " << *off_values << std::endl; + BOOST_LOG_TRIVIAL(info) << " " << i << ": No of samples wo/w. bit set in side channel: " << *on_values << " / " << *off_values << std::endl; } // Wrap in a RawBytes object here; diff --git a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu index 22babc11..1f50d034 100644 --- a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu +++ b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu @@ -11,7 +11,7 @@ namespace { TEST(GatedSpectrometer, BitManipulationMacros) { for (int i = 0; i < 64; i++) { - int64_t v = 0; + uint64_t v = 0; SET_BIT(v, i); for (int j = 0; j < 64; j++) { @@ -57,7 +57,7 @@ TEST(GatedSpectrometer, GatingKernel) thrust::device_vector<float> G0(blockSize * nBlocks); thrust::device_vector<float> G1(blockSize * nBlocks); - thrust::device_vector<int64_t> _sideChannelData(nBlocks); + thrust::device_vector<uint64_t> _sideChannelData(nBlocks); thrust::device_vector<float> baseLine(1); thrust::fill(G0.begin(), G0.end(), 42); @@ -67,8 +67,8 @@ TEST(GatedSpectrometer, GatingKernel) // everything to G0 { baseLine[0] = 0.; - const int64_t *sideCD = - (int64_t *)(thrust::raw_pointer_cast(_sideChannelData.data())); + const uint64_t *sideCD = + (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data())); psrdada_cpp::effelsberg::edd::gating<<<1024, 1024>>>( thrust::raw_pointer_cast(G0.data()), thrust::raw_pointer_cast(G1.data()), sideCD, @@ -88,8 +88,8 @@ TEST(GatedSpectrometer, GatingKernel) { baseLine[0] = -5. * G0.size(); thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 1L); - const int64_t *sideCD = - (int64_t *)(thrust::raw_pointer_cast(_sideChannelData.data())); + const uint64_t *sideCD = + (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data())); psrdada_cpp::effelsberg::edd::gating<<<1024, 1024>>>( thrust::raw_pointer_cast(G0.data()), thrust::raw_pointer_cast(G1.data()), sideCD, @@ -108,24 +108,24 @@ TEST(GatedSpectrometer, GatingKernel) TEST(GatedSpectrometer, countBitSet) { - size_t nBlocks = 16; - thrust::device_vector<int64_t> _sideChannelData(nBlocks); + size_t nBlocks = 100000; + thrust::device_vector<uint64_t> _sideChannelData(nBlocks); thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 0); - const int64_t *sideCD = - (int64_t *)(thrust::raw_pointer_cast(_sideChannelData.data())); + const uint64_t *sideCD = + (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data())); thrust::device_vector<size_t> res(1); // test 1 side channel res[0] = 0; psrdada_cpp::effelsberg::edd:: - countBitSet<<<(_sideChannelData.size() + 255) / 256, 256>>>( + countBitSet<<<1, 1024>>>( sideCD, nBlocks, 0, 1, 0, thrust::raw_pointer_cast(res.data())); EXPECT_EQ(res[0], 0u); res[0] = 0; thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 1L); - psrdada_cpp::effelsberg::edd::countBitSet<<<(_sideChannelData.size() + 255) / 256, 256>>>( + psrdada_cpp::effelsberg::edd::countBitSet<<<1, 1024>>>( sideCD, nBlocks, 0, 1, 0, thrust::raw_pointer_cast(res.data())); EXPECT_EQ(res[0], nBlocks); @@ -135,13 +135,13 @@ TEST(GatedSpectrometer, countBitSet) { thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 0); for (size_t i = 2; i < _sideChannelData.size(); i += nSideChannels) _sideChannelData[i] = 1L; - psrdada_cpp::effelsberg::edd::countBitSet<<<(_sideChannelData.size() + 255) / 256, 256>>>( + psrdada_cpp::effelsberg::edd::countBitSet<<<1, 1024>>>( sideCD, nBlocks, 0, nSideChannels, 3, thrust::raw_pointer_cast(res.data())); EXPECT_EQ(res[0], 0u); res[0] = 0; - psrdada_cpp::effelsberg::edd::countBitSet<<<(_sideChannelData.size() + 255) / 256, 256>>>( + psrdada_cpp::effelsberg::edd::countBitSet<<<1, 1024>>>( sideCD, nBlocks, 0, nSideChannels, 2, thrust::raw_pointer_cast(res.data())); EXPECT_EQ(res[0], nBlocks / nSideChannels); -- GitLab