From d0ef28a2b174534e998bfd37385c3e876493601a Mon Sep 17 00:00:00 2001
From: Tobias Winchen <tobias.winchen@rwth-aachen.de>
Date: Fri, 17 May 2019 12:58:02 +0000
Subject: [PATCH] Count number of samples per spectra instead og ehaps

---
 .../effelsberg/edd/GatedSpectrometer.cuh      |  5 +-
 .../edd/detail/GatedSpectrometer.cu           | 50 +++++++++++--------
 .../edd/test/src/GatedSpectrometerTest.cu     | 28 +++++------
 3 files changed, 45 insertions(+), 38 deletions(-)

diff --git a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
index 145bce51..44b3ffb4 100644
--- a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
+++ b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
@@ -89,7 +89,7 @@ public:
 
 private:
   void process(thrust::device_vector<RawVoltageType> const &digitiser_raw,
-               thrust::device_vector<int64_t> const &sideChannelData,
+               thrust::device_vector<uint64_t> const &sideChannelData,
                thrust::device_vector<IntegratedPowerType> &detected,
                thrust::device_vector<size_t> &noOfBitSet);
 
@@ -103,6 +103,7 @@ private:
   std::size_t _batch;
   std::size_t _nsamps_per_output_spectra;
   std::size_t _nsamps_per_buffer;
+  std::size_t _nsamps_per_heap;
 
   HandlerType &_handler;
   cufftHandle _fft_plan;
@@ -113,7 +114,7 @@ private:
 
   // Input data
   DoubleDeviceBuffer<RawVoltageType> _raw_voltage_db;
-  DoubleDeviceBuffer<int64_t> _sideChannelData_db;
+  DoubleDeviceBuffer<uint64_t> _sideChannelData_db;
 
   // Output data
   DoubleDeviceBuffer<IntegratedPowerType> _power_db;
diff --git a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
index 0cfb1af5..4a4e0d51 100644
--- a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
+++ b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
@@ -8,6 +8,7 @@
 #include <thrust/system/cuda/execution_policy.h>
 
 #include <iostream>
+#include <iomanip>
 #include <cstring>
 #include <sstream>
 
@@ -15,14 +16,14 @@ namespace psrdada_cpp {
 namespace effelsberg {
 namespace edd {
 
-__global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const int64_t* __restrict__ sideChannelData,
+__global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const uint64_t* __restrict__ sideChannelData,
                        size_t N, size_t heapSize, size_t bitpos,
                        size_t noOfSideChannels, size_t selectedSideChannel, const float* __restrict__ _baseLineN) {
   float baseLine = (*_baseLineN) / N;
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < N);
        i += blockDim.x * gridDim.x) {
     const float w = G0[i] - baseLine;
-    const int64_t sideChannelItem =
+    const uint64_t sideChannelItem =
         sideChannelData[((i / heapSize) * (noOfSideChannels)) +
                         selectedSideChannel]; // Probably not optimal access as
                                               // same data is copied for several
@@ -36,23 +37,22 @@ __global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const int
 }
 
 
-__global__ void countBitSet(const int64_t *sideChannelData, size_t N, size_t
+__global__ void countBitSet(const uint64_t *sideChannelData, size_t N, size_t
     bitpos, size_t noOfSideChannels, size_t selectedSideChannel, size_t
     *nBitsSet)
 {
   // really not optimized reduction, but here only trivial array sizes.
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
-  __shared__ unsigned int x[256];
+  // run only in one block!
+  __shared__ size_t x[1024];
+  size_t ls = 0;
 
-  if (i == 0)
-    nBitsSet[0] = 0;
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < N);
+       i += blockDim.x * gridDim.x) {
+    ls += TEST_BIT(sideChannelData[i * noOfSideChannels + selectedSideChannel], bitpos);
+  }
+  x[threadIdx.x] = ls;
 
-  if (i * noOfSideChannels + selectedSideChannel < N)
-    x[threadIdx.x] = TEST_BIT(sideChannelData[i * noOfSideChannels + selectedSideChannel], bitpos);
-  else
-    x[threadIdx.x] = 0;
   __syncthreads();
-
   for(int s = blockDim.x / 2; s > 0; s = s / 2)
   {
     if (threadIdx.x < s)
@@ -111,7 +111,7 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer(
       _selectedSideChannel(selectedSideChannel), _selectedBit(selectedBit),
       _fft_length(fft_length),
       _naccumulate(naccumulate), _nbits(nbits), _handler(handler), _fft_plan(0),
-      _call_count(0) {
+      _call_count(0), _nsamps_per_heap(4096) {
 
   // Sanity checks
   assert(((_nbits == 12) || (_nbits == 8)));
@@ -191,8 +191,9 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer(
   BOOST_LOG_TRIVIAL(debug) << "  Powers size: " << _power_db.size() / 2;
 
   _noOfBitSetsInSideChannel.resize( batch / (_naccumulate / nBlocks));
-  thrust::fill(_noOfBitSetsInSideChannel.a().begin(), _noOfBitSetsInSideChannel.a().end(), 0.);
-  thrust::fill(_noOfBitSetsInSideChannel.b().begin(), _noOfBitSetsInSideChannel.b().end(), 0.);
+  thrust::fill(_noOfBitSetsInSideChannel.a().begin(), _noOfBitSetsInSideChannel.a().end(), 0L);
+  thrust::fill(_noOfBitSetsInSideChannel.b().begin(), _noOfBitSetsInSideChannel.b().end(), 0L);
+  BOOST_LOG_TRIVIAL(debug) << "  Bit set counrer size: " << _noOfBitSetsInSideChannel.size();
 
   // on the host both power are stored in the same data buffer together with
   // the number of bit sets
@@ -253,7 +254,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::init(RawBytes &block)
 template <class HandlerType, typename IntegratedPowerType>
 void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
     thrust::device_vector<RawVoltageType> const &digitiser_raw,
-    thrust::device_vector<int64_t> const &sideChannelData,
+    thrust::device_vector<uint64_t> const &sideChannelData,
     thrust::device_vector<IntegratedPowerType> &detected, thrust::device_vector<size_t> &noOfBitSet) {
   BOOST_LOG_TRIVIAL(debug) << "Unpacking raw voltages";
   switch (_nbits) {
@@ -280,11 +281,12 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
 
   for (size_t i = 0; i < _noOfBitSetsInSideChannel.size(); i++)
   { // ToDo: Should be in one kernel call
-    countBitSet<<<(sideChannelData.size()+255)/256, 256, 0,
-      _proc_stream>>>(thrust::raw_pointer_cast(sideChannelData.data() + i * sideChannelData.size() / _noOfBitSetsInSideChannel.size() ),
+    countBitSet<<<1, 1024, 0, _proc_stream>>>(thrust::raw_pointer_cast(sideChannelData.data() + i * sideChannelData.size() / _noOfBitSetsInSideChannel.size() ),
           sideChannelData.size() / _noOfBitSetsInSideChannel.size(), _selectedBit,
           _dadaBufferLayout.getNSideChannels(), _selectedBit,
           thrust::raw_pointer_cast(noOfBitSet.data() + i));
+    
+    CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
   }
 
   BOOST_LOG_TRIVIAL(debug) << "Performing FFT 1";
@@ -337,6 +339,8 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
       static_cast<void *>(_sideChannelData_db.a_ptr()),
       static_cast<void *>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap()),
       _dadaBufferLayout.sizeOfSideChannelData(), cudaMemcpyHostToDevice, _h2d_stream));
+  BOOST_LOG_TRIVIAL(debug) << "First side channel item: 0x" <<   std::setw(12) << std::setfill('0') << std::hex <<  (reinterpret_cast<uint64_t*>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap()))[0] << std::dec;
+
 
   if (_call_count == 1) {
     return false;
@@ -354,7 +358,7 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
     _noOfBitSetsInSideChannel.swap();
     // move to specific stream!
     thrust::fill(thrust::cuda::par.on(_proc_stream),_power_db.a().begin(), _power_db.a().end(), 0.);
-    thrust::fill(thrust::cuda::par.on(_proc_stream), _noOfBitSetsInSideChannel.a().begin(), _noOfBitSetsInSideChannel.a().end(), 0.);
+    thrust::fill(thrust::cuda::par.on(_proc_stream), _noOfBitSetsInSideChannel.a().begin(), _noOfBitSetsInSideChannel.a().end(), 0L);
   }
 
   process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db.a(), _noOfBitSetsInSideChannel.a());
@@ -379,10 +383,11 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
                         cudaMemcpyDeviceToHost, _d2h_stream));
     // copy noOf bit set data
     CUDA_ERROR_CHECK(
-        cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 2 * _nchans ),
+        cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType)),
           static_cast<void *>(_noOfBitSetsInSideChannel.b_ptr() + i ),
             1 * sizeof(size_t),
             cudaMemcpyDeviceToHost, _d2h_stream));
+    BOOST_LOG_TRIVIAL(info) << " TOBR NOF BITS SET: " << _noOfBitSetsInSideChannel.b()[i]; 
   }
 
   BOOST_LOG_TRIVIAL(debug) << "Copy Data back to host";
@@ -398,10 +403,11 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
     size_t memOffset = 2 * i * (_nchans * sizeof(IntegratedPowerType) + sizeof(size_t));
 
     size_t* on_values = reinterpret_cast<size_t*> (_host_power_db.b_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType));
+    *on_values *= _nsamps_per_heap;
     size_t* off_values = reinterpret_cast<size_t*> (_host_power_db.b_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType) + sizeof(size_t));
-    *off_values =  _dadaBufferLayout.getNHeaps() - (*on_values);
+    *off_values =  _nsamps_per_output_spectra - (*on_values);
 
-    BOOST_LOG_TRIVIAL(info) << "    " << i << ": No of bit set in side channel: " << *on_values << " / " << *off_values << std::endl;
+    BOOST_LOG_TRIVIAL(info) << "    " << i << ": No of samples wo/w. bit set in side channel: " << *on_values << " / " << *off_values << std::endl;
   }
 
   // Wrap in a RawBytes object here;
diff --git a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
index 22babc11..1f50d034 100644
--- a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
+++ b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
@@ -11,7 +11,7 @@ namespace {
 
 TEST(GatedSpectrometer, BitManipulationMacros) {
   for (int i = 0; i < 64; i++) {
-    int64_t v = 0;
+    uint64_t v = 0;
     SET_BIT(v, i);
 
     for (int j = 0; j < 64; j++) {
@@ -57,7 +57,7 @@ TEST(GatedSpectrometer, GatingKernel)
 
   thrust::device_vector<float> G0(blockSize * nBlocks);
   thrust::device_vector<float> G1(blockSize * nBlocks);
-  thrust::device_vector<int64_t> _sideChannelData(nBlocks);
+  thrust::device_vector<uint64_t> _sideChannelData(nBlocks);
   thrust::device_vector<float> baseLine(1);
 
   thrust::fill(G0.begin(), G0.end(), 42);
@@ -67,8 +67,8 @@ TEST(GatedSpectrometer, GatingKernel)
   // everything to G0
   {
     baseLine[0] = 0.;
-    const int64_t *sideCD =
-        (int64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
+    const uint64_t *sideCD =
+        (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
     psrdada_cpp::effelsberg::edd::gating<<<1024, 1024>>>(
           thrust::raw_pointer_cast(G0.data()),
           thrust::raw_pointer_cast(G1.data()), sideCD,
@@ -88,8 +88,8 @@ TEST(GatedSpectrometer, GatingKernel)
   {
     baseLine[0] = -5. * G0.size();
     thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 1L);
-    const int64_t *sideCD =
-        (int64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
+    const uint64_t *sideCD =
+        (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
     psrdada_cpp::effelsberg::edd::gating<<<1024, 1024>>>(
           thrust::raw_pointer_cast(G0.data()),
           thrust::raw_pointer_cast(G1.data()), sideCD,
@@ -108,24 +108,24 @@ TEST(GatedSpectrometer, GatingKernel)
 
 
 TEST(GatedSpectrometer, countBitSet) {
-  size_t nBlocks = 16;
-  thrust::device_vector<int64_t> _sideChannelData(nBlocks);
+  size_t nBlocks = 100000;
+  thrust::device_vector<uint64_t> _sideChannelData(nBlocks);
   thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 0);
-  const int64_t *sideCD =
-      (int64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
+  const uint64_t *sideCD =
+      (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
 
   thrust::device_vector<size_t> res(1);
 
   // test 1 side channel
   res[0] = 0;
   psrdada_cpp::effelsberg::edd::
-      countBitSet<<<(_sideChannelData.size() + 255) / 256, 256>>>(
+      countBitSet<<<1, 1024>>>(
     sideCD, nBlocks, 0, 1, 0, thrust::raw_pointer_cast(res.data()));
   EXPECT_EQ(res[0], 0u);
 
   res[0] = 0;
   thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 1L);
-  psrdada_cpp::effelsberg::edd::countBitSet<<<(_sideChannelData.size() + 255) / 256, 256>>>(
+  psrdada_cpp::effelsberg::edd::countBitSet<<<1, 1024>>>(
     sideCD, nBlocks, 0, 1, 0, thrust::raw_pointer_cast(res.data()));
   EXPECT_EQ(res[0], nBlocks);
 
@@ -135,13 +135,13 @@ TEST(GatedSpectrometer, countBitSet) {
   thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 0);
   for (size_t i = 2; i < _sideChannelData.size(); i += nSideChannels)
     _sideChannelData[i] = 1L;
-  psrdada_cpp::effelsberg::edd::countBitSet<<<(_sideChannelData.size() + 255) / 256, 256>>>(
+  psrdada_cpp::effelsberg::edd::countBitSet<<<1, 1024>>>(
     sideCD, nBlocks, 0, nSideChannels, 3,
     thrust::raw_pointer_cast(res.data()));
   EXPECT_EQ(res[0], 0u);
 
   res[0] = 0;
-  psrdada_cpp::effelsberg::edd::countBitSet<<<(_sideChannelData.size() + 255) / 256, 256>>>(
+  psrdada_cpp::effelsberg::edd::countBitSet<<<1, 1024>>>(
     sideCD, nBlocks, 0, nSideChannels, 2,
     thrust::raw_pointer_cast(res.data()));
   EXPECT_EQ(res[0], nBlocks / nSideChannels);
-- 
GitLab