Count number of samples per spectra instead og ehaps

d0ef28a2 · Tobias Winchen · 794b9f08 · d0ef28a2 · d0ef28a2 · d0ef28a2
Commit d0ef28a2 authored May 17, 2019 by Tobias Winchen
--- a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
+++ b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
@@ -89,7 +89,7 @@ public:

 private:
  void process(thrust::device_vector<RawVoltageType> const &digitiser_raw,
-               thrust::device_vector<int64_t> const &sideChannelData,
+               thrust::device_vector<uint64_t> const &sideChannelData,
               thrust::device_vector<IntegratedPowerType> &detected,
               thrust::device_vector<size_t> &noOfBitSet);

@@ -103,6 +103,7 @@ private:
  std::size_t _batch;
  std::size_t _nsamps_per_output_spectra;
  std::size_t _nsamps_per_buffer;
+  std::size_t _nsamps_per_heap;

  HandlerType &_handler;
  cufftHandle _fft_plan;
@@ -113,7 +114,7 @@ private:

  // Input data
  DoubleDeviceBuffer<RawVoltageType> _raw_voltage_db;
-  DoubleDeviceBuffer<int64_t> _sideChannelData_db;
+  DoubleDeviceBuffer<uint64_t> _sideChannelData_db;

  // Output data
  DoubleDeviceBuffer<IntegratedPowerType> _power_db;


--- a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
+++ b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
@@ -8,6 +8,7 @@
 #include <thrust/system/cuda/execution_policy.h>

 #include <iostream>
+#include <iomanip>
 #include <cstring>
 #include <sstream>

@@ -15,14 +16,14 @@ namespace psrdada_cpp {
 namespace effelsberg {
 namespace edd {

-__global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const int64_t* __restrict__ sideChannelData,
+__global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const uint64_t* __restrict__ sideChannelData,
                       size_t N, size_t heapSize, size_t bitpos,
                       size_t noOfSideChannels, size_t selectedSideChannel, const float* __restrict__ _baseLineN) {
  float baseLine = (*_baseLineN) / N;
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < N);
       i += blockDim.x * gridDim.x) {
    const float w = G0[i] - baseLine;
-    const int64_t sideChannelItem =
+    const uint64_t sideChannelItem =
        sideChannelData[((i / heapSize) * (noOfSideChannels)) +
                        selectedSideChannel]; // Probably not optimal access as
                                              // same data is copied for several
@@ -36,23 +37,22 @@ __global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const int
 }


-__global__ void countBitSet(const int64_t *sideChannelData, size_t N, size_t
+__global__ void countBitSet(const uint64_t *sideChannelData, size_t N, size_t
    bitpos, size_t noOfSideChannels, size_t selectedSideChannel, size_t
    *nBitsSet)
 {
  // really not optimized reduction, but here only trivial array sizes.
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
-  __shared__ unsigned int x[256];
+  // run only in one block!
+  __shared__ size_t x[1024];
+  size_t ls = 0;

-  if (i == 0)
-    nBitsSet[0] = 0;
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < N);
+       i += blockDim.x * gridDim.x) {
+    ls += TEST_BIT(sideChannelData[i * noOfSideChannels + selectedSideChannel], bitpos);
+  }
+  x[threadIdx.x] = ls;

-  if (i * noOfSideChannels + selectedSideChannel < N)
-    x[threadIdx.x] = TEST_BIT(sideChannelData[i * noOfSideChannels + selectedSideChannel], bitpos);
-  else
-    x[threadIdx.x] = 0;
  __syncthreads();
-
  for(int s = blockDim.x / 2; s > 0; s = s / 2)
  {
    if (threadIdx.x < s)
@@ -111,7 +111,7 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer(
      _selectedSideChannel(selectedSideChannel), _selectedBit(selectedBit),
      _fft_length(fft_length),
      _naccumulate(naccumulate), _nbits(nbits), _handler(handler), _fft_plan(0),
-      _call_count(0) {
+      _call_count(0), _nsamps_per_heap(4096) {

  // Sanity checks
  assert(((_nbits == 12) || (_nbits == 8)));
@@ -191,8 +191,9 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer(
  BOOST_LOG_TRIVIAL(debug) << "  Powers size: " << _power_db.size() / 2;

  _noOfBitSetsInSideChannel.resize( batch / (_naccumulate / nBlocks));
-  thrust::fill(_noOfBitSetsInSideChannel.a().begin(), _noOfBitSetsInSideChannel.a().end(), 0.);
-  thrust::fill(_noOfBitSetsInSideChannel.b().begin(), _noOfBitSetsInSideChannel.b().end(), 0.);
+  thrust::fill(_noOfBitSetsInSideChannel.a().begin(), _noOfBitSetsInSideChannel.a().end(), 0L);
+  thrust::fill(_noOfBitSetsInSideChannel.b().begin(), _noOfBitSetsInSideChannel.b().end(), 0L);
+  BOOST_LOG_TRIVIAL(debug) << "  Bit set counrer size: " << _noOfBitSetsInSideChannel.size();

  // on the host both power are stored in the same data buffer together with
  // the number of bit sets
@@ -253,7 +254,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::init(RawBytes &block)
 template <class HandlerType, typename IntegratedPowerType>
 void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
    thrust::device_vector<RawVoltageType> const &digitiser_raw,
-    thrust::device_vector<int64_t> const &sideChannelData,
+    thrust::device_vector<uint64_t> const &sideChannelData,
    thrust::device_vector<IntegratedPowerType> &detected, thrust::device_vector<size_t> &noOfBitSet) {
  BOOST_LOG_TRIVIAL(debug) << "Unpacking raw voltages";
  switch (_nbits) {
@@ -280,11 +281,12 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(

  for (size_t i = 0; i < _noOfBitSetsInSideChannel.size(); i++)
  { // ToDo: Should be in one kernel call
-    countBitSet<<<(sideChannelData.size()+255)/256, 256, 0,
-      _proc_stream>>>(thrust::raw_pointer_cast(sideChannelData.data() + i * sideChannelData.size() / _noOfBitSetsInSideChannel.size() ),
+    countBitSet<<<1, 1024, 0, _proc_stream>>>(thrust::raw_pointer_cast(sideChannelData.data() + i * sideChannelData.size() / _noOfBitSetsInSideChannel.size() ),
          sideChannelData.size() / _noOfBitSetsInSideChannel.size(), _selectedBit,
          _dadaBufferLayout.getNSideChannels(), _selectedBit,
          thrust::raw_pointer_cast(noOfBitSet.data() + i));
+    
+    CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
  }

  BOOST_LOG_TRIVIAL(debug) << "Performing FFT 1";
@@ -337,6 +339,8 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
      static_cast<void *>(_sideChannelData_db.a_ptr()),
      static_cast<void *>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap()),
      _dadaBufferLayout.sizeOfSideChannelData(), cudaMemcpyHostToDevice, _h2d_stream));
+  BOOST_LOG_TRIVIAL(debug) << "First side channel item: 0x" <<   std::setw(12) << std::setfill('0') << std::hex <<  (reinterpret_cast<uint64_t*>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap()))[0] << std::dec;
+

  if (_call_count == 1) {
    return false;
@@ -354,7 +358,7 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
    _noOfBitSetsInSideChannel.swap();
    // move to specific stream!
    thrust::fill(thrust::cuda::par.on(_proc_stream),_power_db.a().begin(), _power_db.a().end(), 0.);
-    thrust::fill(thrust::cuda::par.on(_proc_stream), _noOfBitSetsInSideChannel.a().begin(), _noOfBitSetsInSideChannel.a().end(), 0.);
+    thrust::fill(thrust::cuda::par.on(_proc_stream), _noOfBitSetsInSideChannel.a().begin(), _noOfBitSetsInSideChannel.a().end(), 0L);
  }

  process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db.a(), _noOfBitSetsInSideChannel.a());
@@ -379,10 +383,11 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
                        cudaMemcpyDeviceToHost, _d2h_stream));
    // copy noOf bit set data
    CUDA_ERROR_CHECK(
-        cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 2 * _nchans ),
+        cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType)),
          static_cast<void *>(_noOfBitSetsInSideChannel.b_ptr() + i ),
            1 * sizeof(size_t),
            cudaMemcpyDeviceToHost, _d2h_stream));
+    BOOST_LOG_TRIVIAL(info) << " TOBR NOF BITS SET: " << _noOfBitSetsInSideChannel.b()[i]; 
  }

  BOOST_LOG_TRIVIAL(debug) << "Copy Data back to host";
@@ -398,10 +403,11 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
    size_t memOffset = 2 * i * (_nchans * sizeof(IntegratedPowerType) + sizeof(size_t));

    size_t* on_values = reinterpret_cast<size_t*> (_host_power_db.b_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType));
+    *on_values *= _nsamps_per_heap;
    size_t* off_values = reinterpret_cast<size_t*> (_host_power_db.b_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType) + sizeof(size_t));
-    *off_values =  _dadaBufferLayout.getNHeaps() - (*on_values);
+    *off_values =  _nsamps_per_output_spectra - (*on_values);

-    BOOST_LOG_TRIVIAL(info) << "    " << i << ": No of bit set in side channel: " << *on_values << " / " << *off_values << std::endl;
+    BOOST_LOG_TRIVIAL(info) << "    " << i << ": No of samples wo/w. bit set in side channel: " << *on_values << " / " << *off_values << std::endl;
  }

  // Wrap in a RawBytes object here;


--- a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
+++ b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
@@ -11,7 +11,7 @@ namespace {

 TEST(GatedSpectrometer, BitManipulationMacros) {
  for (int i = 0; i < 64; i++) {
-    int64_t v = 0;
+    uint64_t v = 0;
    SET_BIT(v, i);

    for (int j = 0; j < 64; j++) {
@@ -57,7 +57,7 @@ TEST(GatedSpectrometer, GatingKernel)

  thrust::device_vector<float> G0(blockSize * nBlocks);
  thrust::device_vector<float> G1(blockSize * nBlocks);
-  thrust::device_vector<int64_t> _sideChannelData(nBlocks);
+  thrust::device_vector<uint64_t> _sideChannelData(nBlocks);
  thrust::device_vector<float> baseLine(1);

  thrust::fill(G0.begin(), G0.end(), 42);
@@ -67,8 +67,8 @@ TEST(GatedSpectrometer, GatingKernel)
  // everything to G0
  {
    baseLine[0] = 0.;
-    const int64_t *sideCD =
-        (int64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
+    const uint64_t *sideCD =
+        (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
    psrdada_cpp::effelsberg::edd::gating<<<1024, 1024>>>(
          thrust::raw_pointer_cast(G0.data()),
          thrust::raw_pointer_cast(G1.data()), sideCD,
@@ -88,8 +88,8 @@ TEST(GatedSpectrometer, GatingKernel)
  {
    baseLine[0] = -5. * G0.size();
    thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 1L);
-    const int64_t *sideCD =
-        (int64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
+    const uint64_t *sideCD =
+        (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
    psrdada_cpp::effelsberg::edd::gating<<<1024, 1024>>>(
          thrust::raw_pointer_cast(G0.data()),
          thrust::raw_pointer_cast(G1.data()), sideCD,
@@ -108,24 +108,24 @@ TEST(GatedSpectrometer, GatingKernel)


 TEST(GatedSpectrometer, countBitSet) {
-  size_t nBlocks = 16;
-  thrust::device_vector<int64_t> _sideChannelData(nBlocks);
+  size_t nBlocks = 100000;
+  thrust::device_vector<uint64_t> _sideChannelData(nBlocks);
  thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 0);
-  const int64_t *sideCD =
-      (int64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
+  const uint64_t *sideCD =
+      (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));

  thrust::device_vector<size_t> res(1);

  // test 1 side channel
  res[0] = 0;
  psrdada_cpp::effelsberg::edd::
-      countBitSet<<<(_sideChannelData.size() + 255) / 256, 256>>>(
+      countBitSet<<<1, 1024>>>(
    sideCD, nBlocks, 0, 1, 0, thrust::raw_pointer_cast(res.data()));
  EXPECT_EQ(res[0], 0u);

  res[0] = 0;
  thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 1L);
-  psrdada_cpp::effelsberg::edd::countBitSet<<<(_sideChannelData.size() + 255) / 256, 256>>>(
+  psrdada_cpp::effelsberg::edd::countBitSet<<<1, 1024>>>(
    sideCD, nBlocks, 0, 1, 0, thrust::raw_pointer_cast(res.data()));
  EXPECT_EQ(res[0], nBlocks);

@@ -135,13 +135,13 @@ TEST(GatedSpectrometer, countBitSet) {
  thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 0);
  for (size_t i = 2; i < _sideChannelData.size(); i += nSideChannels)
    _sideChannelData[i] = 1L;
-  psrdada_cpp::effelsberg::edd::countBitSet<<<(_sideChannelData.size() + 255) / 256, 256>>>(
+  psrdada_cpp::effelsberg::edd::countBitSet<<<1, 1024>>>(
    sideCD, nBlocks, 0, nSideChannels, 3,
    thrust::raw_pointer_cast(res.data()));
  EXPECT_EQ(res[0], 0u);

  res[0] = 0;
-  psrdada_cpp::effelsberg::edd::countBitSet<<<(_sideChannelData.size() + 255) / 256, 256>>>(
+  psrdada_cpp::effelsberg::edd::countBitSet<<<1, 1024>>>(
    sideCD, nBlocks, 0, nSideChannels, 2,
    thrust::raw_pointer_cast(res.data()));
  EXPECT_EQ(res[0], nBlocks / nSideChannels);