From d22f63f563050aca96c3b7239f1486d51de11a81 Mon Sep 17 00:00:00 2001
From: Tobias Winchen <tobias.winchen@rwth-aachen.de>
Date: Fri, 8 May 2020 11:50:34 +0200
Subject: [PATCH] Fixed mem transfer compute overlap

---
 .../effelsberg/edd/GatedSpectrometer.cuh      | 10 ++-
 .../edd/detail/GatedSpectrometer.cu           | 77 ++++++++++++++-----
 .../edd/test/src/GatedSpectrometerTest.cu     | 35 ++++++---
 3 files changed, 88 insertions(+), 34 deletions(-)

diff --git a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
index 847106d0..02a64e60 100644
--- a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
+++ b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
@@ -46,6 +46,12 @@ struct PolarizationData
     thrust::device_vector<UnpackedVoltageType> _baseLineG0;
     /// Baseline in gate 1 state
     thrust::device_vector<UnpackedVoltageType> _baseLineG1;
+
+    /// Baseline in gate 0 state after update
+    thrust::device_vector<UnpackedVoltageType> _baseLineG0_update;
+    /// Baseline in gate 1 state after update
+    thrust::device_vector<UnpackedVoltageType> _baseLineG1_update;
+
     /// Channelized voltage in gate 0 state
     thrust::device_vector<ChannelisedVoltageType> _channelised_voltage_G0;
     /// Channelized voltage in gate 1 state
@@ -241,8 +247,8 @@ private:
 __global__ void gating(float *G0, float *G1, const int64_t *sideChannelData,
                        size_t N, size_t heapSize, size_t bitpos,
                        size_t noOfSideChannels, size_t selectedSideChannel,
-                       const float  baseLineG0,
-                       const float  baseLineG1,
+                       const float*  __restrict__ _baseLineG0,
+                       const float*  __restrict__ _baseLineG1,
                        float* __restrict__ baseLineNG0,
                        float* __restrict__ baseLineNG1,
                        uint64_cu* stats_G0,
diff --git a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
index 23e1647b..679b7fc2 100644
--- a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
+++ b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
@@ -51,8 +51,8 @@ __global__ void gating(float* __restrict__ G0,
         const uint64_t* __restrict__ sideChannelData,
         size_t N, size_t heapSize, size_t bitpos,
         size_t noOfSideChannels, size_t selectedSideChannel,
-        const float  baseLineG0,
-        const float  baseLineG1,
+        const float*  __restrict__ _baseLineG0,
+        const float*  __restrict__ _baseLineG1,
         float* __restrict__ baseLineNG0,
         float* __restrict__ baseLineNG1,
         uint64_cu* stats_G0, uint64_cu* stats_G1) {
@@ -60,6 +60,9 @@ __global__ void gating(float* __restrict__ G0,
   uint32_t _G0stats = 0;
   uint32_t _G1stats = 0;
 
+  const float baseLineG0 = _baseLineG0[0];
+  const float baseLineG1 = _baseLineG1[0];
+
   float baselineUpdateG0 = 0;
   float baselineUpdateG1 = 0;
 
@@ -115,6 +118,36 @@ __global__ void gating(float* __restrict__ G0,
 
 
 
+// Updates the baselines of the gates for the polarization set for the next
+// block
+// only few output blocks per input block thus execution on only one thread.
+// Important is that the execution is async on the GPU.
+__global__ void update_baselines(float*  __restrict__ baseLineG0,
+        float*  __restrict__ baseLineG1,
+        float* __restrict__ baseLineNG0,
+        float* __restrict__ baseLineNG1,
+        uint64_cu* stats_G0, uint64_cu* stats_G1,
+        size_t N)
+{
+    size_t NG0 = 0;
+    size_t NG1 = 0;
+
+    for (size_t i =0; i < N; i++)
+    {
+       NG0 += stats_G0[i];
+       NG1 += stats_G1[i];
+    }
+
+    baseLineG0[0] = baseLineNG0[0] / NG0;
+    baseLineG1[0] = baseLineNG1[0] / NG1;
+    baseLineNG0[0] = 0;
+    baseLineNG1[0] = 0;
+}
+
+
+
+
+
 template <class HandlerType>
 GatedSpectrometer<HandlerType>::GatedSpectrometer(
     const DadaBufferLayout &dadaBufferLayout,
@@ -194,10 +227,14 @@ GatedSpectrometer<HandlerType>::GatedSpectrometer(
   _unpacked_voltage_G0.resize(_nsamps_per_buffer);
   _unpacked_voltage_G1.resize(_nsamps_per_buffer);
 
-  polarization0._baseLineG0.resize(1);
-  polarization0._baseLineG1.resize(1);
-  polarization1._baseLineG0.resize(1);
-  polarization1._baseLineG1.resize(1);
+    polarization0._baseLineG0.resize(1);
+    polarization0._baseLineG0_update.resize(1);
+    polarization0._baseLineG1.resize(1);
+    polarization0._baseLineG1_update.resize(1);
+    polarization1._baseLineG0.resize(1);
+    polarization1._baseLineG0_update.resize(1);
+    polarization1._baseLineG1.resize(1);
+    polarization1._baseLineG1_update.resize(1);
 
   BOOST_LOG_TRIVIAL(debug) << "  Unpacked voltages size (in samples): "
                            << _unpacked_voltage_G0.size();
@@ -286,14 +323,7 @@ void GatedSpectrometer<HandlerType>::gated_fft(
     throw std::runtime_error("Unsupported number of bits");
   }
 
-  // Get baseline from previous block
-  float previous_baseLineG0 = data._baseLineG0[0];
-  float previous_baseLineG1 = data._baseLineG1[0];
-
-  uint64_t NG0 = 0;
-  uint64_t NG1 = 0;
-
-// Loop over outputblocks, for case of multiple output blocks per input block
+  // Loop over outputblocks, for case of multiple output blocks per input block
   int step = data._sideChannelData.b().size() / _noOfBitSetsIn_G0.size();
 
   for (size_t i = 0; i < _noOfBitSetsIn_G0.size(); i++)
@@ -307,19 +337,26 @@ void GatedSpectrometer<HandlerType>::gated_fft(
       _selectedBit,
       _dadaBufferLayout.getNSideChannels(),
       _selectedSideChannel,
-      previous_baseLineG0, previous_baseLineG1,
       thrust::raw_pointer_cast(data._baseLineG0.data()),
       thrust::raw_pointer_cast(data._baseLineG1.data()),
+      thrust::raw_pointer_cast(data._baseLineG0_update.data()),
+      thrust::raw_pointer_cast(data._baseLineG1_update.data()),
       thrust::raw_pointer_cast(_noOfBitSetsIn_G0.data() + i),
       thrust::raw_pointer_cast(_noOfBitSetsIn_G1.data() + i)
       );
-    NG0 += _noOfBitSetsIn_G0[i];
-    NG1 += _noOfBitSetsIn_G1[i];
   }
-  data._baseLineG0[0] /= NG0;
-  data._baseLineG1[0] /= NG1;
-  BOOST_LOG_TRIVIAL(debug) << "Updating Baselines\n G0: " << previous_baseLineG0 << " -> " << data._baseLineG0[0] << ", " << previous_baseLineG1 << " -> " << data._baseLineG1[0] ;
 
+    // only few output blocks per input block thus execution on only one thread.
+    // Important is that the execution is async on the GPU.
+    update_baselines<<<1,1,0, _proc_stream>>>(
+        thrust::raw_pointer_cast(data._baseLineG0.data()),
+        thrust::raw_pointer_cast(data._baseLineG1.data()),
+        thrust::raw_pointer_cast(data._baseLineG0_update.data()),
+        thrust::raw_pointer_cast(data._baseLineG1_update.data()),
+        thrust::raw_pointer_cast(_noOfBitSetsIn_G0.data()),
+        thrust::raw_pointer_cast(_noOfBitSetsIn_G1.data()),
+        _noOfBitSetsIn_G0.size()
+            );
 
   BOOST_LOG_TRIVIAL(debug) << "Performing FFT 1";
   UnpackedVoltageType *_unpacked_voltage_ptr =
diff --git a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
index 18aa9fea..495375dd 100644
--- a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
+++ b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
@@ -152,6 +152,8 @@ TEST(GatedSpectrometer, GatingKernel)
   thrust::device_vector<float> baseLineG0(1);
   thrust::device_vector<float> baseLineG1(1);
 
+  thrust::device_vector<float> baseLineG0_update(1);
+  thrust::device_vector<float> baseLineG1_update(1);
   thrust::fill(G0.begin(), G0.end(), 42);
   thrust::fill(G1.begin(), G1.end(), 23);
   thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 0);
@@ -160,18 +162,22 @@ TEST(GatedSpectrometer, GatingKernel)
   {
     thrust::fill(_nG0.begin(), _nG0.end(), 0);
     thrust::fill(_nG1.begin(), _nG1.end(), 0);
-    baseLineG0[0] = 0.;
-    baseLineG1[0] = 0.;
+    baseLineG0[0] = -3;
+    baseLineG1[0] = -4;
+    baseLineG0_update[0] = 0;
+    baseLineG1_update[0] = 0;
+
     const uint64_t *sideCD =
         (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
     psrdada_cpp::effelsberg::edd::gating<<<1024 , 1024>>>(
           thrust::raw_pointer_cast(G0.data()),
           thrust::raw_pointer_cast(G1.data()), sideCD,
-          G0.size(), G0.size(), 0, 1,
+          G0.size(), blockSize, 0, 1,
           0,
-          -3., -4,
           thrust::raw_pointer_cast(baseLineG0.data()),
           thrust::raw_pointer_cast(baseLineG1.data()),
+          thrust::raw_pointer_cast(baseLineG0_update.data()),
+          thrust::raw_pointer_cast(baseLineG1_update.data()),
           thrust::raw_pointer_cast(_nG0.data()),
           thrust::raw_pointer_cast(_nG1.data())
           );
@@ -188,27 +194,31 @@ TEST(GatedSpectrometer, GatingKernel)
     EXPECT_EQ(_nG0[0], G0.size());
     EXPECT_EQ(_nG1[0], 0u);
 
-    EXPECT_FLOAT_EQ(baseLineG0[0] / (_nG0[0] + 1E-127), 42.f);
-    EXPECT_FLOAT_EQ(baseLineG1[0] / (_nG1[0] + 1E-127), 0.f);
+    EXPECT_FLOAT_EQ(42.f, baseLineG0_update[0] / (_nG0[0] + 1E-121));
+    EXPECT_FLOAT_EQ(0.f, baseLineG1_update[0] / (_nG1[0] + 1E-121));
   }
 
   // everything to G1 // with baseline -5
   {
     thrust::fill(_nG0.begin(), _nG0.end(), 0);
     thrust::fill(_nG1.begin(), _nG1.end(), 0);
-    baseLineG0[0] = 0.;
-    baseLineG1[0] = 0.;
+    baseLineG0[0] = 5.;
+    baseLineG1[0] = -2;
+    baseLineG0_update[0] = 0;
+    baseLineG1_update[0] = 0;
+
     thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 1L);
     const uint64_t *sideCD =
         (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
     psrdada_cpp::effelsberg::edd::gating<<<1024, 1024>>>(
           thrust::raw_pointer_cast(G0.data()),
           thrust::raw_pointer_cast(G1.data()), sideCD,
-          G0.size(), G0.size(), 0, 1,
+          G0.size(), blockSize, 0, 1,
           0,
-          5., -2.,
           thrust::raw_pointer_cast(baseLineG0.data()),
           thrust::raw_pointer_cast(baseLineG1.data()),
+          thrust::raw_pointer_cast(baseLineG0_update.data()),
+          thrust::raw_pointer_cast(baseLineG1_update.data()),
           thrust::raw_pointer_cast(_nG0.data()),
           thrust::raw_pointer_cast(_nG1.data())
           );
@@ -223,8 +233,9 @@ TEST(GatedSpectrometer, GatingKernel)
 
     EXPECT_EQ(_nG0[0], 0u);
     EXPECT_EQ(_nG1[0], G1.size());
-    EXPECT_FLOAT_EQ(baseLineG0[0] / (_nG0[0] + 1E-127), 0.);
-    EXPECT_FLOAT_EQ(baseLineG1[0] / (_nG1[0] + 1E-127), 42.);
+
+    EXPECT_FLOAT_EQ(0.f, baseLineG0_update[0] / (_nG0[0] + 1E-121));
+    EXPECT_FLOAT_EQ(42.f, baseLineG1_update[0] / (_nG1[0] + 1E-121));
   }
 }
 
-- 
GitLab