diff --git a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
index 4bd3cc5addc50755c1fc4f29b2a55f3b5f15be39..d716594d4f9b4ababaa23fa827ad6b038fc92663 100644
--- a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
+++ b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
@@ -133,7 +133,8 @@ private:
   thrust::device_vector<UnpackedVoltageType> _unpacked_voltage_G0;
   thrust::device_vector<UnpackedVoltageType> _unpacked_voltage_G1;
   thrust::device_vector<ChannelisedVoltageType> _channelised_voltage;
-  thrust::device_vector<UnpackedVoltageType> _baseLineN;
+  thrust::device_vector<UnpackedVoltageType> _baseLineNG0;
+  thrust::device_vector<UnpackedVoltageType> _baseLineNG1;
 
   cudaStream_t _h2d_stream;
   cudaStream_t _proc_stream;
@@ -169,8 +170,12 @@ private:
 __global__ void gating(float *G0, float *G1, const int64_t *sideChannelData,
                        size_t N, size_t heapSize, size_t bitpos,
                        size_t noOfSideChannels, size_t selectedSideChannel,
-                       const float *_baseLine, uint64_cu* stats_G0, uint64_cu*
-                       stats_G1);
+                       const float  baseLineG0,
+                       const float  baseLineG1,
+                       float* __restrict__ baseLineNG0,
+                       float* __restrict__ baseLineNG1,
+                       uint64_cu* stats_G0,
+                       uint64_cu* stats_G1);
 
 
 
diff --git a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
index 853a9af776524eedea0972ba80e493da76c3cf95..bd33eaecc2b0be867a8fe7d6768abf1c329ffcc1 100644
--- a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
+++ b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
@@ -36,16 +36,25 @@ __device__ void reduce(T *x, const T &v)
 __global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const uint64_t* __restrict__ sideChannelData,
                        size_t N, size_t heapSize, size_t bitpos,
                        size_t noOfSideChannels, size_t selectedSideChannel,
-                       const float* __restrict__ _baseLineN, uint64_cu* stats_G0, uint64_cu* stats_G1) {
-  float baseLine = (*_baseLineN) / N;
+                       const float  baseLineG0,
+                       const float  baseLineG1,
+                       float* __restrict__ baseLineNG0,
+                       float* __restrict__ baseLineNG1,
+                       uint64_cu* stats_G0, uint64_cu* stats_G1) {
+//  float baseLineG0 = (*_baseLineNG0) / N;
+ // float baseLineG1 = (*_baseLineNG1) / N;
 
   // statistics values for samopels to G0, G1
   uint32_t _G0stats = 0;
   uint32_t _G1stats = 0;
 
+  float baselineUpdateG0 = 0;
+  float baselineUpdateG1 = 0;
+
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < N);
        i += blockDim.x * gridDim.x) {
-    const float w = G0[i] - baseLine;
+    const float v = G0[i];
+
     const uint64_t sideChannelItem =
         sideChannelData[((i / heapSize) * (noOfSideChannels)) +
                         selectedSideChannel]; // Probably not optimal access as
@@ -55,10 +64,14 @@ __global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const uin
 
     const unsigned int bit_set = TEST_BIT(sideChannelItem, bitpos);
     const unsigned int heap_lost = TEST_BIT(sideChannelItem, 63);
-    G1[i] = w * bit_set * (!heap_lost) + baseLine;
-    G0[i] = w * (!bit_set) *(!heap_lost) + baseLine;
+    G1[i] = (v - baseLineG1) * bit_set * (!heap_lost) + baseLineG1;
+    G0[i] = (v - baseLineG0) * (!bit_set) *(!heap_lost) + baseLineG0;
+
     _G0stats += (!bit_set) *(!heap_lost);
     _G1stats += bit_set * (!heap_lost);
+
+    baselineUpdateG1 += v * bit_set * (!heap_lost);
+    baselineUpdateG0 += v * (!bit_set) *(!heap_lost);
   }
   __shared__ uint32_t x[1024];
 
@@ -67,40 +80,27 @@ __global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const uin
   if(threadIdx.x == 0)
     atomicAdd(stats_G0,  (uint64_cu) x[threadIdx.x]);
 
+  __syncthreads();
   reduce<uint32_t>(x, _G1stats);
   if(threadIdx.x == 0)
     atomicAdd(stats_G1,  (uint64_cu) x[threadIdx.x]);
-}
 
+  //reuse shared array
+  float *y = (float*) x;
 
-__global__ void countBitSet(const uint64_t *sideChannelData, size_t N, size_t
-    bitpos, size_t noOfSideChannels, size_t selectedSideChannel, size_t
-    *nBitsSet)
-{
-  // really not optimized reduction, but here only trivial array sizes.
-  // run only in one block!
-  __shared__ uint64_t x[1024];
-  uint64_t ls = 0;
-
-  for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < N);
-       i += blockDim.x * gridDim.x) {
-    ls += TEST_BIT(sideChannelData[i * noOfSideChannels + selectedSideChannel], bitpos);
-  }
-  x[threadIdx.x] = ls;
+  //update the baseline array
+  reduce<float>(y, baselineUpdateG0);
+  if(threadIdx.x == 0)
+    atomicAdd(baseLineNG0, y[threadIdx.x]);
 
   __syncthreads();
-  for(int s = blockDim.x / 2; s > 0; s = s / 2)
-  {
-    if (threadIdx.x < s)
-      x[threadIdx.x] += x[threadIdx.x + s];
-    __syncthreads();
-  }
-
+  reduce<float>(y, baselineUpdateG1);
   if(threadIdx.x == 0)
-   nBitsSet[0] += x[threadIdx.x];
+    atomicAdd(baseLineNG1, y[threadIdx.x]);
 }
 
 
+
 template <class HandlerType, typename IntegratedPowerType>
 GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer(
     const DadaBufferLayout &dadaBufferLayout,
@@ -178,7 +178,8 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer(
   _unpacked_voltage_G0.resize(_nsamps_per_buffer);
   _unpacked_voltage_G1.resize(_nsamps_per_buffer);
 
-  _baseLineN.resize(array_sum_Nthreads);
+  _baseLineNG0.resize(1);
+  _baseLineNG1.resize(1);
   BOOST_LOG_TRIVIAL(debug) << "  Unpacked voltages size (in samples): "
                            << _unpacked_voltage_G0.size();
   _channelised_voltage.resize(_nchans * batch);
@@ -270,12 +271,17 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
     throw std::runtime_error("Unsupported number of bits");
   }
   BOOST_LOG_TRIVIAL(debug) << "Calculate baseline";
-  psrdada_cpp::effelsberg::edd::array_sum<<<64, array_sum_Nthreads, 0, _proc_stream>>>(thrust::raw_pointer_cast(_unpacked_voltage_G0.data()), _unpacked_voltage_G0.size(), thrust::raw_pointer_cast(_baseLineN.data()));
-  psrdada_cpp::effelsberg::edd::array_sum<<<1, array_sum_Nthreads, 0, _proc_stream>>>(thrust::raw_pointer_cast(_baseLineN.data()), _baseLineN.size(), thrust::raw_pointer_cast(_baseLineN.data()));
+  //calculate baseline from previos block
 
   BOOST_LOG_TRIVIAL(debug) << "Perform gating";
 
+  float baseLineG0 = _baseLineNG0[0];
+  float baseLineG1 = _baseLineNG1[0];
 
+  uint64_t NG0 = 0;
+  uint64_t NG1 = 0;
+
+  // Loop over outputblocks, for case of multiple output blocks per input block
   for (size_t i = 0; i < noOfBitSetsIn_G0.size(); i++)
   { // ToDo: Should be in one kernel call
   gating<<<1024, 1024, 0, _proc_stream>>>(
@@ -283,11 +289,20 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
       thrust::raw_pointer_cast(_unpacked_voltage_G1.data() + i * sideChannelData.size() / noOfBitSetsIn_G0.size()),
       thrust::raw_pointer_cast(sideChannelData.data() + i * sideChannelData.size() / noOfBitSetsIn_G0.size()),
       _unpacked_voltage_G0.size() / noOfBitSetsIn_G0.size(), _dadaBufferLayout.getHeapSize(), _selectedBit, _dadaBufferLayout.getNSideChannels(),
-      _selectedSideChannel, thrust::raw_pointer_cast(_baseLineN.data()),
+      _selectedSideChannel,
+      baseLineG0, baseLineG1,
+      thrust::raw_pointer_cast(_baseLineNG0.data()),
+      thrust::raw_pointer_cast(_baseLineNG1.data()),
       thrust::raw_pointer_cast(noOfBitSetsIn_G0.data() + i),
       thrust::raw_pointer_cast(noOfBitSetsIn_G1.data() + i)
       );
+    NG0 += noOfBitSetsIn_G0[i];
+    NG1 += noOfBitSetsIn_G1[i];
   }
+  _baseLineNG0[0] /= NG0;
+  _baseLineNG1[0] /= NG1;
+  BOOST_LOG_TRIVIAL(debug) << "Updating Baselines\n G0: " << baseLineG0 << " -> " << _baseLineNG0[0] << ", " << baseLineG1 << " -> " << _baseLineNG1[0] ;
+
 
   BOOST_LOG_TRIVIAL(debug) << "Performing FFT 1";
   UnpackedVoltageType *_unpacked_voltage_ptr =
@@ -339,7 +354,11 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
       static_cast<void *>(_sideChannelData_db.a_ptr()),
       static_cast<void *>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap()),
       _dadaBufferLayout.sizeOfSideChannelData(), cudaMemcpyHostToDevice, _h2d_stream));
-  BOOST_LOG_TRIVIAL(debug) << "First side channel item: 0x" <<   std::setw(16) << std::setfill('0') << std::hex <<  (reinterpret_cast<uint64_t*>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap()))[0] << std::dec;
+  BOOST_LOG_TRIVIAL(debug) << "First side channel item: 0x" <<   std::setw(16)
+      << std::setfill('0') << std::hex <<
+      (reinterpret_cast<uint64_t*>(block.ptr() + _dadaBufferLayout.sizeOfData()
+                                   + _dadaBufferLayout.sizeOfGap()))[0] <<
+      std::dec;
 
 
   if (_call_count == 1) {
diff --git a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
index dd51a3b41cc80617dbc81c69845d37dc19fe90a2..163e8e9fba27da62bac0eee7654316ddbe61fee4 100644
--- a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
+++ b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
@@ -52,15 +52,16 @@ TEST(GatedSpectrometer, BitManipulationMacros) {
 
 TEST(GatedSpectrometer, GatingKernel)
 {
-  size_t blockSize = 1024;
-  size_t nBlocks = 16 * 1024;
+  const size_t blockSize = 1024;
+  const size_t nBlocks = 16 * 1024;
 
   thrust::device_vector<float> G0(blockSize * nBlocks);
   thrust::device_vector<float> G1(blockSize * nBlocks);
   thrust::device_vector<uint64_t> _sideChannelData(nBlocks);
-  thrust::device_vector<psrdada_cpp::effelsberg::edd::uint64_cu> _nG0(1);
-  thrust::device_vector<psrdada_cpp::effelsberg::edd::uint64_cu> _nG1(1);
-  thrust::device_vector<float> baseLine(1);
+  thrust::device_vector<psrdada_cpp::effelsberg::edd::uint64_cu> _nG0(nBlocks);
+  thrust::device_vector<psrdada_cpp::effelsberg::edd::uint64_cu> _nG1(nBlocks);
+  thrust::device_vector<float> baseLineG0(1);
+  thrust::device_vector<float> baseLineG1(1);
 
   thrust::fill(G0.begin(), G0.end(), 42);
   thrust::fill(G1.begin(), G1.end(), 23);
@@ -70,50 +71,62 @@ TEST(GatedSpectrometer, GatingKernel)
   {
     thrust::fill(_nG0.begin(), _nG0.end(), 0);
     thrust::fill(_nG1.begin(), _nG1.end(), 0);
-    baseLine[0] = 0.;
+    baseLineG0[0] = 0.;
+    baseLineG1[0] = 0.;
     const uint64_t *sideCD =
         (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
-    psrdada_cpp::effelsberg::edd::gating<<<1024, 1024>>>(
+    psrdada_cpp::effelsberg::edd::gating<<<1024 , 1024>>>(
           thrust::raw_pointer_cast(G0.data()),
           thrust::raw_pointer_cast(G1.data()), sideCD,
-          G0.size(), blockSize, 0, 1,
-          0, thrust::raw_pointer_cast(baseLine.data()),
+          G0.size(), G0.size(), 0, 1,
+          0,
+          -3., -4,
+          thrust::raw_pointer_cast(baseLineG0.data()),
+          thrust::raw_pointer_cast(baseLineG1.data()),
           thrust::raw_pointer_cast(_nG0.data()),
           thrust::raw_pointer_cast(_nG1.data())
           );
+
     thrust::pair<thrust::device_vector<float>::iterator, thrust::device_vector<float>::iterator> minmax;
     minmax = thrust::minmax_element(G0.begin(), G0.end());
     EXPECT_EQ(*minmax.first, 42);
     EXPECT_EQ(*minmax.second, 42);
 
     minmax = thrust::minmax_element(G1.begin(), G1.end());
-    EXPECT_EQ(*minmax.first, 0);
-    EXPECT_EQ(*minmax.second, 0);
+    EXPECT_EQ(*minmax.first, -4.);
+    EXPECT_EQ(*minmax.second, -4.);
 
     EXPECT_EQ(_nG0[0], G0.size());
     EXPECT_EQ(_nG1[0], 0u);
+
+    EXPECT_FLOAT_EQ(baseLineG0[0] / (_nG0[0] + 1E-127), 42.f);
+    EXPECT_FLOAT_EQ(baseLineG1[0] / (_nG1[0] + 1E-127), 0.f);
   }
 
   // everything to G1 // with baseline -5
   {
     thrust::fill(_nG0.begin(), _nG0.end(), 0);
     thrust::fill(_nG1.begin(), _nG1.end(), 0);
-    baseLine[0] = -5. * G0.size();
+    baseLineG0[0] = 0.;
+    baseLineG1[0] = 0.;
     thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 1L);
     const uint64_t *sideCD =
         (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
     psrdada_cpp::effelsberg::edd::gating<<<1024, 1024>>>(
           thrust::raw_pointer_cast(G0.data()),
           thrust::raw_pointer_cast(G1.data()), sideCD,
-          G0.size(), blockSize, 0, 1,
-          0, thrust::raw_pointer_cast(baseLine.data()),
+          G0.size(), G0.size(), 0, 1,
+          0,
+          5., -2.,
+          thrust::raw_pointer_cast(baseLineG0.data()),
+          thrust::raw_pointer_cast(baseLineG1.data()),
           thrust::raw_pointer_cast(_nG0.data()),
           thrust::raw_pointer_cast(_nG1.data())
           );
     thrust::pair<thrust::device_vector<float>::iterator, thrust::device_vector<float>::iterator> minmax;
     minmax = thrust::minmax_element(G0.begin(), G0.end());
-    EXPECT_EQ(*minmax.first, -5.);
-    EXPECT_EQ(*minmax.second, -5.);
+    EXPECT_EQ(*minmax.first, 5.);
+    EXPECT_EQ(*minmax.second, 5.);
 
     minmax = thrust::minmax_element(G1.begin(), G1.end());
     EXPECT_EQ(*minmax.first, 42);
@@ -121,54 +134,12 @@ TEST(GatedSpectrometer, GatingKernel)
 
     EXPECT_EQ(_nG0[0], 0u);
     EXPECT_EQ(_nG1[0], G1.size());
+    EXPECT_FLOAT_EQ(baseLineG0[0] / (_nG0[0] + 1E-127), 0.);
+    EXPECT_FLOAT_EQ(baseLineG1[0] / (_nG1[0] + 1E-127), 42.);
   }
 }
 
 
-TEST(GatedSpectrometer, countBitSet) {
-  size_t nBlocks = 100000;
-  int nSideChannels = 4;
-  thrust::device_vector<uint64_t> _sideChannelData(nBlocks * nSideChannels);
-  thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 0L);
-  const uint64_t *sideCD =
-      (uint64_t *)(thrust::raw_pointer_cast(_sideChannelData.data()));
-
-  thrust::device_vector<size_t> res(1);
-
-  // test 1 side channel
-  res[0] = 0;
-  psrdada_cpp::effelsberg::edd::
-      countBitSet<<<1, 1024>>>(
-    sideCD, nBlocks, 0, 1, 0, thrust::raw_pointer_cast(res.data()));
-
-  EXPECT_EQ(res[0], 0u);
-
-  res[0] = 0;
-  thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 1L);
-  psrdada_cpp::effelsberg::edd::countBitSet<<<1, 1024>>>(
-    sideCD, nBlocks, 0, 1, 0, thrust::raw_pointer_cast(res.data()));
-  EXPECT_EQ(res[0], nBlocks);
-
-  // test mult side channels w stride.
-  res[0] = 0;
-
-  thrust::fill(_sideChannelData.begin(), _sideChannelData.end(), 0L);
-  for (size_t i = 2; i < _sideChannelData.size(); i += nSideChannels)
-    _sideChannelData[i] = 1L;
-  psrdada_cpp::effelsberg::edd::countBitSet<<<1, 1024>>>(
-    sideCD, nBlocks, 0, nSideChannels, 3,
-    thrust::raw_pointer_cast(res.data()));
-  cudaDeviceSynchronize();
-  EXPECT_EQ(0U, res[0]);
-
-  res[0] = 0;
-  psrdada_cpp::effelsberg::edd::countBitSet<<<1, 1024>>>(
-    sideCD, nBlocks, 0, nSideChannels, 2,
-    thrust::raw_pointer_cast(res.data()));
-  cudaDeviceSynchronize();
-  EXPECT_EQ(nBlocks, res[0]);
-}
-
 
 TEST(GatedSpectrometer, array_sum) {