diff --git a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
index d5efa88b96c9439bb836e0841b3c74b16abe39e9..c95b3e0ca5dec13c062d9cf55350609bdb5ec62f 100644
--- a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
+++ b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
@@ -14,7 +14,7 @@ namespace kernels {
 template <typename T>
 __global__
 void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out,
-    int nchans, int nsamps, int naccumulate, float scale, float offset)
+    int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
 {
     // grid stride loop over output array to keep 
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
@@ -30,7 +30,8 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
         double y = tmp.y * tmp.y;
         sum += x + y;
       }
-      out[i] = (int8_t) ((sum - offset)/scale);
+      size_t toff = out_offset * nchans + currentOutputSpectra * nchans; 
+      out[toff + i] = (int8_t) ((sum - offset)/scale);
     }
 
 }
@@ -39,7 +40,7 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
 template <typename T>
 __global__
 void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
-    int nchans, int nsamps, int naccumulate, float scale, float offset)
+    int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
 {
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
     {
@@ -54,7 +55,8 @@ void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ ou
         double y = tmp.y * tmp.y;
         sum += x + y;
       }
-      out[i] = sum;
+      size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
+      out[i + toff] = sum;
     }
 }
 
@@ -90,15 +92,17 @@ public:
 
   }
 
-  void detect(InputType const& input, OutputType& output)
+  // stride sets an offset of _nChans * stride to the detection in the output
+  // to allow multiple spectra in one output
+  void detect(InputType const& input, OutputType& output, int stride = 0, int stoff = 0)
   {
       assert(input.size() % (_nchans * _tscrunch) == 0 /* Input is not a multiple of _nchans * _tscrunch*/);
-      output.resize(input.size()/_tscrunch);
+      //output.resize(input.size()/_tscrunch);
       int nsamps = input.size() / _nchans;
       float2 const* input_ptr = thrust::raw_pointer_cast(input.data());
       T * output_ptr = thrust::raw_pointer_cast(output.data());
       kernels::detect_and_accumulate<T> <<<1024, 1024, 0, _stream>>>(
-          input_ptr, output_ptr, _nchans, nsamps, _tscrunch, _scale, _offset);
+          input_ptr, output_ptr, _nchans, nsamps, _tscrunch, _scale, _offset, stride, stoff);
   }
 
 
diff --git a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
index 1d51268e55b1a3ea2ca9f5d4fbe01a28df4a7b1b..6e9cf4dd559b39320ad7349f3abd2c3ddde76c90 100644
--- a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
+++ b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
@@ -89,8 +89,7 @@ public:
 private:
   void process(thrust::device_vector<RawVoltageType> const &digitiser_raw,
                thrust::device_vector<int64_t> const &sideChannelData,
-               thrust::device_vector<IntegratedPowerType> &detected_G0,
-               thrust::device_vector<IntegratedPowerType> &detected_G1,
+               thrust::device_vector<IntegratedPowerType> &detected,
                thrust::device_vector<size_t> &noOfBitSet);
 
 private:
@@ -116,8 +115,7 @@ private:
   std::unique_ptr<DetectorAccumulator<IntegratedPowerType> > _detector;
 
   DoubleDeviceBuffer<RawVoltageType> _raw_voltage_db;
-  DoubleDeviceBuffer<IntegratedPowerType> _power_db_G0;
-  DoubleDeviceBuffer<IntegratedPowerType> _power_db_G1;
+  DoubleDeviceBuffer<IntegratedPowerType> _power_db;
   DoubleDeviceBuffer<int64_t> _sideChannelData_db;
   DoubleDeviceBuffer<size_t> _noOfBitSetsInSideChannel;
   size_t _noOfBitSetsInSideChannel_host [2];
diff --git a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
index e9f3c82ec181232cf75566e6eabbd09b96337736..56bb43b813e75eb0b2a624227b49052bdfefdf97 100644
--- a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
+++ b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
@@ -180,12 +180,10 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer(
   _channelised_voltage.resize(_nchans * batch);
   BOOST_LOG_TRIVIAL(debug) << "  Channelised voltages size: "
                            << _channelised_voltage.size();
-  _power_db_G0.resize(_nchans * batch / _naccumulate);
-  _power_db_G1.resize(_nchans * batch / _naccumulate);
-  BOOST_LOG_TRIVIAL(debug) << "  Powers size: " << _power_db_G0.size() << ", "
-                           << _power_db_G1.size();
+  _power_db.resize(_nchans * batch / _naccumulate * 2);  // hold on and off spectra to simplify output
+  BOOST_LOG_TRIVIAL(debug) << "  Powers size: " << _power_db.size() / 2; 
   // on the host both power are stored in the same data buffer
-  _host_power_db.resize( _power_db_G0.size() + _power_db_G1 .size());
+  _host_power_db.resize( _power_db.size());
   _noOfBitSetsInSideChannel.resize(1);
 
   CUDA_ERROR_CHECK(cudaStreamCreate(&_h2d_stream));
@@ -244,8 +242,7 @@ template <class HandlerType, typename IntegratedPowerType>
 void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
     thrust::device_vector<RawVoltageType> const &digitiser_raw,
     thrust::device_vector<int64_t> const &sideChannelData,
-    thrust::device_vector<IntegratedPowerType> &detected_G0,
-    thrust::device_vector<IntegratedPowerType> &detected_G1, thrust::device_vector<size_t> &noOfBitSet) {
+    thrust::device_vector<IntegratedPowerType> &detected, thrust::device_vector<size_t> &noOfBitSet) {
   BOOST_LOG_TRIVIAL(debug) << "Unpacking raw voltages";
   switch (_nbits) {
   case 8:
@@ -282,7 +279,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
       thrust::raw_pointer_cast(_channelised_voltage.data());
   CUFFT_ERROR_CHECK(cufftExecR2C(_fft_plan, (cufftReal *)_unpacked_voltage_ptr,
                                  (cufftComplex *)_channelised_voltage_ptr));
-  _detector->detect(_channelised_voltage, detected_G0);
+  _detector->detect(_channelised_voltage, detected, 2, 0);
 
   BOOST_LOG_TRIVIAL(debug) << "Performing FFT 2";
   _unpacked_voltage_ptr = thrust::raw_pointer_cast(_unpacked_voltage_G1.data());
@@ -290,7 +287,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
                                  (cufftComplex *)_channelised_voltage_ptr));
 
   CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
-  _detector->detect(_channelised_voltage, detected_G1);
+  _detector->detect(_channelised_voltage, detected, 2, 1);
   CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
   BOOST_LOG_TRIVIAL(debug) << "Exit processing";
 } // process
@@ -331,12 +328,10 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
   }
 
   // Synchronize all streams
-  _power_db_G0.swap();
-  _power_db_G1.swap();
+  _power_db.swap();
   _noOfBitSetsInSideChannel.swap();
 
-  process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db_G0.a(),
-          _power_db_G1.a(), _noOfBitSetsInSideChannel.a());
+  process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db.a(), _noOfBitSetsInSideChannel.a());
 
   // signal that data block has been processed
   CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
@@ -350,15 +345,15 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
   std::swap(_noOfBitSetsInSideChannel_host[0], _noOfBitSetsInSideChannel_host[1]);
   CUDA_ERROR_CHECK(
       cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr()),
-                      static_cast<void *>(_power_db_G0.b_ptr()),
-                      _power_db_G0.size() * sizeof(IntegratedPowerType),
+                      static_cast<void *>(_power_db.b_ptr()),
+                      _power_db.size() * sizeof(IntegratedPowerType),
                       cudaMemcpyDeviceToHost, _d2h_stream));
-  CUDA_ERROR_CHECK(cudaMemcpyAsync(
-      static_cast<void *>(_host_power_db.a_ptr() +
-                          (_power_db_G0.size())),           // as I am adding BEFORE the cast to void, I dont need the sizeof
-      static_cast<void *>(_power_db_G1.b_ptr()),
-      _power_db_G1.size() * sizeof(IntegratedPowerType), cudaMemcpyDeviceToHost,
-      _d2h_stream));
+//  CUDA_ERROR_CHECK(cudaMemcpyAsync(
+//        static_cast<void *>(_host_power_db.a_ptr() +
+//                          (_power_db_G0.size())),           // as I am adding BEFORE the cast to void, I dont need the sizeof
+//      static_cast<void *>(_power_db_G1.b_ptr()),
+//      _power_db_G1.size() * sizeof(IntegratedPowerType), cudaMemcpyDeviceToHost,
+//      _d2h_stream));
 
   CUDA_ERROR_CHECK(cudaMemcpyAsync(static_cast<void *>(&_noOfBitSetsInSideChannel_host[0]),
         static_cast<void *>(_noOfBitSetsInSideChannel.b_ptr()),