From 09928fff24c78ff8615de449a3e452843a717ffe Mon Sep 17 00:00:00 2001
From: Tobias Winchen <tobias.winchen@rwth-aachen.de>
Date: Wed, 20 May 2020 12:18:00 +0200
Subject: [PATCH] Fixed fft_length, improved documentation

---
 .../effelsberg/edd/GatedSpectrometer.cuh      | 324 +++++++-----------
 .../edd/detail/GatedSpectrometer.cu           |  31 +-
 .../effelsberg/edd/src/GatedSpectrometer.cu   | 188 ++++++++--
 3 files changed, 297 insertions(+), 246 deletions(-)

diff --git a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
index 68d4c6e1..4b489f4b 100644
--- a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
+++ b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
@@ -25,7 +25,7 @@ namespace psrdada_cpp {
 namespace effelsberg {
 namespace edd {
 
-
+/// Macro to manipulate single bits of an 64-bit type.
 #define BIT_MASK(bit) (1uL << (bit))
 #define SET_BIT(value, bit) ((value) |= BIT_MASK(bit))
 #define CLEAR_BIT(value, bit) ((value) &= ~BIT_MASK(bit))
@@ -39,12 +39,18 @@ typedef float UnpackedVoltageType;
 typedef float2 ChannelisedVoltageType;
 typedef float IntegratedPowerType;
 
-
-
-/// Input data and intermediate processing data for one polarization
+/**
+ @class PolarizationData
+ @brief Device data arrays for raw voltage input and intermediate processing data for one polarization
+ */
 struct PolarizationData
 {
     size_t _nbits;
+    /**
+    * @brief      Constructor
+    *
+    * @param      nbits Bit-depth of the input data.
+    */
     PolarizationData(size_t nbits): _nbits(nbits) {};
     /// Raw ADC Voltage
     DoubleDeviceBuffer<RawVoltageType> _raw_voltage;
@@ -74,62 +80,116 @@ struct PolarizationData
 };
 
 
-struct SinglePolarizationInput : public PolarizationData
+/**
+ @class SinglePolarizationInput
+ @brief Input data for a buffer containing one polarization
+ */
+class SinglePolarizationInput : public PolarizationData
 {
     DadaBufferLayout _dadaBufferLayout;
     size_t _fft_length;
-    size_t _batch;
 
-    // a buffer contains batch * fft_length samples
-    SinglePolarizationInput(size_t fft_length, size_t _batch, size_t nbits,
+public:
+
+    /**
+    * @brief      Constructor
+    *
+    * @param      fft_length length of the fft.
+    * @param      nbits bit-depth of the input data.
+    * @param      dadaBufferLayout layout of the input dada buffer
+    */
+    SinglePolarizationInput(size_t fft_length, size_t nbits,
             const DadaBufferLayout &dadaBufferLayout);
 
+    /// Copy data from input block to input dubble buffer
     void getFromBlock(RawBytes &block, cudaStream_t &_h2d_stream);
 
-    size_t getSamplesPerInputPolarization()
-    {
-        return _dadaBufferLayout.sizeOfData() * 8 / _nbits;
-    }
+    /// Number of samples per input polarization
+    size_t getSamplesPerInputPolarization();
 };
 
 
-/// Input data and intermediate processing data for two polarizations
-struct DualPolarizationInput
+/**
+ @class SinglePolarizationInput
+ @brief Input data for a buffer containing two polarizations
+ */
+class DualPolarizationInput
 {
     DadaBufferLayout _dadaBufferLayout;
     size_t _fft_length;
-    size_t _batch;
+
+    public:
     PolarizationData polarization0, polarization1;
 
-    DualPolarizationInput(size_t fft_length, size_t batch, size_t nbit, const DadaBufferLayout
+    /**
+    * @brief      Constructor
+    *
+    * @param      fft_length length of the fft.
+    * @param      nbits bit-depth of the input data.
+    * @param      dadaBufferLayout layout of the input dada buffer
+    */
+    DualPolarizationInput(size_t fft_length, size_t nbits, const DadaBufferLayout
             &dadaBufferLayout);
 
+    /// Swaps input buffers for both polarizations
     void swap();
 
+    /// Copy data from input block to input dubble buffer
     void getFromBlock(RawBytes &block, cudaStream_t &_h2d_stream);
 
-    size_t getSamplesPerInputPolarization()
-    {
-        return _dadaBufferLayout.sizeOfData() * 8 / polarization0._nbits / 2;
-    }
+    /// Number of samples per input polarization
+    size_t getSamplesPerInputPolarization();
 };
 
 
 
-/// INterface for the processed output data stream
+
+/**
+ @class PowerSpectrumOutput
+ @brief Output data for one gate, single power spectrum
+ */
+struct PowerSpectrumOutput
+{
+    /**
+    * @brief      Constructor
+    *
+    * @param      size size of the output, i.e. number of channels.
+    * @param      blocks number of blocks in the output.
+    */
+    PowerSpectrumOutput(size_t size, size_t blocks);
+
+    /// spectrum data
+    DoubleDeviceBuffer<IntegratedPowerType> data;
+
+    /// Number of samples integrated in this output block
+    DoubleDeviceBuffer<uint64_cu> _noOfBitSets;
+
+    /// Swap output buffers and reset the buffer in given stream for new integration
+    void swap(cudaStream_t &_proc_stream);
+};
+
+
+/**
+ @class OutputDataStream
+ @brief Interface for the processed output data stream
+ */
 struct OutputDataStream
 {
     size_t _nchans;
     size_t _blocks;
 
+    /**
+    * @brief      Constructor
+    *
+    * @param      nchans number of channels.
+    * @param      blocks number of blocks in the output.
+    */
     OutputDataStream(size_t nchans, size_t blocks) : _nchans(nchans), _blocks(blocks)
     {
     }
 
-    /// Reset output to for new integration
-    virtual void reset(cudaStream_t &_proc_stream) = 0;
     /// Swap output buffers
-    virtual void swap() = 0;
+    virtual void swap(cudaStream_t &_proc_stream) = 0;
 
     // output buffer on the host
     DoublePinnedHostBuffer<char> _host_power;
@@ -140,216 +200,63 @@ struct OutputDataStream
 };
 
 
-// Output data for one gate, single power spectrum
-struct PowerSpectrumOutput
-{
-    PowerSpectrumOutput(size_t size, size_t blocks);
-
-    /// spectrum data
-    DoubleDeviceBuffer<IntegratedPowerType> data;
-
-    /// Number of samples integrated in this output block
-    DoubleDeviceBuffer<uint64_cu> _noOfBitSets;
-
-    /// Reset outptu for new integration
-    void reset(cudaStream_t &_proc_stream);
-
-    /// Swap output buffers
-    void swap();
-};
-
-
+/**
+ @class GatedPowerSpectrumOutput
+ @brief Output Stream for power spectrum output
+ */
 struct GatedPowerSpectrumOutput : public OutputDataStream
 {
-
     GatedPowerSpectrumOutput(size_t nchans, size_t blocks);
 
+    /// Power spectrum for off and on gate
     PowerSpectrumOutput G0, G1;
 
-    void reset(cudaStream_t &_proc_stream);
-
-    /// Swap output buffers
-    void swap();
+    void swap(cudaStream_t &_proc_stream);
 
     void data2Host(cudaStream_t &_d2h_stream);
-    };
+};
 
 
-// Output data for one gate full stokes
+/**
+ @class FullStokesOutput
+ @brief Output data for one gate full stokes
+ */
 struct FullStokesOutput
 {
-    /// Stokes parameters
-    DoubleDeviceBuffer<IntegratedPowerType> I;
-    DoubleDeviceBuffer<IntegratedPowerType> Q;
-    DoubleDeviceBuffer<IntegratedPowerType> U;
-    DoubleDeviceBuffer<IntegratedPowerType> V;
+    /**
+    * @brief      Constructor
+    *
+    * @param      size size of the output, i.e. number of channels.
+    * @param      blocks number of blocks in the output.
+    */
+    FullStokesOutput(size_t size, size_t blocks);
+
+    /// Buffer for Stokes Parameters
+    DoubleDeviceBuffer<IntegratedPowerType> I, Q, U, V;
 
     /// Number of samples integrated in this output block
     DoubleDeviceBuffer<uint64_cu> _noOfBitSets;
 
-    /// Reset outptu for new integration
-    void reset(cudaStream_t &_proc_stream)
-    {
-        thrust::fill(thrust::cuda::par.on(_proc_stream), I.a().begin(), I.a().end(), 0.);
-        thrust::fill(thrust::cuda::par.on(_proc_stream), Q.a().begin(), Q.a().end(), 0.);
-        thrust::fill(thrust::cuda::par.on(_proc_stream), U.a().begin(), U.a().end(), 0.);
-        thrust::fill(thrust::cuda::par.on(_proc_stream), V.a().begin(), V.a().end(), 0.);
-        thrust::fill(thrust::cuda::par.on(_proc_stream), _noOfBitSets.a().begin(), _noOfBitSets.a().end(), 0L);
-    }
-
     /// Swap output buffers
-    void swap()
-    {
-        I.swap();
-        Q.swap();
-        U.swap();
-        V.swap();
-        _noOfBitSets.swap();
-    }
-
-    /// Resize all buffers
-    FullStokesOutput(size_t size, size_t blocks)
-    {
-        I.resize(size * blocks);
-        Q.resize(size * blocks);
-        U.resize(size * blocks);
-        V.resize(size * blocks);
-        _noOfBitSets.resize(blocks);
-    }
+    void swap(cudaStream_t &_proc_stream);
 };
 
 
+/**
+ @class GatedPowerSpectrumOutput
+ @brief Output Stream for power spectrum output
+ */
 struct GatedFullStokesOutput: public OutputDataStream
 {
-
-    GatedFullStokesOutput(size_t nchans, size_t blocks): OutputDataStream(nchans, blocks / 2), G0(nchans, blocks / 2),
-G1(nchans, blocks / 2)
-    {
-        BOOST_LOG_TRIVIAL(debug) << "Output with " << _blocks << " blocks a " << _nchans << " channels";
-        _host_power.resize( 8 * ( _nchans * sizeof(IntegratedPowerType) + sizeof(size_t) ) * _blocks);
-        BOOST_LOG_TRIVIAL(debug) << "Allocated " << _host_power.size() << " bytes.";
-    };
-
+    /// stokes output for on/off gate
     FullStokesOutput G0, G1;
-     void reset(cudaStream_t &_proc_stream)
-    {
-        G0.reset(_proc_stream);
-        G1.reset(_proc_stream);
-    }
-
-    /// Swap output buffers
-    void swap()
-    {
-        G0.swap();
-        G1.swap();
-        _host_power.swap();
-    }
-
-
-
-    void data2Host(cudaStream_t &_d2h_stream)
-    {
-    for (size_t i = 0; i < G0._noOfBitSets.size(); i++)
-    {
-        size_t memslicesize = (_nchans * sizeof(IntegratedPowerType));
-        size_t memOffset = 8 * i * (memslicesize + sizeof(size_t));
-        // Copy  II QQ UU VV
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset) ,
-                            static_cast<void *>(G0.I.b_ptr() + i * _nchans),
-                            _nchans * sizeof(IntegratedPowerType),
-                            cudaMemcpyDeviceToHost, _d2h_stream));
-
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 1 * memslicesize) ,
-                            static_cast<void *>(G1.I.b_ptr() + i * _nchans),
-                            _nchans * sizeof(IntegratedPowerType),
-                            cudaMemcpyDeviceToHost, _d2h_stream));
-
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 2 * memslicesize) ,
-                            static_cast<void *>(G0.Q.b_ptr() + i * _nchans),
-                            _nchans * sizeof(IntegratedPowerType),
-                            cudaMemcpyDeviceToHost, _d2h_stream));
-
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 3 * memslicesize) ,
-                            static_cast<void *>(G1.Q.b_ptr() + i * _nchans),
-                            _nchans * sizeof(IntegratedPowerType),
-                            cudaMemcpyDeviceToHost, _d2h_stream));
-
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 4 * memslicesize) ,
-                            static_cast<void *>(G0.U.b_ptr() + i * _nchans),
-                            _nchans * sizeof(IntegratedPowerType),
-                            cudaMemcpyDeviceToHost, _d2h_stream));
-
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 5 * memslicesize) ,
-                            static_cast<void *>(G1.U.b_ptr() + i * _nchans),
-                            _nchans * sizeof(IntegratedPowerType),
-                            cudaMemcpyDeviceToHost, _d2h_stream));
-
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 6 * memslicesize) ,
-                            static_cast<void *>(G0.V.b_ptr() + i * _nchans),
-                            _nchans * sizeof(IntegratedPowerType),
-                            cudaMemcpyDeviceToHost, _d2h_stream));
-
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 7 * memslicesize) ,
-                            static_cast<void *>(G1.V.b_ptr() + i * _nchans),
-                            _nchans * sizeof(IntegratedPowerType),
-                            cudaMemcpyDeviceToHost, _d2h_stream));
-
-        // Copy SCI
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize),
-              static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
-                1 * sizeof(size_t),
-                cudaMemcpyDeviceToHost, _d2h_stream));
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 1 * sizeof(size_t)),
-              static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
-                1 * sizeof(size_t),
-                cudaMemcpyDeviceToHost, _d2h_stream));
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 2 * sizeof(size_t)),
-              static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
-                1 * sizeof(size_t),
-                cudaMemcpyDeviceToHost, _d2h_stream));
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 3 * sizeof(size_t)),
-              static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
-                1 * sizeof(size_t),
-                cudaMemcpyDeviceToHost, _d2h_stream));
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 4 * sizeof(size_t)),
-              static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
-                1 * sizeof(size_t),
-                cudaMemcpyDeviceToHost, _d2h_stream));
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 5 * sizeof(size_t)),
-              static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
-                1 * sizeof(size_t),
-                cudaMemcpyDeviceToHost, _d2h_stream));
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 6 * sizeof(size_t)),
-              static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
-                1 * sizeof(size_t),
-                cudaMemcpyDeviceToHost, _d2h_stream));
-        CUDA_ERROR_CHECK(
-            cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 7 * sizeof(size_t)),
-              static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
-                1 * sizeof(size_t),
-                cudaMemcpyDeviceToHost, _d2h_stream));
-
-      }
 
+    GatedFullStokesOutput(size_t nchans, size_t blocks);
 
+    /// Swap output buffers
+    void swap(cudaStream_t &_proc_stream);
 
-    }
-
+    void data2Host(cudaStream_t &_d2h_stream);
 };
 
 
@@ -359,7 +266,6 @@ G1(nchans, blocks / 2)
  @class GatedSpectrometer
  @brief Split data into two streams and create integrated spectra depending on
  bit set in side channel data.
-
  */
 template <class HandlerType,
          class InputType,
diff --git a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
index d3df8cbd..673dae75 100644
--- a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
+++ b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
@@ -108,19 +108,7 @@ GatedSpectrometer<HandlerType, InputType, OutputType>::GatedSpectrometer(
       << "Correction factors for 8-bit conversion: offset = " << offset
       << ", scaling = " << scaling;
 
-  // plan the FFT
-  size_t nsamps_per_buffer = _dadaBufferLayout.sizeOfData() * 8 / nbits;
-  int batch = nsamps_per_buffer / _fft_length;
-  int n[] = {static_cast<int>(_fft_length)};
-  BOOST_LOG_TRIVIAL(debug) << "Generating FFT plan: \n"
-      << "   fft_length = " << _fft_length << "\n"
-      << "   n[0] = " << n[0] << "\n"
-      << "   _nchans = " << _nchans << "\n"
-      << "   batch = " << batch << "\n";
-  CUFFT_ERROR_CHECK(cufftPlanMany(&_fft_plan, 1, n, NULL, 1, _fft_length, NULL,
-                                  1, _nchans, CUFFT_R2C, batch));
-
-  inputDataStream = new InputType(fft_length, batch, nbits, _dadaBufferLayout);
+  inputDataStream = new InputType(fft_length, nbits, _dadaBufferLayout);
 
   //How many output spectra per input block?
   size_t nsamps_per_output_spectra = fft_length * naccumulate;
@@ -143,6 +131,20 @@ GatedSpectrometer<HandlerType, InputType, OutputType>::GatedSpectrometer(
   BOOST_LOG_TRIVIAL(debug) << "Integrating  " << nsamps_per_output_spectra <<
       " samples from " << _nBlocks << "blocks into one output spectrum.";
 
+  // plan the FFT
+  size_t nsamps_per_buffer = _dadaBufferLayout.sizeOfData() * 8 / nbits;
+  int batch = nsamps_per_pol / _fft_length;
+  int n[] = {static_cast<int>(_fft_length)};
+  BOOST_LOG_TRIVIAL(debug) << "Generating FFT plan: \n"
+      << "   fft_length = " << _fft_length << "\n"
+      << "   n[0] = " << n[0] << "\n"
+      << "   _nchans = " << _nchans << "\n"
+      << "   batch = " << batch << "\n";
+  CUFFT_ERROR_CHECK(cufftPlanMany(&_fft_plan, 1, n, NULL, 1, _fft_length, NULL,
+                                  1, _nchans, CUFFT_R2C, batch));
+
+
+
   // We unpack one pol at a time
   _unpacked_voltage_G0.resize(nsamps_per_pol);
   _unpacked_voltage_G1.resize(nsamps_per_pol);
@@ -328,8 +330,7 @@ bool GatedSpectrometer<HandlerType, InputType, OutputType>::operator()(RawBytes
   {
     BOOST_LOG_TRIVIAL(debug) << "Starting new output block.";
     CUDA_ERROR_CHECK(cudaStreamSynchronize(_d2h_stream));
-    outputDataStream->swap();
-    outputDataStream->reset(_proc_stream);
+    outputDataStream->swap(_proc_stream);
   }
 
   BOOST_LOG_TRIVIAL(debug) << "Processing block.";
diff --git a/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer.cu b/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer.cu
index 5924c3fb..4ca8fde5 100644
--- a/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer.cu
+++ b/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer.cu
@@ -185,13 +185,23 @@ void PolarizationData::resize(size_t rawVolttageBufferBytes, size_t nsidechannel
 }
 
 
-SinglePolarizationInput::SinglePolarizationInput(size_t fft_length, size_t batch, size_t nbits, const DadaBufferLayout
-        &dadaBufferLayout) : PolarizationData(nbits), _fft_length(fft_length), _batch(batch), _dadaBufferLayout(dadaBufferLayout)
+SinglePolarizationInput::SinglePolarizationInput(size_t fft_length, size_t nbits, const DadaBufferLayout
+        &dadaBufferLayout) : PolarizationData(nbits), _fft_length(fft_length), _dadaBufferLayout(dadaBufferLayout)
 {
+
+  size_t nsamps_per_buffer = _dadaBufferLayout.sizeOfData() * 8 / nbits;
+  size_t _batch = nsamps_per_buffer / _fft_length;
+
     resize(_dadaBufferLayout.sizeOfData(), _dadaBufferLayout.getNSideChannels() * _dadaBufferLayout.getNHeaps(), (_fft_length / 2 + 1) * _batch);
 };
 
 
+size_t SinglePolarizationInput::getSamplesPerInputPolarization()
+{
+    return _dadaBufferLayout.sizeOfData() * 8 / _nbits;
+}
+
+
 void PolarizationData::swap()
 {
     _raw_voltage.swap();
@@ -220,13 +230,16 @@ void SinglePolarizationInput::getFromBlock(RawBytes &block, cudaStream_t &_h2d_s
 }
 
 
-DualPolarizationInput::DualPolarizationInput(size_t fft_length, size_t batch, size_t nbits, const DadaBufferLayout
+DualPolarizationInput::DualPolarizationInput(size_t fft_length, size_t nbits, const DadaBufferLayout
         &dadaBufferLayout) : _fft_length(fft_length),
-    _batch(batch),
     polarization0(nbits),
     polarization1(nbits),
     _dadaBufferLayout(dadaBufferLayout)
 {
+
+  size_t nsamps_per_buffer = _dadaBufferLayout.sizeOfData() * 8 / nbits;
+  size_t _batch = nsamps_per_buffer / _fft_length / 2;
+
     polarization0.resize(_dadaBufferLayout.sizeOfData() / 2, _dadaBufferLayout.getNSideChannels() * _dadaBufferLayout.getNHeaps() / 2, (_fft_length / 2 + 1) * _batch);
     polarization1.resize(_dadaBufferLayout.sizeOfData() / 2, _dadaBufferLayout.getNSideChannels() * _dadaBufferLayout.getNHeaps() / 2, (_fft_length / 2 + 1) * _batch);
 };
@@ -239,6 +252,12 @@ void DualPolarizationInput::swap()
 }
 
 
+size_t DualPolarizationInput::getSamplesPerInputPolarization()
+{
+    return _dadaBufferLayout.sizeOfData() * 8 / polarization0._nbits / 2;
+}
+
+
 void DualPolarizationInput::getFromBlock(RawBytes &block, cudaStream_t &_h2d_stream)
 {
 // Copy the data with stride to the GPU:
@@ -296,17 +315,12 @@ PowerSpectrumOutput::PowerSpectrumOutput(size_t size, size_t blocks)
 }
 
 
-void PowerSpectrumOutput::reset(cudaStream_t &_proc_stream)
-{
-    thrust::fill(thrust::cuda::par.on(_proc_stream), data.a().begin(), data.a().end(), 0.);
-    thrust::fill(thrust::cuda::par.on(_proc_stream), _noOfBitSets.a().begin(), _noOfBitSets.a().end(), 0L);
-}
-
-
-void PowerSpectrumOutput::swap()
+void PowerSpectrumOutput::swap(cudaStream_t &_proc_stream)
 {
     data.swap();
     _noOfBitSets.swap();
+    thrust::fill(thrust::cuda::par.on(_proc_stream), data.a().begin(), data.a().end(), 0.);
+    thrust::fill(thrust::cuda::par.on(_proc_stream), _noOfBitSets.a().begin(), _noOfBitSets.a().end(), 0L);
 }
 
 
@@ -320,18 +334,11 @@ G1(nchans, blocks)
 }
 
 
-void GatedPowerSpectrumOutput::reset(cudaStream_t &_proc_stream)
-{
-    G0.reset(_proc_stream);
-    G1.reset(_proc_stream);
-}
-
-
 /// Swap output buffers
-void GatedPowerSpectrumOutput::swap()
+void GatedPowerSpectrumOutput::swap(cudaStream_t &_proc_stream)
 {
-    G0.swap();
-    G1.swap();
+    G0.swap(_proc_stream);
+    G1.swap(_proc_stream);
     _host_power.swap();
 }
 
@@ -377,10 +384,147 @@ void GatedPowerSpectrumOutput::data2Host(cudaStream_t &_d2h_stream)
 }
 
 
+void FullStokesOutput::swap(cudaStream_t &_proc_stream)
+{
+    I.swap();
+    Q.swap();
+    U.swap();
+    V.swap();
+    _noOfBitSets.swap();
+    thrust::fill(thrust::cuda::par.on(_proc_stream), I.a().begin(), I.a().end(), 0.);
+    thrust::fill(thrust::cuda::par.on(_proc_stream), Q.a().begin(), Q.a().end(), 0.);
+    thrust::fill(thrust::cuda::par.on(_proc_stream), U.a().begin(), U.a().end(), 0.);
+    thrust::fill(thrust::cuda::par.on(_proc_stream), V.a().begin(), V.a().end(), 0.);
+    thrust::fill(thrust::cuda::par.on(_proc_stream), _noOfBitSets.a().begin(), _noOfBitSets.a().end(), 0L);
+}
+
+
+FullStokesOutput::FullStokesOutput(size_t size, size_t blocks)
+{
+    I.resize(size * blocks);
+    Q.resize(size * blocks);
+    U.resize(size * blocks);
+    V.resize(size * blocks);
+    _noOfBitSets.resize(blocks);
+}
+
+
+
+GatedFullStokesOutput::GatedFullStokesOutput(size_t nchans, size_t blocks): OutputDataStream(nchans, blocks), G0(nchans, blocks),
+G1(nchans, blocks)
+{
+    BOOST_LOG_TRIVIAL(debug) << "Output with " << _blocks << " blocks a " << _nchans << " channels";
+    _host_power.resize( 8 * ( _nchans * sizeof(IntegratedPowerType) + sizeof(size_t) ) * _blocks);
+    BOOST_LOG_TRIVIAL(debug) << "Allocated " << _host_power.size() << " bytes.";
+};
+
+
+void GatedFullStokesOutput::swap(cudaStream_t &_proc_stream)
+{
+    G0.swap(_proc_stream);
+    G1.swap(_proc_stream);
+    _host_power.swap();
+}
+
+
+void GatedFullStokesOutput::data2Host(cudaStream_t &_d2h_stream)
+{
+for (size_t i = 0; i < G0._noOfBitSets.size(); i++)
+{
+    size_t memslicesize = (_nchans * sizeof(IntegratedPowerType));
+    size_t memOffset = 8 * i * (memslicesize + sizeof(size_t));
+    // Copy  II QQ UU VV
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset) ,
+                        static_cast<void *>(G0.I.b_ptr() + i * _nchans),
+                        _nchans * sizeof(IntegratedPowerType),
+                        cudaMemcpyDeviceToHost, _d2h_stream));
+
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 1 * memslicesize) ,
+                        static_cast<void *>(G1.I.b_ptr() + i * _nchans),
+                        _nchans * sizeof(IntegratedPowerType),
+                        cudaMemcpyDeviceToHost, _d2h_stream));
+
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 2 * memslicesize) ,
+                        static_cast<void *>(G0.Q.b_ptr() + i * _nchans),
+                        _nchans * sizeof(IntegratedPowerType),
+                        cudaMemcpyDeviceToHost, _d2h_stream));
+
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 3 * memslicesize) ,
+                        static_cast<void *>(G1.Q.b_ptr() + i * _nchans),
+                        _nchans * sizeof(IntegratedPowerType),
+                        cudaMemcpyDeviceToHost, _d2h_stream));
+
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 4 * memslicesize) ,
+                        static_cast<void *>(G0.U.b_ptr() + i * _nchans),
+                        _nchans * sizeof(IntegratedPowerType),
+                        cudaMemcpyDeviceToHost, _d2h_stream));
 
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 5 * memslicesize) ,
+                        static_cast<void *>(G1.U.b_ptr() + i * _nchans),
+                        _nchans * sizeof(IntegratedPowerType),
+                        cudaMemcpyDeviceToHost, _d2h_stream));
 
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 6 * memslicesize) ,
+                        static_cast<void *>(G0.V.b_ptr() + i * _nchans),
+                        _nchans * sizeof(IntegratedPowerType),
+                        cudaMemcpyDeviceToHost, _d2h_stream));
 
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 7 * memslicesize) ,
+                        static_cast<void *>(G1.V.b_ptr() + i * _nchans),
+                        _nchans * sizeof(IntegratedPowerType),
+                        cudaMemcpyDeviceToHost, _d2h_stream));
 
+    // Copy SCI
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize),
+          static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
+            1 * sizeof(size_t),
+            cudaMemcpyDeviceToHost, _d2h_stream));
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 1 * sizeof(size_t)),
+          static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
+            1 * sizeof(size_t),
+            cudaMemcpyDeviceToHost, _d2h_stream));
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 2 * sizeof(size_t)),
+          static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
+            1 * sizeof(size_t),
+            cudaMemcpyDeviceToHost, _d2h_stream));
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 3 * sizeof(size_t)),
+          static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
+            1 * sizeof(size_t),
+            cudaMemcpyDeviceToHost, _d2h_stream));
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 4 * sizeof(size_t)),
+          static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
+            1 * sizeof(size_t),
+            cudaMemcpyDeviceToHost, _d2h_stream));
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 5 * sizeof(size_t)),
+          static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
+            1 * sizeof(size_t),
+            cudaMemcpyDeviceToHost, _d2h_stream));
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 6 * sizeof(size_t)),
+          static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
+            1 * sizeof(size_t),
+            cudaMemcpyDeviceToHost, _d2h_stream));
+    CUDA_ERROR_CHECK(
+        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 7 * sizeof(size_t)),
+          static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
+            1 * sizeof(size_t),
+            cudaMemcpyDeviceToHost, _d2h_stream));
+  }
+}
 
 
 }}} // namespace
-- 
GitLab