Alternate on / off spectra in output

e3edc797 · Tobias Winchen · c722b311 · e3edc797 · e3edc797 · e3edc797
Commit e3edc797 authored Apr 15, 2019 by Tobias Winchen
--- a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
+++ b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
@@ -14,7 +14,7 @@ namespace kernels {
 template <typename T>
 __global__
 void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out,
-    int nchans, int nsamps, int naccumulate, float scale, float offset)
+    int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
 {
    // grid stride loop over output array to keep 
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
@@ -30,7 +30,8 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
        double y = tmp.y * tmp.y;
        sum += x + y;
      }
-      out[i] = (int8_t) ((sum - offset)/scale);
+      size_t toff = out_offset * nchans + currentOutputSpectra * nchans; 
+      out[toff + i] = (int8_t) ((sum - offset)/scale);
    }
 }
@@ -39,7 +40,7 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
 template <typename T>
 __global__
 void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
-    int nchans, int nsamps, int naccumulate, float scale, float offset)
+    int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
 {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
    {
@@ -54,7 +55,8 @@ void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ ou
        double y = tmp.y * tmp.y;
        sum += x + y;
      }
-      out[i] = sum;
+      size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
+      out[i + toff] = sum;
    }
 }
@@ -90,15 +92,17 @@ public:
  }
-  void detect(InputType const& input, OutputType& output)
+  // stride sets an offset of _nChans * stride to the detection in the output
+  // to allow multiple spectra in one output
+  void detect(InputType const& input, OutputType& output, int stride = 0, int stoff = 0)
  {
      assert(input.size() % (_nchans * _tscrunch) == 0 /* Input is not a multiple of _nchans * _tscrunch*/);
-      output.resize(input.size()/_tscrunch);
+      //output.resize(input.size()/_tscrunch);
      int nsamps = input.size() / _nchans;
      float2 const* input_ptr = thrust::raw_pointer_cast(input.data());
      T * output_ptr = thrust::raw_pointer_cast(output.data());
      kernels::detect_and_accumulate<T> <<<1024, 1024, 0, _stream>>>(
-          input_ptr, output_ptr, _nchans, nsamps, _tscrunch, _scale, _offset);
+          input_ptr, output_ptr, _nchans, nsamps, _tscrunch, _scale, _offset, stride, stoff);
  }


--- a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
+++ b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
@@ -89,8 +89,7 @@ public:
 private:
  void process(thrust::device_vector<RawVoltageType> const &digitiser_raw,
               thrust::device_vector<int64_t> const &sideChannelData,
-               thrust::device_vector<IntegratedPowerType> &detected_G0,
+               thrust::device_vector<IntegratedPowerType> &detected,
-               thrust::device_vector<IntegratedPowerType> &detected_G1,
               thrust::device_vector<size_t> &noOfBitSet);
 private:
@@ -116,8 +115,7 @@ private:
  std::unique_ptr<DetectorAccumulator<IntegratedPowerType> > _detector;
  DoubleDeviceBuffer<RawVoltageType> _raw_voltage_db;
-  DoubleDeviceBuffer<IntegratedPowerType> _power_db_G0;
+  DoubleDeviceBuffer<IntegratedPowerType> _power_db;
-  DoubleDeviceBuffer<IntegratedPowerType> _power_db_G1;
  DoubleDeviceBuffer<int64_t> _sideChannelData_db;
  DoubleDeviceBuffer<size_t> _noOfBitSetsInSideChannel;
  size_t _noOfBitSetsInSideChannel_host [2];


--- a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
+++ b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
@@ -180,12 +180,10 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer(
  _channelised_voltage.resize(_nchans * batch);
  BOOST_LOG_TRIVIAL(debug) << "  Channelised voltages size: "
                           << _channelised_voltage.size();
-  _power_db_G0.resize(_nchans * batch / _naccumulate);
+  _power_db.resize(_nchans * batch / _naccumulate * 2);  // hold on and off spectra to simplify output
-  _power_db_G1.resize(_nchans * batch / _naccumulate);
+  BOOST_LOG_TRIVIAL(debug) << "  Powers size: " << _power_db.size() / 2; 
-  BOOST_LOG_TRIVIAL(debug) << "  Powers size: " << _power_db_G0.size() << ", "
-                           << _power_db_G1.size();
  // on the host both power are stored in the same data buffer
-  _host_power_db.resize( _power_db_G0.size() + _power_db_G1 .size());
+  _host_power_db.resize( _power_db.size());
  _noOfBitSetsInSideChannel.resize(1);
  CUDA_ERROR_CHECK(cudaStreamCreate(&_h2d_stream));
@@ -244,8 +242,7 @@ template <class HandlerType, typename IntegratedPowerType>
 void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
    thrust::device_vector<RawVoltageType> const &digitiser_raw,
    thrust::device_vector<int64_t> const &sideChannelData,
-    thrust::device_vector<IntegratedPowerType> &detected_G0,
+    thrust::device_vector<IntegratedPowerType> &detected, thrust::device_vector<size_t> &noOfBitSet) {
-    thrust::device_vector<IntegratedPowerType> &detected_G1, thrust::device_vector<size_t> &noOfBitSet) {
  BOOST_LOG_TRIVIAL(debug) << "Unpacking raw voltages";
  switch (_nbits) {
  case 8:
@@ -282,7 +279,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
      thrust::raw_pointer_cast(_channelised_voltage.data());
  CUFFT_ERROR_CHECK(cufftExecR2C(_fft_plan, (cufftReal *)_unpacked_voltage_ptr,
                                 (cufftComplex *)_channelised_voltage_ptr));
-  _detector->detect(_channelised_voltage, detected_G0);
+  _detector->detect(_channelised_voltage, detected, 2, 0);
  BOOST_LOG_TRIVIAL(debug) << "Performing FFT 2";
  _unpacked_voltage_ptr = thrust::raw_pointer_cast(_unpacked_voltage_G1.data());
@@ -290,7 +287,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
                                 (cufftComplex *)_channelised_voltage_ptr));
  CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
-  _detector->detect(_channelised_voltage, detected_G1);
+  _detector->detect(_channelised_voltage, detected, 2, 1);
  CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
  BOOST_LOG_TRIVIAL(debug) << "Exit processing";
 } // process
@@ -331,12 +328,10 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
  }
  // Synchronize all streams
-  _power_db_G0.swap();
+  _power_db.swap();
-  _power_db_G1.swap();
  _noOfBitSetsInSideChannel.swap();
-  process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db_G0.a(),
+  process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db.a(), _noOfBitSetsInSideChannel.a());
-          _power_db_G1.a(), _noOfBitSetsInSideChannel.a());
  // signal that data block has been processed
  CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
@@ -350,15 +345,15 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
  std::swap(_noOfBitSetsInSideChannel_host[0], _noOfBitSetsInSideChannel_host[1]);
  CUDA_ERROR_CHECK(
      cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr()),
-                      static_cast<void *>(_power_db_G0.b_ptr()),
+                      static_cast<void *>(_power_db.b_ptr()),
-                      _power_db_G0.size() * sizeof(IntegratedPowerType),
+                      _power_db.size() * sizeof(IntegratedPowerType),
                      cudaMemcpyDeviceToHost, _d2h_stream));
-  CUDA_ERROR_CHECK(cudaMemcpyAsync(
+//  CUDA_ERROR_CHECK(cudaMemcpyAsync(
-      static_cast<void *>(_host_power_db.a_ptr() +
+//        static_cast<void *>(_host_power_db.a_ptr() +
-                          (_power_db_G0.size())),           // as I am adding BEFORE the cast to void, I dont need the sizeof
+//                          (_power_db_G0.size())),           // as I am adding BEFORE the cast to void, I dont need the sizeof
-      static_cast<void *>(_power_db_G1.b_ptr()),
+//      static_cast<void *>(_power_db_G1.b_ptr()),
-      _power_db_G1.size() * sizeof(IntegratedPowerType), cudaMemcpyDeviceToHost,
+//      _power_db_G1.size() * sizeof(IntegratedPowerType), cudaMemcpyDeviceToHost,
-      _d2h_stream));
+//      _d2h_stream));
  CUDA_ERROR_CHECK(cudaMemcpyAsync(static_cast<void *>(&_noOfBitSetsInSideChannel_host[0]),
        static_cast<void *>(_noOfBitSetsInSideChannel.b_ptr()),