diff --git a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh index d5efa88b96c9439bb836e0841b3c74b16abe39e9..c95b3e0ca5dec13c062d9cf55350609bdb5ec62f 100644 --- a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh +++ b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh @@ -14,7 +14,7 @@ namespace kernels { template <typename T> __global__ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out, - int nchans, int nsamps, int naccumulate, float scale, float offset) + int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset) { // grid stride loop over output array to keep for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x) @@ -30,7 +30,8 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o double y = tmp.y * tmp.y; sum += x + y; } - out[i] = (int8_t) ((sum - offset)/scale); + size_t toff = out_offset * nchans + currentOutputSpectra * nchans; + out[toff + i] = (int8_t) ((sum - offset)/scale); } } @@ -39,7 +40,7 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o template <typename T> __global__ void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out, - int nchans, int nsamps, int naccumulate, float scale, float offset) + int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x) { @@ -54,7 +55,8 @@ void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ ou double y = tmp.y * tmp.y; sum += x + y; } - out[i] = sum; + size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride; + out[i + toff] = sum; } } @@ -90,15 +92,17 @@ public: } - void detect(InputType const& input, OutputType& output) + // stride sets an offset of _nChans * stride to the detection in the output + // to allow multiple spectra in one output + void detect(InputType const& input, OutputType& output, int stride = 0, int stoff = 0) { assert(input.size() % (_nchans * _tscrunch) == 0 /* Input is not a multiple of _nchans * _tscrunch*/); - output.resize(input.size()/_tscrunch); + //output.resize(input.size()/_tscrunch); int nsamps = input.size() / _nchans; float2 const* input_ptr = thrust::raw_pointer_cast(input.data()); T * output_ptr = thrust::raw_pointer_cast(output.data()); kernels::detect_and_accumulate<T> <<<1024, 1024, 0, _stream>>>( - input_ptr, output_ptr, _nchans, nsamps, _tscrunch, _scale, _offset); + input_ptr, output_ptr, _nchans, nsamps, _tscrunch, _scale, _offset, stride, stoff); } diff --git a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh index 1d51268e55b1a3ea2ca9f5d4fbe01a28df4a7b1b..6e9cf4dd559b39320ad7349f3abd2c3ddde76c90 100644 --- a/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh +++ b/psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh @@ -89,8 +89,7 @@ public: private: void process(thrust::device_vector<RawVoltageType> const &digitiser_raw, thrust::device_vector<int64_t> const &sideChannelData, - thrust::device_vector<IntegratedPowerType> &detected_G0, - thrust::device_vector<IntegratedPowerType> &detected_G1, + thrust::device_vector<IntegratedPowerType> &detected, thrust::device_vector<size_t> &noOfBitSet); private: @@ -116,8 +115,7 @@ private: std::unique_ptr<DetectorAccumulator<IntegratedPowerType> > _detector; DoubleDeviceBuffer<RawVoltageType> _raw_voltage_db; - DoubleDeviceBuffer<IntegratedPowerType> _power_db_G0; - DoubleDeviceBuffer<IntegratedPowerType> _power_db_G1; + DoubleDeviceBuffer<IntegratedPowerType> _power_db; DoubleDeviceBuffer<int64_t> _sideChannelData_db; DoubleDeviceBuffer<size_t> _noOfBitSetsInSideChannel; size_t _noOfBitSetsInSideChannel_host [2]; diff --git a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu index e9f3c82ec181232cf75566e6eabbd09b96337736..56bb43b813e75eb0b2a624227b49052bdfefdf97 100644 --- a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu +++ b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu @@ -180,12 +180,10 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer( _channelised_voltage.resize(_nchans * batch); BOOST_LOG_TRIVIAL(debug) << " Channelised voltages size: " << _channelised_voltage.size(); - _power_db_G0.resize(_nchans * batch / _naccumulate); - _power_db_G1.resize(_nchans * batch / _naccumulate); - BOOST_LOG_TRIVIAL(debug) << " Powers size: " << _power_db_G0.size() << ", " - << _power_db_G1.size(); + _power_db.resize(_nchans * batch / _naccumulate * 2); // hold on and off spectra to simplify output + BOOST_LOG_TRIVIAL(debug) << " Powers size: " << _power_db.size() / 2; // on the host both power are stored in the same data buffer - _host_power_db.resize( _power_db_G0.size() + _power_db_G1 .size()); + _host_power_db.resize( _power_db.size()); _noOfBitSetsInSideChannel.resize(1); CUDA_ERROR_CHECK(cudaStreamCreate(&_h2d_stream)); @@ -244,8 +242,7 @@ template <class HandlerType, typename IntegratedPowerType> void GatedSpectrometer<HandlerType, IntegratedPowerType>::process( thrust::device_vector<RawVoltageType> const &digitiser_raw, thrust::device_vector<int64_t> const &sideChannelData, - thrust::device_vector<IntegratedPowerType> &detected_G0, - thrust::device_vector<IntegratedPowerType> &detected_G1, thrust::device_vector<size_t> &noOfBitSet) { + thrust::device_vector<IntegratedPowerType> &detected, thrust::device_vector<size_t> &noOfBitSet) { BOOST_LOG_TRIVIAL(debug) << "Unpacking raw voltages"; switch (_nbits) { case 8: @@ -282,7 +279,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process( thrust::raw_pointer_cast(_channelised_voltage.data()); CUFFT_ERROR_CHECK(cufftExecR2C(_fft_plan, (cufftReal *)_unpacked_voltage_ptr, (cufftComplex *)_channelised_voltage_ptr)); - _detector->detect(_channelised_voltage, detected_G0); + _detector->detect(_channelised_voltage, detected, 2, 0); BOOST_LOG_TRIVIAL(debug) << "Performing FFT 2"; _unpacked_voltage_ptr = thrust::raw_pointer_cast(_unpacked_voltage_G1.data()); @@ -290,7 +287,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process( (cufftComplex *)_channelised_voltage_ptr)); CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream)); - _detector->detect(_channelised_voltage, detected_G1); + _detector->detect(_channelised_voltage, detected, 2, 1); CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream)); BOOST_LOG_TRIVIAL(debug) << "Exit processing"; } // process @@ -331,12 +328,10 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b } // Synchronize all streams - _power_db_G0.swap(); - _power_db_G1.swap(); + _power_db.swap(); _noOfBitSetsInSideChannel.swap(); - process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db_G0.a(), - _power_db_G1.a(), _noOfBitSetsInSideChannel.a()); + process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db.a(), _noOfBitSetsInSideChannel.a()); // signal that data block has been processed CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream)); @@ -350,15 +345,15 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b std::swap(_noOfBitSetsInSideChannel_host[0], _noOfBitSetsInSideChannel_host[1]); CUDA_ERROR_CHECK( cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr()), - static_cast<void *>(_power_db_G0.b_ptr()), - _power_db_G0.size() * sizeof(IntegratedPowerType), + static_cast<void *>(_power_db.b_ptr()), + _power_db.size() * sizeof(IntegratedPowerType), cudaMemcpyDeviceToHost, _d2h_stream)); - CUDA_ERROR_CHECK(cudaMemcpyAsync( - static_cast<void *>(_host_power_db.a_ptr() + - (_power_db_G0.size())), // as I am adding BEFORE the cast to void, I dont need the sizeof - static_cast<void *>(_power_db_G1.b_ptr()), - _power_db_G1.size() * sizeof(IntegratedPowerType), cudaMemcpyDeviceToHost, - _d2h_stream)); +// CUDA_ERROR_CHECK(cudaMemcpyAsync( +// static_cast<void *>(_host_power_db.a_ptr() + +// (_power_db_G0.size())), // as I am adding BEFORE the cast to void, I dont need the sizeof +// static_cast<void *>(_power_db_G1.b_ptr()), +// _power_db_G1.size() * sizeof(IntegratedPowerType), cudaMemcpyDeviceToHost, +// _d2h_stream)); CUDA_ERROR_CHECK(cudaMemcpyAsync(static_cast<void *>(&_noOfBitSetsInSideChannel_host[0]), static_cast<void *>(_noOfBitSetsInSideChannel.b_ptr()),