Commit e3edc797 authored by Tobias Winchen's avatar Tobias Winchen
Browse files

Alternate on / off spectra in output

parent c722b311
...@@ -14,7 +14,7 @@ namespace kernels { ...@@ -14,7 +14,7 @@ namespace kernels {
template <typename T> template <typename T>
__global__ __global__
void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out, void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset) int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
{ {
// grid stride loop over output array to keep // grid stride loop over output array to keep
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x) for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
...@@ -30,7 +30,8 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o ...@@ -30,7 +30,8 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
double y = tmp.y * tmp.y; double y = tmp.y * tmp.y;
sum += x + y; sum += x + y;
} }
out[i] = (int8_t) ((sum - offset)/scale); size_t toff = out_offset * nchans + currentOutputSpectra * nchans;
out[toff + i] = (int8_t) ((sum - offset)/scale);
} }
} }
...@@ -39,7 +40,7 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o ...@@ -39,7 +40,7 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
template <typename T> template <typename T>
__global__ __global__
void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out, void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset) int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
{ {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x) for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
{ {
...@@ -54,7 +55,8 @@ void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ ou ...@@ -54,7 +55,8 @@ void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ ou
double y = tmp.y * tmp.y; double y = tmp.y * tmp.y;
sum += x + y; sum += x + y;
} }
out[i] = sum; size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
out[i + toff] = sum;
} }
} }
...@@ -90,15 +92,17 @@ public: ...@@ -90,15 +92,17 @@ public:
} }
void detect(InputType const& input, OutputType& output) // stride sets an offset of _nChans * stride to the detection in the output
// to allow multiple spectra in one output
void detect(InputType const& input, OutputType& output, int stride = 0, int stoff = 0)
{ {
assert(input.size() % (_nchans * _tscrunch) == 0 /* Input is not a multiple of _nchans * _tscrunch*/); assert(input.size() % (_nchans * _tscrunch) == 0 /* Input is not a multiple of _nchans * _tscrunch*/);
output.resize(input.size()/_tscrunch); //output.resize(input.size()/_tscrunch);
int nsamps = input.size() / _nchans; int nsamps = input.size() / _nchans;
float2 const* input_ptr = thrust::raw_pointer_cast(input.data()); float2 const* input_ptr = thrust::raw_pointer_cast(input.data());
T * output_ptr = thrust::raw_pointer_cast(output.data()); T * output_ptr = thrust::raw_pointer_cast(output.data());
kernels::detect_and_accumulate<T> <<<1024, 1024, 0, _stream>>>( kernels::detect_and_accumulate<T> <<<1024, 1024, 0, _stream>>>(
input_ptr, output_ptr, _nchans, nsamps, _tscrunch, _scale, _offset); input_ptr, output_ptr, _nchans, nsamps, _tscrunch, _scale, _offset, stride, stoff);
} }
......
...@@ -89,8 +89,7 @@ public: ...@@ -89,8 +89,7 @@ public:
private: private:
void process(thrust::device_vector<RawVoltageType> const &digitiser_raw, void process(thrust::device_vector<RawVoltageType> const &digitiser_raw,
thrust::device_vector<int64_t> const &sideChannelData, thrust::device_vector<int64_t> const &sideChannelData,
thrust::device_vector<IntegratedPowerType> &detected_G0, thrust::device_vector<IntegratedPowerType> &detected,
thrust::device_vector<IntegratedPowerType> &detected_G1,
thrust::device_vector<size_t> &noOfBitSet); thrust::device_vector<size_t> &noOfBitSet);
private: private:
...@@ -116,8 +115,7 @@ private: ...@@ -116,8 +115,7 @@ private:
std::unique_ptr<DetectorAccumulator<IntegratedPowerType> > _detector; std::unique_ptr<DetectorAccumulator<IntegratedPowerType> > _detector;
DoubleDeviceBuffer<RawVoltageType> _raw_voltage_db; DoubleDeviceBuffer<RawVoltageType> _raw_voltage_db;
DoubleDeviceBuffer<IntegratedPowerType> _power_db_G0; DoubleDeviceBuffer<IntegratedPowerType> _power_db;
DoubleDeviceBuffer<IntegratedPowerType> _power_db_G1;
DoubleDeviceBuffer<int64_t> _sideChannelData_db; DoubleDeviceBuffer<int64_t> _sideChannelData_db;
DoubleDeviceBuffer<size_t> _noOfBitSetsInSideChannel; DoubleDeviceBuffer<size_t> _noOfBitSetsInSideChannel;
size_t _noOfBitSetsInSideChannel_host [2]; size_t _noOfBitSetsInSideChannel_host [2];
......
...@@ -180,12 +180,10 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer( ...@@ -180,12 +180,10 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer(
_channelised_voltage.resize(_nchans * batch); _channelised_voltage.resize(_nchans * batch);
BOOST_LOG_TRIVIAL(debug) << " Channelised voltages size: " BOOST_LOG_TRIVIAL(debug) << " Channelised voltages size: "
<< _channelised_voltage.size(); << _channelised_voltage.size();
_power_db_G0.resize(_nchans * batch / _naccumulate); _power_db.resize(_nchans * batch / _naccumulate * 2); // hold on and off spectra to simplify output
_power_db_G1.resize(_nchans * batch / _naccumulate); BOOST_LOG_TRIVIAL(debug) << " Powers size: " << _power_db.size() / 2;
BOOST_LOG_TRIVIAL(debug) << " Powers size: " << _power_db_G0.size() << ", "
<< _power_db_G1.size();
// on the host both power are stored in the same data buffer // on the host both power are stored in the same data buffer
_host_power_db.resize( _power_db_G0.size() + _power_db_G1 .size()); _host_power_db.resize( _power_db.size());
_noOfBitSetsInSideChannel.resize(1); _noOfBitSetsInSideChannel.resize(1);
CUDA_ERROR_CHECK(cudaStreamCreate(&_h2d_stream)); CUDA_ERROR_CHECK(cudaStreamCreate(&_h2d_stream));
...@@ -244,8 +242,7 @@ template <class HandlerType, typename IntegratedPowerType> ...@@ -244,8 +242,7 @@ template <class HandlerType, typename IntegratedPowerType>
void GatedSpectrometer<HandlerType, IntegratedPowerType>::process( void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
thrust::device_vector<RawVoltageType> const &digitiser_raw, thrust::device_vector<RawVoltageType> const &digitiser_raw,
thrust::device_vector<int64_t> const &sideChannelData, thrust::device_vector<int64_t> const &sideChannelData,
thrust::device_vector<IntegratedPowerType> &detected_G0, thrust::device_vector<IntegratedPowerType> &detected, thrust::device_vector<size_t> &noOfBitSet) {
thrust::device_vector<IntegratedPowerType> &detected_G1, thrust::device_vector<size_t> &noOfBitSet) {
BOOST_LOG_TRIVIAL(debug) << "Unpacking raw voltages"; BOOST_LOG_TRIVIAL(debug) << "Unpacking raw voltages";
switch (_nbits) { switch (_nbits) {
case 8: case 8:
...@@ -282,7 +279,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process( ...@@ -282,7 +279,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
thrust::raw_pointer_cast(_channelised_voltage.data()); thrust::raw_pointer_cast(_channelised_voltage.data());
CUFFT_ERROR_CHECK(cufftExecR2C(_fft_plan, (cufftReal *)_unpacked_voltage_ptr, CUFFT_ERROR_CHECK(cufftExecR2C(_fft_plan, (cufftReal *)_unpacked_voltage_ptr,
(cufftComplex *)_channelised_voltage_ptr)); (cufftComplex *)_channelised_voltage_ptr));
_detector->detect(_channelised_voltage, detected_G0); _detector->detect(_channelised_voltage, detected, 2, 0);
BOOST_LOG_TRIVIAL(debug) << "Performing FFT 2"; BOOST_LOG_TRIVIAL(debug) << "Performing FFT 2";
_unpacked_voltage_ptr = thrust::raw_pointer_cast(_unpacked_voltage_G1.data()); _unpacked_voltage_ptr = thrust::raw_pointer_cast(_unpacked_voltage_G1.data());
...@@ -290,7 +287,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process( ...@@ -290,7 +287,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
(cufftComplex *)_channelised_voltage_ptr)); (cufftComplex *)_channelised_voltage_ptr));
CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream)); CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
_detector->detect(_channelised_voltage, detected_G1); _detector->detect(_channelised_voltage, detected, 2, 1);
CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream)); CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
BOOST_LOG_TRIVIAL(debug) << "Exit processing"; BOOST_LOG_TRIVIAL(debug) << "Exit processing";
} // process } // process
...@@ -331,12 +328,10 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b ...@@ -331,12 +328,10 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
} }
// Synchronize all streams // Synchronize all streams
_power_db_G0.swap(); _power_db.swap();
_power_db_G1.swap();
_noOfBitSetsInSideChannel.swap(); _noOfBitSetsInSideChannel.swap();
process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db_G0.a(), process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db.a(), _noOfBitSetsInSideChannel.a());
_power_db_G1.a(), _noOfBitSetsInSideChannel.a());
// signal that data block has been processed // signal that data block has been processed
CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream)); CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
...@@ -350,15 +345,15 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b ...@@ -350,15 +345,15 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
std::swap(_noOfBitSetsInSideChannel_host[0], _noOfBitSetsInSideChannel_host[1]); std::swap(_noOfBitSetsInSideChannel_host[0], _noOfBitSetsInSideChannel_host[1]);
CUDA_ERROR_CHECK( CUDA_ERROR_CHECK(
cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr()), cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr()),
static_cast<void *>(_power_db_G0.b_ptr()), static_cast<void *>(_power_db.b_ptr()),
_power_db_G0.size() * sizeof(IntegratedPowerType), _power_db.size() * sizeof(IntegratedPowerType),
cudaMemcpyDeviceToHost, _d2h_stream)); cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(cudaMemcpyAsync( // CUDA_ERROR_CHECK(cudaMemcpyAsync(
static_cast<void *>(_host_power_db.a_ptr() + // static_cast<void *>(_host_power_db.a_ptr() +
(_power_db_G0.size())), // as I am adding BEFORE the cast to void, I dont need the sizeof // (_power_db_G0.size())), // as I am adding BEFORE the cast to void, I dont need the sizeof
static_cast<void *>(_power_db_G1.b_ptr()), // static_cast<void *>(_power_db_G1.b_ptr()),
_power_db_G1.size() * sizeof(IntegratedPowerType), cudaMemcpyDeviceToHost, // _power_db_G1.size() * sizeof(IntegratedPowerType), cudaMemcpyDeviceToHost,
_d2h_stream)); // _d2h_stream));
CUDA_ERROR_CHECK(cudaMemcpyAsync(static_cast<void *>(&_noOfBitSetsInSideChannel_host[0]), CUDA_ERROR_CHECK(cudaMemcpyAsync(static_cast<void *>(&_noOfBitSetsInSideChannel_host[0]),
static_cast<void *>(_noOfBitSetsInSideChannel.b_ptr()), static_cast<void *>(_noOfBitSetsInSideChannel.b_ptr()),
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment