Commit e3edc797 authored by Tobias Winchen's avatar Tobias Winchen
Browse files

Alternate on / off spectra in output

parent c722b311
......@@ -14,7 +14,7 @@ namespace kernels {
template <typename T>
__global__
void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset)
int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
{
// grid stride loop over output array to keep
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
......@@ -30,7 +30,8 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
double y = tmp.y * tmp.y;
sum += x + y;
}
out[i] = (int8_t) ((sum - offset)/scale);
size_t toff = out_offset * nchans + currentOutputSpectra * nchans;
out[toff + i] = (int8_t) ((sum - offset)/scale);
}
}
......@@ -39,7 +40,7 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
template <typename T>
__global__
void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset)
int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
{
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
{
......@@ -54,7 +55,8 @@ void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ ou
double y = tmp.y * tmp.y;
sum += x + y;
}
out[i] = sum;
size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
out[i + toff] = sum;
}
}
......@@ -90,15 +92,17 @@ public:
}
void detect(InputType const& input, OutputType& output)
// stride sets an offset of _nChans * stride to the detection in the output
// to allow multiple spectra in one output
void detect(InputType const& input, OutputType& output, int stride = 0, int stoff = 0)
{
assert(input.size() % (_nchans * _tscrunch) == 0 /* Input is not a multiple of _nchans * _tscrunch*/);
output.resize(input.size()/_tscrunch);
//output.resize(input.size()/_tscrunch);
int nsamps = input.size() / _nchans;
float2 const* input_ptr = thrust::raw_pointer_cast(input.data());
T * output_ptr = thrust::raw_pointer_cast(output.data());
kernels::detect_and_accumulate<T> <<<1024, 1024, 0, _stream>>>(
input_ptr, output_ptr, _nchans, nsamps, _tscrunch, _scale, _offset);
input_ptr, output_ptr, _nchans, nsamps, _tscrunch, _scale, _offset, stride, stoff);
}
......
......@@ -89,8 +89,7 @@ public:
private:
void process(thrust::device_vector<RawVoltageType> const &digitiser_raw,
thrust::device_vector<int64_t> const &sideChannelData,
thrust::device_vector<IntegratedPowerType> &detected_G0,
thrust::device_vector<IntegratedPowerType> &detected_G1,
thrust::device_vector<IntegratedPowerType> &detected,
thrust::device_vector<size_t> &noOfBitSet);
private:
......@@ -116,8 +115,7 @@ private:
std::unique_ptr<DetectorAccumulator<IntegratedPowerType> > _detector;
DoubleDeviceBuffer<RawVoltageType> _raw_voltage_db;
DoubleDeviceBuffer<IntegratedPowerType> _power_db_G0;
DoubleDeviceBuffer<IntegratedPowerType> _power_db_G1;
DoubleDeviceBuffer<IntegratedPowerType> _power_db;
DoubleDeviceBuffer<int64_t> _sideChannelData_db;
DoubleDeviceBuffer<size_t> _noOfBitSetsInSideChannel;
size_t _noOfBitSetsInSideChannel_host [2];
......
......@@ -180,12 +180,10 @@ GatedSpectrometer<HandlerType, IntegratedPowerType>::GatedSpectrometer(
_channelised_voltage.resize(_nchans * batch);
BOOST_LOG_TRIVIAL(debug) << " Channelised voltages size: "
<< _channelised_voltage.size();
_power_db_G0.resize(_nchans * batch / _naccumulate);
_power_db_G1.resize(_nchans * batch / _naccumulate);
BOOST_LOG_TRIVIAL(debug) << " Powers size: " << _power_db_G0.size() << ", "
<< _power_db_G1.size();
_power_db.resize(_nchans * batch / _naccumulate * 2); // hold on and off spectra to simplify output
BOOST_LOG_TRIVIAL(debug) << " Powers size: " << _power_db.size() / 2;
// on the host both power are stored in the same data buffer
_host_power_db.resize( _power_db_G0.size() + _power_db_G1 .size());
_host_power_db.resize( _power_db.size());
_noOfBitSetsInSideChannel.resize(1);
CUDA_ERROR_CHECK(cudaStreamCreate(&_h2d_stream));
......@@ -244,8 +242,7 @@ template <class HandlerType, typename IntegratedPowerType>
void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
thrust::device_vector<RawVoltageType> const &digitiser_raw,
thrust::device_vector<int64_t> const &sideChannelData,
thrust::device_vector<IntegratedPowerType> &detected_G0,
thrust::device_vector<IntegratedPowerType> &detected_G1, thrust::device_vector<size_t> &noOfBitSet) {
thrust::device_vector<IntegratedPowerType> &detected, thrust::device_vector<size_t> &noOfBitSet) {
BOOST_LOG_TRIVIAL(debug) << "Unpacking raw voltages";
switch (_nbits) {
case 8:
......@@ -282,7 +279,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
thrust::raw_pointer_cast(_channelised_voltage.data());
CUFFT_ERROR_CHECK(cufftExecR2C(_fft_plan, (cufftReal *)_unpacked_voltage_ptr,
(cufftComplex *)_channelised_voltage_ptr));
_detector->detect(_channelised_voltage, detected_G0);
_detector->detect(_channelised_voltage, detected, 2, 0);
BOOST_LOG_TRIVIAL(debug) << "Performing FFT 2";
_unpacked_voltage_ptr = thrust::raw_pointer_cast(_unpacked_voltage_G1.data());
......@@ -290,7 +287,7 @@ void GatedSpectrometer<HandlerType, IntegratedPowerType>::process(
(cufftComplex *)_channelised_voltage_ptr));
CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
_detector->detect(_channelised_voltage, detected_G1);
_detector->detect(_channelised_voltage, detected, 2, 1);
CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
BOOST_LOG_TRIVIAL(debug) << "Exit processing";
} // process
......@@ -331,12 +328,10 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
}
// Synchronize all streams
_power_db_G0.swap();
_power_db_G1.swap();
_power_db.swap();
_noOfBitSetsInSideChannel.swap();
process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db_G0.a(),
_power_db_G1.a(), _noOfBitSetsInSideChannel.a());
process(_raw_voltage_db.b(), _sideChannelData_db.b(), _power_db.a(), _noOfBitSetsInSideChannel.a());
// signal that data block has been processed
CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
......@@ -350,15 +345,15 @@ bool GatedSpectrometer<HandlerType, IntegratedPowerType>::operator()(RawBytes &b
std::swap(_noOfBitSetsInSideChannel_host[0], _noOfBitSetsInSideChannel_host[1]);
CUDA_ERROR_CHECK(
cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr()),
static_cast<void *>(_power_db_G0.b_ptr()),
_power_db_G0.size() * sizeof(IntegratedPowerType),
static_cast<void *>(_power_db.b_ptr()),
_power_db.size() * sizeof(IntegratedPowerType),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(cudaMemcpyAsync(
static_cast<void *>(_host_power_db.a_ptr() +
(_power_db_G0.size())), // as I am adding BEFORE the cast to void, I dont need the sizeof
static_cast<void *>(_power_db_G1.b_ptr()),
_power_db_G1.size() * sizeof(IntegratedPowerType), cudaMemcpyDeviceToHost,
_d2h_stream));
// CUDA_ERROR_CHECK(cudaMemcpyAsync(
// static_cast<void *>(_host_power_db.a_ptr() +
// (_power_db_G0.size())), // as I am adding BEFORE the cast to void, I dont need the sizeof
// static_cast<void *>(_power_db_G1.b_ptr()),
// _power_db_G1.size() * sizeof(IntegratedPowerType), cudaMemcpyDeviceToHost,
// _d2h_stream));
CUDA_ERROR_CHECK(cudaMemcpyAsync(static_cast<void *>(&_noOfBitSetsInSideChannel_host[0]),
static_cast<void *>(_noOfBitSetsInSideChannel.b_ptr()),
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment