Commit 75dfadc4 authored by Tobias Winchen's avatar Tobias Winchen
Browse files

Allow 32 bit output

parent e7f380fc
......@@ -9,23 +9,99 @@ namespace effelsberg {
namespace edd {
namespace kernels {
template<typename T>
__global__
void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset);
int nchans, int nsamps, int naccumulate, float scale, float offset)
{
for (int block_idx = blockIdx.x; block_idx < nsamps/naccumulate; block_idx += gridDim.x)
{
int read_offset = block_idx * naccumulate * nchans;
int write_offset = block_idx * nchans;
for (int chan_idx = threadIdx.x; chan_idx < nchans; chan_idx += blockDim.x)
{
float sum = 0.0f;
for (int ii=0; ii < naccumulate; ++ii)
{
float2 tmp = in[read_offset + chan_idx + ii*nchans];
float x = tmp.x * tmp.x;
float y = tmp.y * tmp.y;
sum += x + y;
}
out[write_offset + chan_idx] = (int8_t) ((sum - offset)/scale);
}
}
}
template<typename T>
__global__
void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset)
{
for (int block_idx = blockIdx.x; block_idx < nsamps/naccumulate; block_idx += gridDim.x)
{
int read_offset = block_idx * naccumulate * nchans;
int write_offset = block_idx * nchans;
for (int chan_idx = threadIdx.x; chan_idx < nchans; chan_idx += blockDim.x)
{
double sum = 0.0;
for (int ii=0; ii < naccumulate; ++ii)
{
float2 tmp = in[read_offset + chan_idx + ii*nchans];
double x = tmp.x * tmp.x;
double y = tmp.y * tmp.y;
sum += x + y;
}
out[write_offset + chan_idx] = (float) sum;
}
}
}
}
template <typename T>
class DetectorAccumulator
{
public:
typedef thrust::device_vector<float2> InputType;
typedef thrust::device_vector<int8_t> OutputType;
typedef thrust::device_vector<T> OutputType;
public:
DetectorAccumulator(int nchans, int tscrunch, float scale, float offset, cudaStream_t stream);
~DetectorAccumulator();
DetectorAccumulator(DetectorAccumulator const&) = delete;
void detect(InputType const& input, OutputType& output);
DetectorAccumulator(
int nchans, int tscrunch, float scale,
float offset, cudaStream_t stream)
: _nchans(nchans)
, _tscrunch(tscrunch)
, _scale(scale)
, _offset(offset)
, _stream(stream)
{
}
~DetectorAccumulator()
{
}
void detect(InputType const& input, OutputType& output)
{
assert(input.size() % (_nchans * _tscrunch) == 0 /* Input is not a multiple of _nchans * _tscrunch*/);
output.resize(input.size()/_tscrunch);
int nsamps = input.size() / _nchans;
float2 const* input_ptr = thrust::raw_pointer_cast(input.data());
T * output_ptr = thrust::raw_pointer_cast(output.data());
kernels::detect_and_accumulate<T> <<<1024, 1024, 0, _stream>>>(
input_ptr, output_ptr, _nchans, nsamps, _tscrunch, _scale, _offset);
}
private:
int _nchans;
......
......@@ -68,7 +68,7 @@ private:
int _nchans;
int _call_count;
std::unique_ptr<Unpacker> _unpacker;
std::unique_ptr<DetectorAccumulator> _detector;
std::unique_ptr<DetectorAccumulator<int8_t> > _detector;
DoubleDeviceBuffer<RawVoltageType> _raw_voltage_db;
DoubleDeviceBuffer<IntegratedPowerType> _power_db;
thrust::device_vector<UnpackedVoltageType> _unpacked_voltage;
......
......@@ -35,7 +35,8 @@ public:
typedef uint64_t RawVoltageType;
typedef float UnpackedVoltageType;
typedef float2 ChannelisedVoltageType;
typedef int8_t IntegratedPowerType;
typedef float IntegratedPowerType;
//typedef int8_t IntegratedPowerType;
public:
/**
......@@ -112,7 +113,7 @@ private:
int _nchans;
int _call_count;
std::unique_ptr<Unpacker> _unpacker;
std::unique_ptr<DetectorAccumulator> _detector;
std::unique_ptr<DetectorAccumulator<IntegratedPowerType> > _detector;
DoubleDeviceBuffer<RawVoltageType> _raw_voltage_db;
DoubleDeviceBuffer<IntegratedPowerType> _power_db_G0;
......
......@@ -63,7 +63,7 @@ FftSpectrometer<HandlerType>::FftSpectrometer(
CUDA_ERROR_CHECK(cudaStreamCreate(&_d2h_stream));
CUFFT_ERROR_CHECK(cufftSetStream(_fft_plan, _proc_stream));
_unpacker.reset(new Unpacker(_proc_stream));
_detector.reset(new DetectorAccumulator(_nchans, _naccumulate,
_detector.reset(new DetectorAccumulator<int8_t>(_nchans, _naccumulate,
scaling, offset, _proc_stream));
}
......
......@@ -11,6 +11,11 @@ namespace psrdada_cpp {
namespace effelsberg {
namespace edd {
__global__ void gating(float* __restrict__ G0, float* __restrict__ G1, const int64_t* __restrict__ sideChannelData,
size_t N, size_t heapSize, size_t bitpos,
size_t noOfSideChannels, size_t selectedSideChannel, const float* __restrict__ _baseLineN) {
......@@ -186,7 +191,7 @@ GatedSpectrometer<HandlerType>::GatedSpectrometer(
CUFFT_ERROR_CHECK(cufftSetStream(_fft_plan, _proc_stream));
_unpacker.reset(new Unpacker(_proc_stream));
_detector.reset(new DetectorAccumulator(_nchans, _naccumulate, scaling,
_detector.reset(new DetectorAccumulator<IntegratedPowerType>(_nchans, _naccumulate, scaling,
offset, _proc_stream));
} // constructor
......@@ -260,6 +265,7 @@ void GatedSpectrometer<HandlerType>::process(
CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
_detector->detect(_channelised_voltage, detected_G1);
CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
BOOST_LOG_TRIVIAL(debug) << "Exit processing";
} // process
......@@ -313,25 +319,31 @@ bool GatedSpectrometer<HandlerType>::operator()(RawBytes &block) {
return false;
}
BOOST_LOG_TRIVIAL(debug) << "Copy Data back to device";
CUDA_ERROR_CHECK(cudaStreamSynchronize(_d2h_stream));
BOOST_LOG_TRIVIAL(debug) << "Swap host power";
_host_power_db.swap();
BOOST_LOG_TRIVIAL(debug) << "swap no of bit channel";
std::swap(_noOfBitSetsInSideChannel_host[0], _noOfBitSetsInSideChannel_host[1]);
BOOST_LOG_TRIVIAL(debug) << "Foo";
CUDA_ERROR_CHECK(
cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr()),
static_cast<void *>(_power_db_G0.b_ptr()),
_power_db_G0.size() * sizeof(IntegratedPowerType),
cudaMemcpyDeviceToHost, _d2h_stream));
BOOST_LOG_TRIVIAL(debug) << "Bar";
CUDA_ERROR_CHECK(cudaMemcpyAsync(
static_cast<void *>(_host_power_db.a_ptr() +
(_power_db_G0.size() * sizeof(IntegratedPowerType))),
(_power_db_G0.size())), // as I am adding BEFORE the cast to void, I dont need the sizeof
static_cast<void *>(_power_db_G1.b_ptr()),
_power_db_G1.size() * sizeof(IntegratedPowerType), cudaMemcpyDeviceToHost,
_d2h_stream));
CUDA_ERROR_CHECK(cudaMemcpyAsync(static_cast<void *>(&_noOfBitSetsInSideChannel_host[0]),
static_cast<void *>(_noOfBitSetsInSideChannel.b_ptr()),
1 * sizeof(unsigned int),cudaMemcpyDeviceToHost, _d2h_stream));
//CUDA_ERROR_CHECK(cudaMemcpyAsync(static_cast<void *>(&_noOfBitSetsInSideChannel_host[0]),
// static_cast<void *>(_noOfBitSetsInSideChannel.b_ptr()),
// 1 * sizeof(size_t),cudaMemcpyDeviceToHost, _d2h_stream));
BOOST_LOG_TRIVIAL(debug) << "Copy Data back to device";
if (_call_count == 3) {
return false;
......
......@@ -6,59 +6,9 @@ namespace effelsberg {
namespace edd {
namespace kernels {
__global__
void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset)
{
for (int block_idx = blockIdx.x; block_idx < nsamps/naccumulate; block_idx += gridDim.x)
{
int read_offset = block_idx * naccumulate * nchans;
int write_offset = block_idx * nchans;
for (int chan_idx = threadIdx.x; chan_idx < nchans; chan_idx += blockDim.x)
{
float sum = 0.0f;
for (int ii=0; ii < naccumulate; ++ii)
{
float2 tmp = in[read_offset + chan_idx + ii*nchans];
float x = tmp.x * tmp.x;
float y = tmp.y * tmp.y;
sum += x + y;
}
out[write_offset + chan_idx] = (int8_t) ((sum - offset)/scale);
}
}
}
} //namespace kernels
DetectorAccumulator::DetectorAccumulator(
int nchans, int tscrunch, float scale,
float offset, cudaStream_t stream)
: _nchans(nchans)
, _tscrunch(tscrunch)
, _scale(scale)
, _offset(offset)
, _stream(stream)
{
}
DetectorAccumulator::~DetectorAccumulator()
{
}
void DetectorAccumulator::detect(InputType const& input, OutputType& output)
{
assert(input.size() % (_nchans * _tscrunch) == 0 /* Input is not a multiple of _nchans * _tscrunch*/);
output.resize(input.size()/_tscrunch);
int nsamps = input.size() / _nchans;
float2 const* input_ptr = thrust::raw_pointer_cast(input.data());
int8_t* output_ptr = thrust::raw_pointer_cast(output.data());
kernels::detect_and_accumulate<<<1024, 1024, 0, _stream>>>(
input_ptr, output_ptr, _nchans, nsamps, _tscrunch, _scale, _offset);
}
} //namespace edd
} //namespace effelsberg
} //namespace psrdada_cpp
......@@ -35,7 +35,7 @@ protected:
float offset);
void compare_against_host(
DetectorAccumulator::OutputType const& gpu_output,
DetectorAccumulator<int8_t>::OutputType const& gpu_output,
OutputType const& host_output);
protected:
......
......@@ -62,7 +62,7 @@ void DetectorAccumulatorTester::detect_c_reference(
}
void DetectorAccumulatorTester::compare_against_host(
DetectorAccumulator::OutputType const& gpu_output,
DetectorAccumulator<int8_t>::OutputType const& gpu_output,
OutputType const& host_output)
{
OutputType copy_from_gpu = gpu_output;
......@@ -88,10 +88,10 @@ TEST_F(DetectorAccumulatorTester, noise_test)
host_input[ii].x = distribution(generator);
host_input[ii].y = distribution(generator);
}
DetectorAccumulator::InputType gpu_input = host_input;
DetectorAccumulator::OutputType gpu_output;
DetectorAccumulator<int8_t>::InputType gpu_input = host_input;
DetectorAccumulator<int8_t>::OutputType gpu_output;
OutputType host_output;
DetectorAccumulator detector(nchans, tscrunch, scale, 0.0, _stream);
DetectorAccumulator<int8_t> detector(nchans, tscrunch, scale, 0.0, _stream);
detector.detect(gpu_input, gpu_output);
detect_c_reference(host_input, host_output, nchans, tscrunch, scale, 0.0);
compare_against_host(gpu_output, host_output);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment