Commit 4b71d0d2 authored by Ewan Barr's avatar Ewan Barr
Browse files

cleaned up double buffers with custom class

parent ce3445f8
#include "psrdada_cpp/double_buffer.hpp"
#include <utility>
namespace psrdada_cpp {
template <typename T>
DoubleBuffer<T>::DoubleBuffer()
{
_a_ptr = &_buf0;
_b_ptr = &_buf1;
}
template <typename T>
DoubleBuffer<T>::~DoubleBuffer()
{
}
template <typename T>
void DoubleBuffer<T>::resize(std::size_t size)
{
_buf0.resize(size);
_buf1.resize(size);
}
template <typename T>
void DoubleBuffer<T>::swap(std::size_t)
{
std::swap(_a_ptr, _b_ptr);
}
template <typename T>
T* DoubleBuffer<T>::a(std::size_t) const
{
return _a_ptr;
}
template <typename T>
T* DoubleBuffer<T>::b(std::size_t) const
{
return _b_ptr;
}
} //namespace psrdada_cpp
\ No newline at end of file
#ifndef PSRDADA_CPP_DOUBLE_BUFFER_HPP
#define PSRDADA_CPP_DOUBLE_BUFFER_HPP
namespace psrdada_cpp {
template <typename T>
class DoubleBuffer
{
public:
DoubleBuffer();
~DoubleBuffer();
void resize(std::size_t size);
void swap();
T* a() const;
T* b() const;
private:
T _buf0;
T _buf1;
T* _a_ptr;
T* _b_ptr;
};
} //namespace psrdada_cpp
#endif //PSRDADA_CPP_DOUBLE_BUFFER_HPP
\ No newline at end of file
......@@ -55,24 +55,11 @@ SimpleFFTSpectrometer<HandlerType>::SimpleFFTSpectrometer(
cufftSetStream(_fft_plan, _proc_stream);
BOOST_LOG_TRIVIAL(debug) << "Allocating memory";
_edd_raw_a.resize(n64bit_words);
_edd_raw_b.resize(n64bit_words);
_edd_raw_current = &_edd_raw_a;
_edd_raw_previous = &_edd_raw_b;
_edd_raw.resize(n64bit_words);
_edd_unpacked.resize(_nsamps);
_channelised.resize(_nchans * batch);
_detected_a.resize(_nchans * batch / _naccumulate);
_detected_b.resize(_nchans * batch / _naccumulate);
_detected_current = &_detected_a;
_detected_previous = &_detected_b;
_detected_host_a.resize(_nchans * batch / _naccumulate);
_detected_host_b.resize(_nchans * batch / _naccumulate);
_detected_host_current = &_detected_host_a;
_detected_host_previous = &_detected_host_b;
_detected.resize(_nchans * batch / _naccumulate);
_detected_host.resize(_nchans * batch / _naccumulate);
}
template <class HandlerType>
......@@ -130,31 +117,31 @@ bool SimpleFFTSpectrometer<HandlerType>::operator()(RawBytes& block)
// Synchronize all streams
CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
std::swap(_detected_current, _detected_previous);
_detected.swap();
CUDA_ERROR_CHECK(cudaStreamSynchronize(_d2h_stream));
std::swap(_detected_host_current, _detected_host_previous);
_detected_host.swap();
// Start host to device copy
cudaMemcpyAsync((char*) thrust::raw_pointer_cast(_edd_raw_current->data()),
cudaMemcpyAsync((char*) thrust::raw_pointer_cast(_edd_raw.a()->data()),
block.ptr(), block.used_bytes(), cudaMemcpyHostToDevice, _h2d_stream);
// Guaranteed that the previous copy is completed here
process(_edd_raw_previous, _detected_current);
process(_edd_raw.b(), _detected.a());
cudaMemcpyAsync((char*) thrust::raw_pointer_cast(_detected_host_current->data()),
(char*) thrust::raw_pointer_cast(_detected_previous->data()),
_detected_previous->size() * sizeof(char),
cudaMemcpyAsync((char*) thrust::raw_pointer_cast(_detected_host.a()->data()),
(char*) thrust::raw_pointer_cast(_detected.b()->data()),
_detected.b()->size() * sizeof(char),
cudaMemcpyDeviceToHost, _d2h_stream);
//Wrap _detected_host_previous in a RawBytes object here;
RawBytes bytes((char*) thrust::raw_pointer_cast(_detected_host_previous->data()),
_detected_host_previous->size() * sizeof(char),
_detected_host_previous->size() * sizeof(char));
RawBytes bytes((char*) thrust::raw_pointer_cast(_detected_host.b()->data()),
_detected_host.b()->size() * sizeof(char),
_detected_host.b()->size() * sizeof(char));
BOOST_LOG_TRIVIAL(debug) << "Calling handler";
CUDA_ERROR_CHECK(cudaStreamSynchronize(_h2d_stream));
std::swap(_edd_raw_current, _edd_raw_previous);
_edd_raw.swap();
// Due to the double buffering the data the output data is only
// valid by the third pass through. Until that time the code
......
......@@ -2,6 +2,7 @@
#define PSRDADA_CPP_EFFELSBERG_EDD_EDDFFT_HPP
#include "psrdada_cpp/raw_bytes.hpp"
#include "psrdada_cpp/double_buffer.hpp"
#include "thrust/device_vector.h"
#include "thrust/host_vector.h"
#include "thrust/system/cuda/experimental/pinned_allocator.h"
......@@ -14,12 +15,12 @@ namespace effelsberg {
namespace edd {
namespace kernels {
__global__
void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ out, int n);
__global__
void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ out, int n);
__global__
void detect_and_accumulate(cufftComplex* __restrict__ in, char* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset);
__global__
void detect_and_accumulate(cufftComplex* __restrict__ in, char* __restrict__ out,
int nchans, int nsamps, int naccumulate, float scale, float offset);
} //kernels
......@@ -73,22 +74,9 @@ private:
thrust::device_vector<float> _edd_unpacked;
thrust::device_vector<cufftComplex> _channelised;
thrust::device_vector<uint64_t> _edd_raw_a;
thrust::device_vector<uint64_t> _edd_raw_b;
thrust::device_vector<uint64_t>* _edd_raw_current;
thrust::device_vector<uint64_t>* _edd_raw_previous;
thrust::device_vector<char> _detected_a;
thrust::device_vector<char> _detected_b;
thrust::device_vector<char>* _detected_current;
thrust::device_vector<char>* _detected_previous;
thrust::host_vector<char, thrust::system::cuda::experimental::pinned_allocator<char> > _detected_host_a;
thrust::host_vector<char, thrust::system::cuda::experimental::pinned_allocator<char> > _detected_host_b;
thrust::host_vector<char, thrust::system::cuda::experimental::pinned_allocator<char> >* _detected_host_current;
thrust::host_vector<char, thrust::system::cuda::experimental::pinned_allocator<char> >* _detected_host_previous;
DoubleBuffer<thrust::device_vector<uint64_t>> _edd_raw;
DoubleBuffer<thrust::device_vector<char>> _detected;
DoubleBuffer<thrust::host_vector<char, thrust::system::cuda::experimental::pinned_allocator<char>>> _detected_host;
cudaStream_t _h2d_stream;
cudaStream_t _proc_stream;
cudaStream_t _d2h_stream;
......
......@@ -17,18 +17,18 @@ namespace effelsberg {
namespace edd {
namespace kernels {
__device__ __forceinline__ uint64_t swap64(uint64_t x)
{
uint64_t result;
uint2 t;
asm("mov.b64 {%0,%1},%2; \n\t"
: "=r"(t.x), "=r"(t.y) : "l"(x));
t.x = __byte_perm(t.x, 0, 0x0123);
t.y = __byte_perm(t.y, 0, 0x0123);
asm("mov.b64 %0,{%1,%2}; \n\t"
: "=l"(result) : "r"(t.y), "r"(t.x));
return result;
}
__device__ __forceinline__ uint64_t swap64(uint64_t x)
{
uint64_t result;
uint2 t;
asm("mov.b64 {%0,%1},%2; \n\t"
: "=r"(t.x), "=r"(t.y) : "l"(x));
t.x = __byte_perm(t.x, 0, 0x0123);
t.y = __byte_perm(t.y, 0, 0x0123);
asm("mov.b64 %0,{%1,%2}; \n\t"
: "=l"(result) : "r"(t.y), "r"(t.x));
return result;
}
__global__
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment