diff --git a/psrdada_cpp/effelsberg/CMakeLists.txt b/psrdada_cpp/effelsberg/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c34cd3ca4cb1e2183cd85a6f87c3effddf3a686 --- /dev/null +++ b/psrdada_cpp/effelsberg/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(edd) \ No newline at end of file diff --git a/psrdada_cpp/effelsberg/edd/CMakeLists.txt b/psrdada_cpp/effelsberg/edd/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..398e13aede5f3a6d0eb4b7fa4df0ed3b503507be --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/CMakeLists.txt @@ -0,0 +1,20 @@ +if(ENABLE_CUDA) + +set(PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES + ${CMAKE_PROJECT_NAME} + ${CMAKE_PROJECT_NAME}_effelsberg_edd + ${DEPENDENCY_LIBRARIES}) + +set(psrdada_cpp_effelsberg_edd_src + src/edd_simple_fft_spectrometer.cu + ) + +cuda_add_library(${CMAKE_PROJECT_NAME}_effelsberg_edd ${psrdada_cpp_effelsberg_edd_src}) + +#simple FFT spectrometer test +cuda_add_executable(edd_specpol src/edd_simple_fft_spectrometer_test.cu) +target_link_libraries(edd_specpol ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES}) +install(TARGETS edd_specpol DESTINATION bin) + + +endif(ENABLE_CUDA) diff --git a/psrdada_cpp/effelsberg/edd/detail/edd_simple_fft_spectrometer.cu b/psrdada_cpp/effelsberg/edd/detail/edd_simple_fft_spectrometer.cu new file mode 100644 index 0000000000000000000000000000000000000000..2206c1ec150642244ba0eaa3fca643f3f5b2b350 --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/detail/edd_simple_fft_spectrometer.cu @@ -0,0 +1,72 @@ +#ifndef PSRDADA_CPP_EFFELSBERG_EDD_SIMPLE_FFT_SPECTROMETER_HPP +#define PSRDADA_CPP_EFFELSBERG_EDD_SIMPLE_FFT_SPECTROMETER_HPP + +#include "psrdada_cpp/effelsberg/edd/" +#include "psrdada_cpp/raw_bytes.hpp" +#include "thrust/device_vector.h" +#include "thrust/host_vector.h" +#include "cufft.h" + +namespace psrdada_cpp { +namespace effelsberg { +namespace edd { +namespace kernels { + + __global__ + void unpack_edd_12bit_to_float32(char* __restrict__ in, float* __restrict__ out, int n); + +} //kernels + +template <class HandlerType> +class SimpleFFTSpectrometer +{ +public: + SimpleFFTSpectrometer( + std::size_t fft_length, + std::size_t naccumulate, + std::size_t nbits, + HandlerType& handler); + ~SimpleFFTSpectrometer(); + + /** + * @brief A callback to be called on connection + * to a ring buffer. + * + * @detail The first available header block in the + * in the ring buffer is provided as an argument. + * It is here that header parameters could be read + * if desired. + * + * @param block A RawBytes object wrapping a DADA header buffer + */ + void init(RawBytes& block); + + /** + * @brief A callback to be called on acqusition of a new + * data block. + * + * @param block A RawBytes object wrapping a DADA data buffer + */ + bool operator()(RawBytes& block); + +private: + std::size_t _fft_length; + std::size_t _naccumulate; + std::size_t _nbits; + HandlerType& _handler; + bool _first_block; + std::size_t _nsamps; + cufftHandle _fft_plan; + thrust::device_vector<uint64_t> _edd_raw; + thrust::device_vector<float> _edd_unpacked; + thrust::device_vector<cufftComplex> _channelised; + thrust::device_vector<float> _detected; +}; + + +} //edd +} //effelsberg +} //psrdada_cpp + +#include "psrdada_cpp/effelsberg/edd/detail/edd_simple_fft_spectrometer.cu" +#endif //PSRDADA_CPP_EFFELSBERG_EDD_SIMPLE_FFT_SPECTROMETER_HPP \ No newline at end of file diff --git a/psrdada_cpp/effelsberg/edd/edd_simple_fft_spectrometer.cuh b/psrdada_cpp/effelsberg/edd/edd_simple_fft_spectrometer.cuh new file mode 100644 index 0000000000000000000000000000000000000000..37e163c2205c9b8b9550d2a780c3d6139a6e9cc1 --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/edd_simple_fft_spectrometer.cuh @@ -0,0 +1,104 @@ +#ifndef PSRDADA_CPP_EFFELSBERG_EDD_SIMPLE_FFT_SPECTROMETER_HPP +#define PSRDADA_CPP_EFFELSBERG_EDD_SIMPLE_FFT_SPECTROMETER_HPP + +#include "psrdada_cpp/effelsberg/edd/edd_simple_fft_spectrometer.cuh" + +namespace psrdada_cpp { +namespace effelsberg { +namespace edd { + +template <class HandlerType> +SimpleFFTSpectrometer<HandlerType>::SimpleFFTSpectrometer( + std::size_t fft_length, + std::size_t naccumulate, + std::size_t nbits, + HandlerType& handler) + : _fft_length(fft_length) + , _naccumulate(naccumulate) + , _nbits(nbits) + , _handler(handler) + , _first_block(true) + , _nsamps(0) + , _fft_plan(NULL) +{ + +} + +template <class HandlerType> +SimpleFFTSpectrometer<HandlerType>::~SimpleFFTSpectrometer() +{ + if (!_first_block) + cufftDestroy(_fft_plan); +} + +template <class HandlerType> +void SimpleFFTSpectrometer<HandlerType>::init(RawBytes& block) +{ +} + +template <class HandlerType> +bool SimpleFFTSpectrometer<HandlerType>::operator()(RawBytes& block) +{ + + std::size_t nsamps_in_block = 8 * block.used_bytes() / _nbits; + std::size_t nchans = _fft_length / 2 + 1; + + if (_first_block) + { + _nsamps = nsamps_in_block; + std::size_t n64bit_words = 3 * _nsamps / 16; + if (_nsamps % _fft_length != 0) + { + throw std::runtime_error("Number of samples is not multiple of FFT size"); + } + std::size_t batch = _nsamps/_fft_length; + + // Only do these things once + CUFFT_ERROR_CHECK(cufftPlanMany(&_fft_plan, 1, _fft_length, 0, 1, + _fft_length, 1, 1, _fft_length, CUFFT_R2C, batch)); + + _edd_raw.resize(n64bit_words); + _edd_unpacked.resize(_nsamps); + _channelised.resize(nchans * batch); + _detected.resize(nchans * batch / _naccumulate); + _first_block = false; + } + + if (_nsamps != nsamps_in_block) + { + throw std::runtime_error("Received incomplete block"); + } + + uint64_t* _edd_raw_ptr = thrust::raw_pointer_cast(_edd_raw.data()); + float* _edd_unpacked_ptr = thrust::raw_pointer_cast(_edd_unpacked.data()); + + //Copy DADA buffer to GPU + thrust::copy(block.ptr(), block.ptr()+block.used_bytes(), (char*) _edd_raw_ptr); + + if (_nbits == 12) + { + unpack_edd_12bit_to_float32<<< 64, 1024>>>(_edd_raw_ptr, _edd_unpacked_ptr, _edd_raw.size()); + CUDA_ERROR_CHECK(cudaDeviceSynchronize()); + } + else if (_nbits == 8) + { + throw std::runtime_error("Only 12-bit mode supported"); + } + else + { + throw std::runtime_error("Only 12-bit mode supported"); + } + + cufftComplex* _channelised_ptr = thrust::raw_pointer_cast(_channelised.data()); + CUFFT_ERROR_CHECK(cufftExecuteR2C(_fft_plan, (cufftReal*)_edd_unpacked_ptr, _channelised_ptr, CUFFT_FORWARD)); + + //thrust::copy(_edd_unpacked.begin(), _edd_unpacked.end(), block.ptr()); + //_handler(block); +} + +} //edd +} //effelsberg +} //psrdada_cpp + +#include "psrdada_cpp/effelsberg/edd/detail/edd_simple_fft_spectrometer.cu" +#endif //PSRDADA_CPP_EFFELSBERG_EDD_SIMPLE_FFT_SPECTROMETER_HPP \ No newline at end of file diff --git a/psrdada_cpp/effelsberg/edd/src/edd_simple_fft_spectrometer.cu b/psrdada_cpp/effelsberg/edd/src/edd_simple_fft_spectrometer.cu new file mode 100644 index 0000000000000000000000000000000000000000..eca878fc32012c7154c2bded9771bdeb988f323c --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/src/edd_simple_fft_spectrometer.cu @@ -0,0 +1,147 @@ +#include "psrdada_cpp/common.hpp" +#include "psrdada_cpp/cuda_utils.hpp" + +#define BSWAP64(x) ((0xFF00000000000000 & x) >> 56) | \ + ((0x00FF000000000000 & x) >> 40) | \ + ((0x0000FF0000000000 & x) >> 24) | \ + ((0x000000FF00000000 & x) >> 8) | \ + ((0x00000000FF000000 & x) << 8) | \ + ((0x0000000000FF0000 & x) << 24) | \ + ((0x000000000000FF00 & x) << 40) | \ + ((0x00000000000000FF & x) << 56) + + +namespace psrdada_cpp { +namespace effelsberg { +namespace edd { +namespace kernels { + + __device__ __forceinline__ uint64_t swap64(uint64_t x) + { + uint64_t result; + uint2 t; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(t.x), "=r"(t.y) : "l"(x)); + t.x = __byte_perm(t.x, 0, 0x0123); + t.y = __byte_perm(t.y, 0, 0x0123); + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(t.y), "r"(t.x)); + return result; + } + + + __global__ + void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ out, int n) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x ; (3*idx+2) < n ; idx+=gridDim.x*blockDim.x) + { + uint64_t val; + uint64_t rest; + int read_idx = 3*idx; + int write_idx = 16*idx; + float* sout = out+write_idx; + val = swap64(in[read_idx]); + sout[0] = (float)((int64_t)(( 0xFFF0000000000000 & val) << 0) >> 52); + sout[1] = (float)((int64_t)(( 0x000FFF0000000000 & val) << 12) >> 52); + sout[2] = (float)((int64_t)(( 0x000000FFF0000000 & val) << 24) >> 52); + sout[3] = (float)((int64_t)(( 0x000000000FFF0000 & val) << 36) >> 52); + sout[4] = (float)((int64_t)(( 0x000000000000FFF0 & val) << 48) >> 52); + rest = ( 0x000000000000000F & val) << 60; + val = swap64(in[read_idx+1]); + sout[5] = (float)((int64_t)((( 0xFF00000000000000 & val) >> 4) | rest) >> 52); + sout[6] = (float)((int64_t)(( 0x00FFF00000000000 & val) << 8) >> 52); + sout[7] = (float)((int64_t)(( 0x00000FFF00000000 & val) << 20) >> 52); + sout[8] = (float)((int64_t)(( 0x00000000FFF00000 & val) << 32) >> 52); + sout[9] = (float)((int64_t)(( 0x00000000000FFF00 & val) << 44) >> 52); + rest = ( 0x00000000000000FF & val) << 56; + val = swap64(in[read_idx+2]); + sout[10] = (float)((int64_t)((( 0xF000000000000000 & val) >> 8) | rest) >> 52); + sout[11] = (float)((int64_t)(( 0x0FFF000000000000 & val) << 4) >> 52); + sout[12] = (float)((int64_t)(( 0x0000FFF000000000 & val) << 16) >> 52); + sout[13] = (float)((int64_t)(( 0x0000000FFF000000 & val) << 28) >> 52); + sout[14] = (float)((int64_t)(( 0x0000000000FFF000 & val) << 40) >> 52); + } + } + +} //kernels + +namespace test +{ + void unpack_edd_12bit_to_float32_cpu(thrust::host_vector<uint64_t>const& input, thrust::host_vector<float>& output) + { + uint64_t val; + uint64_t rest; + output.reserve(16 * input.size() / 3); + for (int ii=0; ii<input.size(); ii+=3) + { + int idx = ii; + val = be64toh(input[idx]); + output.push_back( (float)((int64_t)(( 0xFFF0000000000000 & val) << 0) >> 52)); + output.push_back( (float)((int64_t)(( 0x000FFF0000000000 & val) << 12) >> 52)); + output.push_back( (float)((int64_t)(( 0x000000FFF0000000 & val) << 24) >> 52)); + output.push_back( (float)((int64_t)(( 0x000000000FFF0000 & val) << 36) >> 52)); + output.push_back( (float)((int64_t)(( 0x000000000000FFF0 & val) << 48) >> 52)); + rest = ( 0x000000000000000F & val) << 60; + + val = be64toh(input[++idx]); + output.push_back( (float)((int64_t)((( 0xFF00000000000000 & val) >> 4) | rest) >> 52)); + output.push_back( (float)((int64_t)(( 0x00FFF00000000000 & val) << 8) >> 52)); + output.push_back( (float)((int64_t)(( 0x00000FFF00000000 & val) << 20) >> 52)); + output.push_back( (float)((int64_t)(( 0x00000000FFF00000 & val) << 32) >> 52)); + output.push_back( (float)((int64_t)(( 0x00000000000FFF00 & val) << 44) >> 52)); + rest = ( 0x00000000000000FF & val) << 56; + + val = be64toh(input[++idx]); + output.push_back( (float)((int64_t)((( 0xF000000000000000 & val) >> 8) | rest) >> 52)); + output.push_back( (float)((int64_t)(( 0x0FFF000000000000 & val) << 4) >> 52)); + output.push_back( (float)((int64_t)(( 0x0000FFF000000000 & val) << 16) >> 52)); + output.push_back( (float)((int64_t)(( 0x0000000FFF000000 & val) << 28) >> 52)); + output.push_back( (float)((int64_t)(( 0x0000000000FFF000 & val) << 40) >> 52)); + output.push_back( (float)((int64_t)(( 0x0000000000000FFF & val) << 52) >> 52)); + } + } + + /* + void unpack_edd_12bit_to_float32_test() + { + + int nvalues = 1600; + int nlongs = 3 * nvalues / 16; + int nthreads = nlongs/3; + int nblocks = 1; + int smem_size = nthreads * 16 * sizeof(float); + + thrust::host_vector<uint64_t> input; + + for (int val=0; val<nlongs; ++val) + { + input.push_back(val); + } + + // Run CUDA version + thrust::device_vector<uint64_t> input_d = input; + thrust::device_vector<float> output_d(nvalues); + uint64_t* input_ptr = thrust::raw_pointer_cast(input_d.data()); + float* output_ptr = thrust::raw_pointer_cast(output_d.data()) + unpack_edd_12bit_to_float32<<<nblocks, nthreads, smem_size, 0>>>(input_ptr, output_ptr, nlongs); + CUDA_ERROR_CHECK(cudaDeviceSynchronize()); + thrust::host_vector<float> output = output_d; + + // Run host version + thrust::host_vector<float> test_output; + unpack_edd_12bit_to_float32_cpu(input, test_output); + + for (int ii=0; ii<nvalues; ++ii) + { + if (output[ii] != test_output[ii]) + { + printf("Error at index %d -> %f != %f \n", ii, output[ii], test_output[ii]); + } + } + } + */ +} //test + +} //edd +} //effelsberg +} //psrdada_cpp \ No newline at end of file diff --git a/psrdada_cpp/effelsberg/edd/src/edd_simple_fft_spectrometer_test.cu b/psrdada_cpp/effelsberg/edd/src/edd_simple_fft_spectrometer_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..5382faeb0d6c4d881037cf4dd42a6b340a30b899 --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/src/edd_simple_fft_spectrometer_test.cu @@ -0,0 +1,19 @@ +#include "psrdada_cpp/common.h" +#include "psrdada_cpp/raw_bytes.h" +#include "psrdada_cpp/effelsberg/ebb/edd_simple_fft_spectrometer.cuh" + +#include "thrust/host_vector.h" + +int main() +{ + std::size_t size = 4096 * 12 * 4096 / 8 + thrust::host_vector<char> data; + data.resize(size); + RawBytes dada_input(data.data(), data.size()); + SimpleFFTSpectrometer spectrometer(8192, 1, 12, NULL); + for (int ii=0; ii<100; ++ii) + { + spectrometer(dada_input); + } + return 0; +} \ No newline at end of file