diff --git a/psrdada_cpp/effelsberg/edd/CMakeLists.txt b/psrdada_cpp/effelsberg/edd/CMakeLists.txt index bd1b566e70cbf52306ad9a88ffd8327b57ba5986..4f144b6d6313c1321d5bcfc237aee1ec89f040c4 100644 --- a/psrdada_cpp/effelsberg/edd/CMakeLists.txt +++ b/psrdada_cpp/effelsberg/edd/CMakeLists.txt @@ -6,19 +6,21 @@ set(PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES ${DEPENDENCY_LIBRARIES}) set(psrdada_cpp_effelsberg_edd_src - src/eddfft.cu + #src/eddfft.cu + src/Unpacker.cu ) cuda_add_library(${CMAKE_PROJECT_NAME}_effelsberg_edd ${psrdada_cpp_effelsberg_edd_src}) #simple FFT spectrometer test -cuda_add_executable(eddfft_test src/eddfft_test.cu) -target_link_libraries(eddfft_test ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} cufft) -install(TARGETS eddfft_test DESTINATION bin) +#cuda_add_executable(eddfft_test src/eddfft_test.cu) +#target_link_libraries(eddfft_test ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} cufft) +#install(TARGETS eddfft_test DESTINATION bin) #simple FFT spectrometer interface -cuda_add_executable(eddfft src/eddfft_cli.cu) -target_link_libraries(eddfft ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} cufft) -install(TARGETS eddfft DESTINATION bin) +#cuda_add_executable(eddfft src/eddfft_cli.cu) +#target_link_libraries(eddfft ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} cufft) +#install(TARGETS eddfft DESTINATION bin) +add_subdirectory(test) endif(ENABLE_CUDA) diff --git a/psrdada_cpp/effelsberg/edd/Unpacker.cuh b/psrdada_cpp/effelsberg/edd/Unpacker.cuh new file mode 100644 index 0000000000000000000000000000000000000000..198e8be4a31e064e6a8ce3e19dcb0c01a35e422d --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/Unpacker.cuh @@ -0,0 +1,46 @@ +#ifndef PSRDADA_CPP_EFFELSBERG_EDD_UNPACKER_CUH +#define PSRDADA_CPP_EFFELSBERG_EDD_UNPACKER_CUH + +#include "psrdada_cpp/common.hpp" +#include <thrust/device_vector.h> + +namespace psrdada_cpp { +namespace effelsberg { +namespace edd { +namespace kernels { + +__global__ +void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ out, int n); + +__global__ +void unpack_edd_8bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ out, int n); + +} + +class Unpacker +{ +public: + typedef thrust::device_vector<uint64_t> InputType; + typedef thrust::device_vector<float> OutputType; + +public: + + Unpacker(cudaStream_t stream); + ~Unpacker(); + Unpacker(Unpacker const&) = delete; + + template <int Nbits> + void unpack(InputType const& input, OutputType& output); + +private: + cudaStream_t _copy_stream; +}; + +} //namespace edd +} //namespace effelsberg +} //namespace psrdada_cpp + +#endif // PSRDADA_CPP_EFFELSBERG_EDD_UNPACKER_CUH + + + diff --git a/psrdada_cpp/effelsberg/edd/src/Unpacker.cu b/psrdada_cpp/effelsberg/edd/src/Unpacker.cu new file mode 100644 index 0000000000000000000000000000000000000000..c309aafc7e00a94a0a88969d8f8395af4b3c4e5f --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/src/Unpacker.cu @@ -0,0 +1,155 @@ +#include "psrdada_cpp/effelsberg/edd/Unpacker.cuh" +#include "psrdada_cpp/cuda_utils.hpp" + +#define EDD_NTHREADS_UNPACK 512 + +namespace psrdada_cpp { +namespace effelsberg { +namespace edd { +namespace kernels { + +__device__ __forceinline__ uint64_t swap64(uint64_t x) +{ + uint64_t result; + uint2 t; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(t.x), "=r"(t.y) : "l"(x)); + t.x = __byte_perm(t.x, 0, 0x0123); + t.y = __byte_perm(t.y, 0, 0x0123); + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(t.y), "r"(t.x)); + return result; +} + +__global__ +void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ out, int n) +{ + /** + * Note: This kernels will not work with more than 512 threads. + */ + __shared__ volatile float tmp_out[NTHREADS_UNPACK * 16]; + __shared__ volatile uint64_t tmp_in[NTHREADS_UNPACK * 3]; + int block_idx = blockIdx.x; + uint64_t val; + uint64_t rest; + volatile float* sout = tmp_out + (16 * threadIdx.x); + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; + (3 * idx + 2) < n; + idx+=gridDim.x*blockDim.x) + { + //Read to shared memeory + int block_read_start = block_idx * NTHREADS_UNPACK * 3; + tmp_in[threadIdx.x] = in[block_read_start + threadIdx.x]; + tmp_in[NTHREADS_UNPACK + threadIdx.x] = in[block_read_start + NTHREADS_UNPACK + threadIdx.x]; + tmp_in[NTHREADS_UNPACK * 2 + threadIdx.x] = in[block_read_start + NTHREADS_UNPACK * 2 + threadIdx.x]; + __syncthreads(); + val = swap64(tmp_in[3*threadIdx.x]); + sout[0] = (float)((int64_t)(( 0xFFF0000000000000 & val) << 0) >> 52); + sout[1] = (float)((int64_t)(( 0x000FFF0000000000 & val) << 12) >> 52); + sout[2] = (float)((int64_t)(( 0x000000FFF0000000 & val) << 24) >> 52); + sout[3] = (float)((int64_t)(( 0x000000000FFF0000 & val) << 36) >> 52); + sout[4] = (float)((int64_t)(( 0x000000000000FFF0 & val) << 48) >> 52); + rest = ( 0x000000000000000F & val) << 60; + val = swap64(tmp_in[3*threadIdx.x + 1]); + sout[5] = (float)((int64_t)((( 0xFF00000000000000 & val) >> 4) | rest) >> 52); + sout[6] = (float)((int64_t)(( 0x00FFF00000000000 & val) << 8) >> 52); + sout[7] = (float)((int64_t)(( 0x00000FFF00000000 & val) << 20) >> 52); + sout[8] = (float)((int64_t)(( 0x00000000FFF00000 & val) << 32) >> 52); + sout[9] = (float)((int64_t)(( 0x00000000000FFF00 & val) << 44) >> 52); + rest = ( 0x00000000000000FF & val) << 56; + val = swap64(tmp_in[3*threadIdx.x + 2]); + sout[10] = (float)((int64_t)((( 0xF000000000000000 & val) >> 8) | rest) >> 52); + sout[11] = (float)((int64_t)(( 0x0FFF000000000000 & val) << 4) >> 52); + sout[12] = (float)((int64_t)(( 0x0000FFF000000000 & val) << 16) >> 52); + sout[13] = (float)((int64_t)(( 0x0000000FFF000000 & val) << 28) >> 52); + sout[14] = (float)((int64_t)(( 0x0000000000FFF000 & val) << 40) >> 52); + __syncthreads(); + int block_write_start = block_idx * NTHREADS_UNPACK * 16; + for (int ii = threadIdx.x; ii < 16 * NTHREADS_UNPACK; ii += blockDim.x) + { + out[block_write_start + ii] = tmp_out[ii]; + } + block_idx += gridDim.x; + } +} + +__global__ +void unpack_edd_8bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ out, int n); +{ + /** + * Note: This kernels will not work with more than 512 threads. + */ + __shared__ volatile float tmp_out[NTHREADS_UNPACK * 8]; + int block_idx = blockIdx.x; + uint64_t val; + volatile float* sout = tmp_out + (8 * threadIdx.x); + + for (int idx = blockIdx.x * blockDim.x + threadIdx.x ; idx < n ; idx+=gridDim.x*blockDim.x) + { + int block_read_start = block_idx * NTHREADS_UNPACK; + val = swap64(in[block_read_start + threadIdx.x]); + sout[0] = (float)((int64_t)(( 0xFF00000000000000 & val) << 0) >> 56); + sout[1] = (float)((int64_t)(( 0x00FF000000000000 & val) << 8) >> 56); + sout[2] = (float)((int64_t)(( 0x0000FF0000000000 & val) << 16) >> 56); + sout[3] = (float)((int64_t)(( 0x000000FF00000000 & val) << 24) >> 56); + sout[4] = (float)((int64_t)(( 0x00000000FF000000 & val) << 32) >> 56); + sout[5] = (float)((int64_t)(( 0x0000000000FF0000 & val) << 40) >> 56); + sout[6] = (float)((int64_t)(( 0x000000000000FF00 & val) << 48) >> 56); + sout[7] = (float)((int64_t)(( 0x00000000000000FF & val) << 56) >> 56); + __syncthreads(); + int block_write_start = block_idx * NTHREADS_UNPACK * 8; + for (int ii = threadIdx.x; ii < 8 * NTHREADS_UNPACK; ii+=blockDim.x) + { + out[block_write_start+ii] = tmp_out[ii]; + } + block_idx += gridDim.x; + } +} + +} //namespace kernels + + +Unpacker::Unpacker(cudaStream_t stream) + : _stream(stream) +{ + +} + +Unpacker::~Unpacker() +{ + +} + +template <> +void Unpacker::unpack<12>(InputType const& input, OutputType& output) +{ + BOOST_LOG_TRIVIAL(debug) << "Unpacking 12-bit data"; + std::size_t output_size = input.size() * 16 / 3; + BOOST_LOG_TRIVIAL(debug) << "Resizing output buffer to " << output_size << " elements"; + output.resize(output_size); + int nblocks = input.size() / EDD_NTHREADS_UNPACK; + InputType::value_type const* input_ptr = thrust::raw_pointer_cast(input.data()) + OutputType::value_type* output_ptr = thrust::raw_pointer_cast(output.data()) + kernels::unpack_edd_12bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK, 0, _stream>>>( + input_ptr, output_ptr, input.size()); + CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream)); +} + +template <> +void Unpacker::unpack<8>(InputType const& input, OutputType& output) +{ + BOOST_LOG_TRIVIAL(debug) << "Unpacking 12-bit data"; + std::size_t output_size = input.size() * 8; + BOOST_LOG_TRIVIAL(debug) << "Resizing output buffer to " << output_size << " elements"; + output.resize(output_size); + int nblocks = input.size() / EDD_NTHREADS_UNPACK; + InputType::value_type const* input_ptr = thrust::raw_pointer_cast(input.data()) + OutputType::value_type* output_ptr = thrust::raw_pointer_cast(output.data()) + kernels::unpack_edd_8bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK, 0, _stream>>>( + input_ptr, output_ptr, input.size()); + CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream)); +} + +} //namespace edd +} //namespace effelsberg +} //namespace psrdada_cpp \ No newline at end of file diff --git a/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt b/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..69f911afd6bf24a3e5ac79929f82664a36cdb53e --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt @@ -0,0 +1,12 @@ +include_directories(${GTEST_INCLUDE_DIR}) + +link_directories(${GTEST_LIBRARY_DIR}) + +set( + gtest_edd_src + src/UnpackerTester.cu + +) +cuda_add_executable(gtest_edd ${gtest_edd_src} ) +target_link_libraries(gtest_edd ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES}) +add_test(gtest_edd gtest_edd --test_data "${CMAKE_CURRENT_LIST_DIR}/data") diff --git a/psrdada_cpp/effelsberg/edd/test/UnpackerTester.cuh b/psrdada_cpp/effelsberg/edd/test/UnpackerTester.cuh new file mode 100644 index 0000000000000000000000000000000000000000..986efa9c0c1620a85277736de2c53ed6acf5cd50 --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/test/UnpackerTester.cuh @@ -0,0 +1,45 @@ +#ifndef PSRDADA_CPP_EFFELSBERG_EDD_UNPACKERTESTER_CUH +#define PSRDADA_CPP_EFFELSBERG_EDD_UNPACKERTESTER_CUH + +namespace psrdada_cpp { +namespace effelsberg { +namespace edd { +namespace test { + +class UnpackerTester: public ::testing::Test +{ +public: + typedef std::vector<uint64_t> InputType; + typedef std::vector<float> OutputType; + +protected: + void SetUp() override; + void TearDown() override; + +public: + UnpackerTester(); + ~UnpackerTester(); + +protected: + void unpacker_12_to_32_c_reference( + InputType const& input, + OutputType& output); + + void unpacker_8_to_32_c_reference( + InputType const& input, + OutputType& output); + + void compare_against_host( + Unpacker::OutputType const& gpu_output, + OutputType const& host_output); + +protected: + cudaStream_t _stream; +}; + +} //namespace test +} //namespace edd +} //namespace meerkat +} //namespace psrdada_cpp + +#endif //PSRDADA_CPP_EFFELSBERG_EDD_UNPACKERTESTER_CUH diff --git a/psrdada_cpp/effelsberg/edd/test/src/UnpackerTester.cu b/psrdada_cpp/effelsberg/edd/test/src/UnpackerTester.cu new file mode 100644 index 0000000000000000000000000000000000000000..437ce0d830a8f19f4fa0e90ab7caf5abf81a7e4e --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/test/src/UnpackerTester.cu @@ -0,0 +1,74 @@ +#include "psrdada_cpp/effelsberg/edd/test/UnpackerTester.cuh" + +#define BSWAP64(x) ((0xFF00000000000000 & x) >> 56) | \ + ((0x00FF000000000000 & x) >> 40) | \ + ((0x0000FF0000000000 & x) >> 24) | \ + ((0x000000FF00000000 & x) >> 8) | \ + ((0x00000000FF000000 & x) << 8) | \ + ((0x0000000000FF0000 & x) << 24) | \ + ((0x000000000000FF00 & x) << 40) | \ + ((0x00000000000000FF & x) << 56) + +namespace psrdada_cpp { +namespace effelsberg { +namespace edd { +namespace test { + +UnpackerTester::UnpackerTester() + : ::testing::Test() + , _stream(0) +{ + +} + +UnpackerTester::~UnpackerTester() +{ + +} + +void UnpackerTester::SetUp() +{ + CUDA_ERROR_CHECK(cudaStreamCreate(&_stream)); +} + +void UnpackerTester::TearDown() +{ + CUDA_ERROR_CHECK(cudaStreamDestroy(_stream)); +} + +void UnpackerTester::unpacker_12_to_32_c_reference( + InputType const& input, + OutputType& output) +{ + +} + +void UnpackerTester::unpacker_8_to_32_c_reference( + InputType const& input, + OutputType& output) +{ + +} + +void UnpackerTester::compare_against_host( + Unpacker::OutputType const& gpu_output, + OutputType const& host_output) +{ + +} + +TEST_F(UnpackerTester, 12_bit_unpack_test) +{ + +} + +TEST_F(UnpackerTester, 8_bit_unpack_test) +{ + +} + + +} //namespace test +} //namespace edd +} //namespace meerkat +} //namespace psrdada_cpp \ No newline at end of file