From 656eacb68d0212c4c8ba966e2dcd91b4ac99f9c4 Mon Sep 17 00:00:00 2001 From: Tobias Winchen <tobias.winchen@rwth-aachen.de> Date: Mon, 18 Mar 2019 16:16:39 +0000 Subject: [PATCH] Added prototype for 2bit pack --- cmake/cuda.cmake | 2 +- psrdada_cpp/effelsberg/edd/CMakeLists.txt | 5 ++ psrdada_cpp/effelsberg/edd/VLBI.cuh | 29 +++++++ psrdada_cpp/effelsberg/edd/src/VLBI.cu | 75 +++++++++++++++++ psrdada_cpp/effelsberg/edd/src/VLBI_prof.cu | 42 ++++++++++ .../effelsberg/edd/test/CMakeLists.txt | 1 + .../effelsberg/edd/test/src/VLBITest.cu | 80 +++++++++++++++++++ 7 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 psrdada_cpp/effelsberg/edd/VLBI.cuh create mode 100644 psrdada_cpp/effelsberg/edd/src/VLBI.cu create mode 100644 psrdada_cpp/effelsberg/edd/src/VLBI_prof.cu create mode 100644 psrdada_cpp/effelsberg/edd/test/src/VLBITest.cu diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 6aa6c6ac..dc7d8921 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -14,7 +14,7 @@ if(ENABLE_CUDA) set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Pass options to NVCC ( -ccbin /path --compiler-options -lfftw3f --compiler-options -lm --verbose) - list(APPEND CUDA_NVCC_FLAGS -DENABLE_CUDA --std c++11 -Wno-deprecated-gpu-targets) + list(APPEND CUDA_NVCC_FLAGS -DENABLE_CUDA --std c++11 -Wno-deprecated-gpu-targets --ptxas-options=-v) list(APPEND CUDA_NVCC_FLAGS_DEBUG --debug; --device-debug; --generate-line-info -Xcompiler "-Werror") #list(APPEND CUDA_NVCC_FLAGS -arch compute_35) # minumum compute level (Sps restriction) string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) diff --git a/psrdada_cpp/effelsberg/edd/CMakeLists.txt b/psrdada_cpp/effelsberg/edd/CMakeLists.txt index 210d1d67..6c3d2525 100644 --- a/psrdada_cpp/effelsberg/edd/CMakeLists.txt +++ b/psrdada_cpp/effelsberg/edd/CMakeLists.txt @@ -9,6 +9,7 @@ set(psrdada_cpp_effelsberg_edd_src src/Unpacker.cu src/DetectorAccumulator.cu src/ScaledTransposeTFtoTFT.cu + src/VLBI.cu ) cuda_add_library(${CMAKE_PROJECT_NAME}_effelsberg_edd ${psrdada_cpp_effelsberg_edd_src}) @@ -18,5 +19,9 @@ cuda_add_executable(fft_spectrometer src/fft_spectrometer_cli.cu) target_link_libraries(fft_spectrometer ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}) install(TARGETS fft_spectrometer DESTINATION bin) +cuda_add_executable(VLBI_prof src/VLBI_prof.cu) +target_link_libraries(VLBI_prof ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}) +install(TARGETS VLBI_prof DESTINATION bin) + add_subdirectory(test) endif(ENABLE_CUDA) diff --git a/psrdada_cpp/effelsberg/edd/VLBI.cuh b/psrdada_cpp/effelsberg/edd/VLBI.cuh new file mode 100644 index 00000000..7b499338 --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/VLBI.cuh @@ -0,0 +1,29 @@ +#ifndef PSRDADA_CPP_EFFELSBERG_EDD_VLBI_CUH +#define PSRDADA_CPP_EFFELSBERG_EDD_VLBI_CUH + +#include "psrdada_cpp/common.hpp" +#include <thrust/device_vector.h> + + +namespace psrdada_cpp { +namespace effelsberg { +namespace edd { +namespace kernels { + + + +__global__ +void pack_edd_float32_to_2bit(float* __restrict__ in, uint32_t * __restrict__ out, size_t n); + +} //namespace kernels + +void pack_2bit(thrust::device_vector<float> const& input, thrust::device_vector<uint32_t>& output, float minV, float maxV, cudaStream_t _stream = 0); + +} //namespace edd +} //namespace effelsberg +} //namespace psrdada_cpp + +#endif // PSRDADA_CPP_EFFELSBERG_EDD_VLBI_CUH + + + diff --git a/psrdada_cpp/effelsberg/edd/src/VLBI.cu b/psrdada_cpp/effelsberg/edd/src/VLBI.cu new file mode 100644 index 00000000..a3ce4c41 --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/src/VLBI.cu @@ -0,0 +1,75 @@ +#include "psrdada_cpp/effelsberg/edd/VLBI.cuh" +#include "psrdada_cpp/cuda_utils.hpp" + +#define EDD_NTHREADS_PACK 1024 +#define NPACK 16 + +namespace psrdada_cpp { +namespace effelsberg { +namespace edd { +namespace kernels { + + + +__global__ +void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict__ out, size_t n, float minV, float maxV) +{ + + __shared__ uint32_t tmp_in[EDD_NTHREADS_PACK]; + //__shared__ uint32_t tmp_in[EDD_NTHREADS_PACK]; + //__shared__ volatile uint8_t tmp_out[EDD_NTHREADS_PACK / 4]; + + const float s = (maxV - minV) / 3.; + int odx = blockIdx.x * blockDim.x / NPACK + threadIdx.x; + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n ; idx += gridDim.x * blockDim.x) + { + const float delta = (in[idx] - minV); + tmp_in[threadIdx.x] = 0; + tmp_in[threadIdx.x] += (delta >= 1 * s); + tmp_in[threadIdx.x] += (delta >= 2 * s); + tmp_in[threadIdx.x] += (delta >= 3 * s); + __syncthreads(); + + // can be improved by distributing work on more threads in tree + // structure, but already at 60-70% memory utilization + if (threadIdx.x < EDD_NTHREADS_PACK / NPACK) + { + for (size_t i = 1; i < NPACK; i++) + { + tmp_in[threadIdx.x] |= tmp_in[threadIdx.x * NPACK + i] << i*2; + } + //out[odx] = tmp_out; + out[odx] = tmp_in[threadIdx.x]; + } + odx += gridDim.x * blockDim.x / NPACK; + + __syncthreads(); + } +} + +} //namespace kernels + + +void pack_2bit(thrust::device_vector<float> const& input, thrust::device_vector<uint32_t>& output, float minV, float maxV, cudaStream_t _stream) +{ + BOOST_LOG_TRIVIAL(debug) << "Packing 2-bit data"; + assert(input.size() % NPACK == 0); + output.resize(input.size() / NPACK); + BOOST_LOG_TRIVIAL(debug) << "INput size: " << input.size() << " elements"; + BOOST_LOG_TRIVIAL(debug) << "Resizing output buffer to " << output.size() << " elements"; + + size_t nblocks = std::min(input.size() / EDD_NTHREADS_PACK, 4096uL); + BOOST_LOG_TRIVIAL(debug) << " using " << nblocks << " blocks of " << EDD_NTHREADS_PACK << " threads"; + + float const* input_ptr = thrust::raw_pointer_cast(input.data()); + + uint32_t* output_ptr = thrust::raw_pointer_cast(output.data()); + + kernels::pack_edd_float32_to_2bit<<< nblocks, EDD_NTHREADS_PACK, 0, _stream>>>( + input_ptr, output_ptr, input.size(), minV, maxV); + CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream)); +} + +} //namespace edd +} //namespace effelsberg +} //namespace psrdada_cpp diff --git a/psrdada_cpp/effelsberg/edd/src/VLBI_prof.cu b/psrdada_cpp/effelsberg/edd/src/VLBI_prof.cu new file mode 100644 index 00000000..d9692f26 --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/src/VLBI_prof.cu @@ -0,0 +1,42 @@ +#include <thrust/device_vector.h> +#include <cuda_profiler_api.h> +#include <thrust/random.h> +#include <thrust/execution_policy.h> +#include "psrdada_cpp/effelsberg/edd/VLBI.cuh" + + +struct GenRand +{ + __device__ + float operator () (int idx) + { + thrust::default_random_engine randEng; + thrust::uniform_real_distribution<float> uniDist; + randEng.discard(idx); + return uniDist(randEng); + } +}; + + +int main() +{ + + const size_t n = 1024 * 1024 * 1024 / 4; + thrust::device_vector<float> input(n); + thrust::device_vector<uint32_t> output(n); + + thrust::fill(input.begin(), input.end(), .5); + thrust::fill(output.begin(), output.end(), 5); + + cudaDeviceSynchronize(); + psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 1, 0); + cudaDeviceSynchronize(); + std::cout << input[0] << std::endl; + std::cout << input[1] << std::endl; + std::cout << input[2] << std::endl; + std::cout << input[3] << std::endl; + std::cout << (int) output[0] << std::endl; + for (size_t i = 0; i<10; i++) + std::cout << i <<": " << output[i] << std::endl; + cudaProfilerStop(); +} diff --git a/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt b/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt index 6e915f49..2d06dc86 100644 --- a/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt +++ b/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt @@ -9,6 +9,7 @@ set( src/FftSpectrometerTester.cu src/UnpackerTester.cu src/ScaledTransposeTFtoTFTTester.cu + src/VLBITest.cu ) cuda_add_executable(gtest_edd ${gtest_edd_src} ) target_link_libraries(gtest_edd ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}) diff --git a/psrdada_cpp/effelsberg/edd/test/src/VLBITest.cu b/psrdada_cpp/effelsberg/edd/test/src/VLBITest.cu new file mode 100644 index 00000000..54dc7e32 --- /dev/null +++ b/psrdada_cpp/effelsberg/edd/test/src/VLBITest.cu @@ -0,0 +1,80 @@ +#include "gtest/gtest.h" + +#include "psrdada_cpp/cuda_utils.hpp" +#include <random> +#include "psrdada_cpp/effelsberg/edd/VLBI.cuh" +#include "thrust/extrema.h" + +TEST(VLBITest, 2_bit_pack_test) +{ + std::size_t n = 1024; + thrust::device_vector<float> input(n); + thrust::device_vector<uint32_t> output(n); + { + thrust::fill(input.begin(), input.end(), 0); + thrust::fill(output.begin(), output.end(), 5); + + psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3); + + EXPECT_EQ(output.size(), input.size() / 16); + thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax; + minmax = thrust::minmax_element(output.begin(), output.end()); + EXPECT_EQ(0, *minmax.first); + EXPECT_EQ(0, *minmax.second); + } + + { + thrust::fill(input.begin(), input.end(), 1); + thrust::fill(output.begin(), output.end(), 5); + + psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3); + thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax; + minmax = thrust::minmax_element(output.begin(), output.end()); + + EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.first); + EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.second); + } + + { + thrust::fill(input.begin(), input.end(), 2); + thrust::fill(output.begin(), output.end(), 5); + + psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3); + thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax; + minmax = thrust::minmax_element(output.begin(), output.end()); + + EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.first); + EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.second); + } + + { + thrust::fill(input.begin(), input.end(), 3); + thrust::fill(output.begin(), output.end(), 5); + + psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3); + thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax; + minmax = thrust::minmax_element(output.begin(), output.end()); + + EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first); + EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second); + } + + { + thrust::fill(input.begin(), input.end(), 4); + thrust::fill(output.begin(), output.end(), 5); + + psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3); + thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax; + minmax = thrust::minmax_element(output.begin(), output.end()); + + EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first); + EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second); + } + +} + +//int main(int argc, char **argv) { +// ::testing::InitGoogleTest(&argc, argv); +// +// return RUN_ALL_TESTS(); +//} -- GitLab