Added prototype for 2bit pack

656eacb6 · Tobias Winchen · e9c608a6 · 656eacb6 · 656eacb6 · 656eacb6
Commit 656eacb6 authored 6 years ago by Tobias Winchen
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -14,7 +14,7 @@ if(ENABLE_CUDA)
  set(CUDA_PROPAGATE_HOST_FLAGS OFF)
  # Pass options to NVCC ( -ccbin /path  --compiler-options -lfftw3f --compiler-options -lm --verbose)
-  list(APPEND CUDA_NVCC_FLAGS -DENABLE_CUDA --std c++11 -Wno-deprecated-gpu-targets)
+  list(APPEND CUDA_NVCC_FLAGS -DENABLE_CUDA --std c++11 -Wno-deprecated-gpu-targets --ptxas-options=-v)
  list(APPEND CUDA_NVCC_FLAGS_DEBUG --debug; --device-debug; --generate-line-info -Xcompiler "-Werror")
  #list(APPEND CUDA_NVCC_FLAGS -arch compute_35) # minumum compute level (Sps restriction)
  string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)

--- a/psrdada_cpp/effelsberg/edd/CMakeLists.txt
+++ b/psrdada_cpp/effelsberg/edd/CMakeLists.txt
@@ -9,6 +9,7 @@ set(psrdada_cpp_effelsberg_edd_src
    src/Unpacker.cu
    src/DetectorAccumulator.cu
    src/ScaledTransposeTFtoTFT.cu
+    src/VLBI.cu
    )
 cuda_add_library(${CMAKE_PROJECT_NAME}_effelsberg_edd ${psrdada_cpp_effelsberg_edd_src})
@@ -18,5 +19,9 @@ cuda_add_executable(fft_spectrometer src/fft_spectrometer_cli.cu)
 target_link_libraries(fft_spectrometer ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES})
 install(TARGETS fft_spectrometer DESTINATION bin)
+cuda_add_executable(VLBI_prof src/VLBI_prof.cu)
+target_link_libraries(VLBI_prof ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES})
+install(TARGETS VLBI_prof DESTINATION bin)
 add_subdirectory(test)
 endif(ENABLE_CUDA)
--- a/psrdada_cpp/effelsberg/edd/VLBI.cuh
+++ b/psrdada_cpp/effelsberg/edd/VLBI.cuh
+#ifndef PSRDADA_CPP_EFFELSBERG_EDD_VLBI_CUH
+#define PSRDADA_CPP_EFFELSBERG_EDD_VLBI_CUH
+#include "psrdada_cpp/common.hpp"
+#include <thrust/device_vector.h>
+namespace psrdada_cpp {
+namespace effelsberg {
+namespace edd {
+namespace kernels {
+__global__
+void pack_edd_float32_to_2bit(float* __restrict__ in, uint32_t * __restrict__ out,  size_t n);
+} //namespace kernels
+void pack_2bit(thrust::device_vector<float> const& input, thrust::device_vector<uint32_t>& output, float minV, float maxV, cudaStream_t _stream = 0);
+} //namespace edd
+} //namespace effelsberg
+} //namespace psrdada_cpp
+#endif // PSRDADA_CPP_EFFELSBERG_EDD_VLBI_CUH
--- a/psrdada_cpp/effelsberg/edd/src/VLBI.cu
+++ b/psrdada_cpp/effelsberg/edd/src/VLBI.cu
+#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
+#include "psrdada_cpp/cuda_utils.hpp"
+#define EDD_NTHREADS_PACK 1024 
+#define NPACK 16
+namespace psrdada_cpp {
+namespace effelsberg {
+namespace edd {
+namespace kernels {
+__global__
+void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict__ out, size_t n, float minV, float maxV)
+{
+    __shared__ uint32_t tmp_in[EDD_NTHREADS_PACK];
+    //__shared__ uint32_t tmp_in[EDD_NTHREADS_PACK];
+    //__shared__ volatile uint8_t tmp_out[EDD_NTHREADS_PACK / 4];
+    const float s = (maxV - minV) / 3.;
+    int odx = blockIdx.x * blockDim.x / NPACK + threadIdx.x;
+    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n ; idx += gridDim.x * blockDim.x)
+    {
+        const float delta = (in[idx] - minV);
+        tmp_in[threadIdx.x] = 0;
+        tmp_in[threadIdx.x] += (delta >= 1 * s);
+        tmp_in[threadIdx.x] += (delta >= 2 * s);
+        tmp_in[threadIdx.x] += (delta >= 3 * s);
+        __syncthreads();
+        // can be improved by distributing work on more threads in tree
+        // structure, but already at 60-70% memory utilization  
+        if (threadIdx.x < EDD_NTHREADS_PACK / NPACK)
+        {
+          for (size_t i = 1; i < NPACK; i++)
+          {
+            tmp_in[threadIdx.x] |= tmp_in[threadIdx.x * NPACK + i] << i*2;
+          }
+          //out[odx] = tmp_out;
+          out[odx] = tmp_in[threadIdx.x];
+        }
+        odx += gridDim.x * blockDim.x / NPACK;
+        __syncthreads();
+    }
+}
+} //namespace kernels
+void pack_2bit(thrust::device_vector<float> const& input, thrust::device_vector<uint32_t>& output, float minV, float maxV, cudaStream_t _stream)
+{
+    BOOST_LOG_TRIVIAL(debug) << "Packing 2-bit data";
+    assert(input.size() % NPACK == 0);
+    output.resize(input.size() / NPACK);
+    BOOST_LOG_TRIVIAL(debug) << "INput size: " << input.size() << " elements";
+    BOOST_LOG_TRIVIAL(debug) << "Resizing output buffer to " << output.size() << " elements";
+    size_t nblocks = std::min(input.size() / EDD_NTHREADS_PACK, 4096uL);
+    BOOST_LOG_TRIVIAL(debug) << "  using " << nblocks << " blocks of " << EDD_NTHREADS_PACK << " threads";
+    float const* input_ptr = thrust::raw_pointer_cast(input.data());
+    uint32_t* output_ptr = thrust::raw_pointer_cast(output.data());
+    kernels::pack_edd_float32_to_2bit<<< nblocks, EDD_NTHREADS_PACK, 0, _stream>>>(
+            input_ptr, output_ptr, input.size(), minV, maxV);
+    CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream));
+}
+} //namespace edd
+} //namespace effelsberg
+} //namespace psrdada_cpp
--- a/psrdada_cpp/effelsberg/edd/src/VLBI_prof.cu
+++ b/psrdada_cpp/effelsberg/edd/src/VLBI_prof.cu
+#include <thrust/device_vector.h>
+#include <cuda_profiler_api.h>
+#include <thrust/random.h>
+#include <thrust/execution_policy.h>
+#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
+struct GenRand
+{
+    __device__
+    float operator () (int idx)
+    {
+        thrust::default_random_engine randEng;
+        thrust::uniform_real_distribution<float> uniDist;
+        randEng.discard(idx);
+        return uniDist(randEng);
+    }
+};
+int main()
+{
+  const size_t n = 1024 * 1024 * 1024 / 4;
+    thrust::device_vector<float>  input(n);
+    thrust::device_vector<uint32_t>  output(n);
+    thrust::fill(input.begin(), input.end(), .5);
+    thrust::fill(output.begin(), output.end(), 5);
+    cudaDeviceSynchronize();
+    psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 1, 0);
+    cudaDeviceSynchronize();
+    std::cout << input[0] << std::endl; 
+    std::cout << input[1] << std::endl; 
+    std::cout << input[2] << std::endl; 
+    std::cout << input[3] << std::endl; 
+    std::cout << (int) output[0] << std::endl; 
+    for (size_t i = 0; i<10; i++)
+      std::cout << i <<": " << output[i] << std::endl; 
+    cudaProfilerStop();
+}
--- a/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt
+++ b/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt
@@ -9,6 +9,7 @@ set(
    src/FftSpectrometerTester.cu
    src/UnpackerTester.cu
    src/ScaledTransposeTFtoTFTTester.cu
+    src/VLBITest.cu
 )
 cuda_add_executable(gtest_edd ${gtest_edd_src} )
 target_link_libraries(gtest_edd ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES})

--- a/psrdada_cpp/effelsberg/edd/test/src/VLBITest.cu
+++ b/psrdada_cpp/effelsberg/edd/test/src/VLBITest.cu
+#include "gtest/gtest.h"
+#include "psrdada_cpp/cuda_utils.hpp"
+#include <random>
+#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
+#include "thrust/extrema.h"
+TEST(VLBITest, 2_bit_pack_test)
+{
+    std::size_t n = 1024;
+    thrust::device_vector<float>  input(n);
+    thrust::device_vector<uint32_t>  output(n);
+    {
+      thrust::fill(input.begin(), input.end(), 0);
+      thrust::fill(output.begin(), output.end(), 5);
+      psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
+      EXPECT_EQ(output.size(), input.size() / 16);
+      thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
+      minmax = thrust::minmax_element(output.begin(), output.end());
+      EXPECT_EQ(0, *minmax.first);
+      EXPECT_EQ(0, *minmax.second);
+    }
+    {
+      thrust::fill(input.begin(), input.end(), 1);
+      thrust::fill(output.begin(), output.end(), 5);
+      psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
+      thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
+      minmax = thrust::minmax_element(output.begin(), output.end());
+      EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.first);
+      EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.second);
+    }
+    {
+      thrust::fill(input.begin(), input.end(), 2);
+      thrust::fill(output.begin(), output.end(), 5);
+      psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
+      thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
+      minmax = thrust::minmax_element(output.begin(), output.end());
+      EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.first);
+      EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.second);
+    }
+    {
+      thrust::fill(input.begin(), input.end(), 3);
+      thrust::fill(output.begin(), output.end(), 5);
+      psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
+      thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
+      minmax = thrust::minmax_element(output.begin(), output.end());
+      EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first);
+      EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second);
+    }
+    {
+      thrust::fill(input.begin(), input.end(), 4);
+      thrust::fill(output.begin(), output.end(), 5);
+      psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
+      thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
+      minmax = thrust::minmax_element(output.begin(), output.end());
+      EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first);
+      EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second);
+    }
+}
+//int main(int argc, char **argv) {
+//  ::testing::InitGoogleTest(&argc, argv);
+//
+//  return RUN_ALL_TESTS();
+//}