Merge branch 'devel' into 'devel'

Added 10 it unpacker on GPU See merge request !4

Merge branch 'devel' into 'devel'
64d92803 · Tobias Winchen · b3619f8d · f9ab54d5 · 64d92803 · 64d92803
Commit 64d92803 authored 4 years ago by Tobias Winchen
--- a/psrdada_cpp/effelsberg/edd/Unpacker.cuh
+++ b/psrdada_cpp/effelsberg/edd/Unpacker.cuh
@@ -12,6 +12,9 @@ namespace kernels {
 __global__
 void unpack_edd_12bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n);

+__global__
+void unpack_edd_10bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n);
+
 __global__
 void unpack_edd_8bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n);

@@ -33,7 +36,7 @@ public:
    void unpack(const uint64_t*  input, float* output, size_t size);

    template <int Nbits>
-    void unpack(InputType const& input, OutputType& output) 
+    void unpack(InputType const& input, OutputType& output)
    {
      InputType::value_type const* input_ptr = thrust::raw_pointer_cast(input.data());
      OutputType::value_type* output_ptr = thrust::raw_pointer_cast(output.data());

--- a/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
+++ b/psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
@@ -75,7 +75,7 @@ GatedSpectrometer<HandlerType, InputType, OutputType>::GatedSpectrometer(
 {

  // Sanity checks
-  assert(((nbits == 12) || (nbits == 8)));
+  assert(((nbits == 12) || (nbits == 8) || (nbits == 10)));
  assert(_naccumulate > 0);

  // check for any device errors

--- a/psrdada_cpp/effelsberg/edd/detail/GatedStokesSpectrometer.cu
+++ b/psrdada_cpp/effelsberg/edd/detail/GatedStokesSpectrometer.cu
@@ -160,7 +160,7 @@ GatedStokesSpectrometer<HandlerType>::GatedStokesSpectrometer(
      _call_count(0), _nsamps_per_heap(4096), _processing_efficiency(0.){

  // Sanity checks
-  assert(((_nbits == 12) || (_nbits == 8)));
+  assert(((_nbits == 12) || (_nbits == 8) || (_nbits == 10)));
  assert(_naccumulate > 0);

  // check for any device errors

--- a/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer_cli.cu
+++ b/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer_cli.cu
@@ -198,7 +198,7 @@ int main(int argc, char **argv) {

    desc.add_options()("nbits,b", po::value<unsigned int>(&ip.nbits)->required(),
                       "The number of bits per sample in the "
-                       "packetiser output (8 or 12)");
+                       "packetiser output (8, 10 12)");
    desc.add_options()("fft_length,n", po::value<size_t>(&ip.fft_length)->required(),
                       "The length of the FFT to perform on the data");
    desc.add_options()("naccumulate,a",

--- a/psrdada_cpp/effelsberg/edd/src/Unpacker.cu
+++ b/psrdada_cpp/effelsberg/edd/src/Unpacker.cu
@@ -2,6 +2,7 @@
 #include "psrdada_cpp/cuda_utils.hpp"

 #define EDD_NTHREADS_UNPACK 512
+#define EDD_NTHREADS_UNPACK10 128   // More than 128 threads are silently not launched??

 namespace psrdada_cpp {
 namespace effelsberg {
@@ -21,6 +22,93 @@ __device__ __forceinline__ uint64_t swap64(uint64_t x)
    return result;
 }

+__global__
+void unpack_edd_10bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n)
+{
+    /**
+     * Note: This kernels will not work with more than 256 threads.
+     */
+    __shared__ volatile float tmp_out[EDD_NTHREADS_UNPACK10 * 32];
+    __shared__ volatile uint64_t tmp_in[EDD_NTHREADS_UNPACK10 * 5];
+    int block_idx = blockIdx.x;
+    uint64_t val;
+    uint64_t rest;
+    volatile float* sout = tmp_out + (32 * threadIdx.x);
+    for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
+        (5 * idx + 4) < n;
+        idx+=gridDim.x*blockDim.x)
+    {
+        //Read to shared memeory
+        int block_read_start = block_idx * EDD_NTHREADS_UNPACK10 * 5;
+        tmp_in[threadIdx.x]                             = in[block_read_start + threadIdx.x];
+        tmp_in[EDD_NTHREADS_UNPACK10 + threadIdx.x]     = in[block_read_start + EDD_NTHREADS_UNPACK10 + threadIdx.x];
+        tmp_in[EDD_NTHREADS_UNPACK10 * 2 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 * 2 + threadIdx.x];
+        tmp_in[EDD_NTHREADS_UNPACK10 * 3 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 * 3 + threadIdx.x];
+        tmp_in[EDD_NTHREADS_UNPACK10 * 4 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 * 4 + threadIdx.x];
+
+        __syncthreads();
+        val  = swap64(tmp_in[5*threadIdx.x]);
+        sout[0] = (float)((int64_t)(( 0xFFC0000000000000 & val) <<  0) >> 54);
+        sout[1] = (float)((int64_t)(( 0x003FF00000000000 & val) << 10) >> 54);
+        sout[2] = (float)((int64_t)(( 0x00000FFC00000000 & val) << 20) >> 54);
+        sout[3] = (float)((int64_t)(( 0x00000003FF000000 & val) << 30) >> 54);
+        sout[4] = (float)((int64_t)(( 0x0000000000FFC000 & val) << 40) >> 54);
+        sout[5] = (float)((int64_t)(( 0x0000000000003FF0 & val) << 50) >> 54);
+        rest    =                   ( 0x000000000000000F & val) << 60;
+
+        val  = swap64(tmp_in[5*threadIdx.x + 1]);
+        sout[6] = (float)((int64_t)((( 0xFC00000000000000 & val) >> 4) | rest) >> 54);
+        sout[7] = (float)((int64_t)((  0x03FF000000000000 & val) <<  6) >> 54);
+        sout[8] = (float)((int64_t)((  0x0000FFC000000000 & val) << 16) >> 54);
+        sout[9] = (float)((int64_t)((  0x0000003FF0000000 & val) << 26) >> 54);
+        sout[10] = (float)((int64_t)(( 0x000000000FFC0000 & val) << 36) >> 54);
+        sout[11] = (float)((int64_t)(( 0x000000000003FF00 & val) << 46) >> 54);
+        rest    =                    ( 0x00000000000000FF & val) << 56;
+
+        val  = swap64(tmp_in[5*threadIdx.x + 2]);
+        sout[12] = (float)((int64_t)(((0xC000000000000000 & val) >> 8) | rest) >> 54);
+        sout[13] = (float)((int64_t)(( 0x3FF0000000000000 & val) <<  2) >> 54);
+        sout[14] = (float)((int64_t)(( 0x000FFC0000000000 & val) << 12) >> 54);
+        sout[15] = (float)((int64_t)(( 0x000003FF00000000 & val) << 22) >> 54);
+        sout[16] = (float)((int64_t)(( 0x00000000FFC00000 & val) << 32) >> 54);
+        sout[17] = (float)((int64_t)(( 0x00000000003FF000 & val) << 42) >> 54);
+        sout[18] = (float)((int64_t)(( 0x0000000000000FFC & val) << 52) >> 54);
+        rest    =                    ( 0x0000000000000003 & val) << 62;
+
+        val  = swap64(tmp_in[5*threadIdx.x + 3]);
+        sout[19] = (float)((int64_t)(((0xFF00000000000000 & val) >> 2) | rest) >> 54);
+        sout[20] = (float)((int64_t)(( 0x00FFC00000000000 & val) <<  8) >> 54);
+        sout[21] = (float)((int64_t)(( 0x00003FF000000000 & val) << 18) >> 54);
+        sout[22] = (float)((int64_t)(( 0x0000000FFC000000 & val) << 28) >> 54);
+        sout[23] = (float)((int64_t)(( 0x0000000003FF0000 & val) << 38) >> 54);
+        sout[24] = (float)((int64_t)(( 0x000000000000FFC0 & val) << 48) >> 54);
+        rest    =                    ( 0x000000000000003F & val) << 58;
+
+        val  = swap64(tmp_in[5*threadIdx.x + 4]);
+        sout[25] = (float)((int64_t)(((0xF000000000000000 & val) >> 6) | rest) >> 54);
+        sout[26] = (float)((int64_t)(( 0x0FFC000000000000 & val) <<  4) >> 54);
+        sout[27] = (float)((int64_t)(( 0x0003FF0000000000 & val) << 14) >> 54);
+        sout[28] = (float)((int64_t)(( 0x000000FFC0000000 & val) << 24) >> 54);
+        sout[29] = (float)((int64_t)(( 0x000000003FF00000 & val) << 34) >> 54);
+        sout[30] = (float)((int64_t)(( 0x00000000000FFC00 & val) << 44) >> 54);
+        sout[31] = (float)((int64_t)(( 0x00000000000003FF & val) << 54) >> 54);
+        rest = 0;
+
+        __syncthreads();
+        size_t block_write_start = block_idx * EDD_NTHREADS_UNPACK10 * 32;
+        for (size_t ii = threadIdx.x; ii < 32 * EDD_NTHREADS_UNPACK10; ii += blockDim.x)
+        {
+            out[block_write_start + ii] = tmp_out[ii];
+        }
+        block_idx += gridDim.x;
+        __syncthreads();
+    }
+}
+
+
+
+
+
 __global__
 void unpack_edd_12bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n)
 {
@@ -139,6 +227,17 @@ void Unpacker::unpack<8>(const uint64_t* input, float* output, size_t size)
            input, output, size);
 }

+
+template <>
+void Unpacker::unpack<10>(const uint64_t* input, float* output, size_t size)
+{
+    BOOST_LOG_TRIVIAL(debug) << "Unpacking 10-bit data";
+    int nblocks = size / EDD_NTHREADS_UNPACK10;
+    kernels::unpack_edd_10bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK10, 0, _stream>>>(
+            input, output, size);
+}
+
+
 } //namespace edd
 } //namespace effelsberg
 } //namespace psrdada_cpp
--- a/psrdada_cpp/effelsberg/edd/test/src/UnpackerTester.cu
+++ b/psrdada_cpp/effelsberg/edd/test/src/UnpackerTester.cu
@@ -110,7 +110,7 @@ void UnpackerTester::unpacker_10_to_32_c_reference(
            static_cast<int64_t>(( 0x0000000000FFC000 & val) << 40) >> 54));
        output.push_back(static_cast<float>(
            static_cast<int64_t>(( 0x0000000000003FF0 & val) << 50) >> 54));
-        rest =          ( 0x000000000000000F & val) << 60; // 4 bits rest.
+        rest =                   ( 0x000000000000000F & val) << 60; // 4 bits rest.
        val = be64toh(input[ii + 1]);
        output.push_back(static_cast<float>(
            static_cast<int64_t>(((0xFC00000000000000 & val) >> 4) | rest) >> 54));
@@ -205,10 +205,11 @@ void UnpackerTester::compare_against_host(
    OutputType const& host_output)
 {
    OutputType copy_from_gpu = gpu_output;
+    cudaDeviceSynchronize();
    ASSERT_EQ(host_output.size(), copy_from_gpu.size());
    for (std::size_t ii = 0; ii < host_output.size(); ++ii)
    {
-	ASSERT_EQ(host_output[ii], copy_from_gpu[ii]);
+	    EXPECT_EQ(host_output[ii], copy_from_gpu[ii]) << " ii = " << ii;
    }
 }

@@ -240,7 +241,7 @@ TEST_F(UnpackerTester, 8_bit_unpack_test)
    InputType host_input(n);
    for (size_t ii = 0; ii < n; ++ii)
    {
-        host_input[ii] = distribution(generator);
+        host_input[ii]  = distribution(generator);
    }
    Unpacker::InputType gpu_input = host_input;
    Unpacker::OutputType gpu_output;
@@ -253,6 +254,29 @@ TEST_F(UnpackerTester, 8_bit_unpack_test)
 }


+TEST_F(UnpackerTester, 10_bit_unpack_test)
+{
+    std::size_t n = 640;
+    std::default_random_engine generator;
+    std::uniform_int_distribution<int> distribution(1,1<<31);
+    InputType host_input(n);
+    for (size_t ii = 0; ii < n; ++ii)
+    {
+        host_input[ii] = distribution(generator);
+    }
+    Unpacker::InputType gpu_input = host_input;
+    Unpacker::OutputType gpu_output;
+    gpu_output.resize(host_input.size() * sizeof(host_input[0]) * 8 / 10);
+    OutputType host_output;
+    Unpacker unpacker(_stream);
+    unpacker.unpack<10>(gpu_input, gpu_output);
+    unpacker_10_to_32_c_reference(host_input, host_output);
+    compare_against_host(gpu_output, host_output);
+}
+
+
+
+
 } //namespace test
 } //namespace edd
 } //namespace meerkat