Commit 64d92803 authored by Tobias Winchen

Merge branch 'devel' into 'devel'

Added 10 bit unpacker on GPU

See merge request !4
parents b3619f8d f9ab54d5
Pipeline #92900 failed in 5 minutes and 36 seconds
@@ -12,6 +12,9 @@ namespace kernels {
__global__
void unpack_edd_12bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n);
__global__
void unpack_edd_10bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n);
__global__
void unpack_edd_8bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n);
@@ -33,7 +36,7 @@ public:
void unpack(const uint64_t* input, float* output, size_t size);
template <int Nbits>
void unpack(InputType const& input, OutputType& output)
{
InputType::value_type const* input_ptr = thrust::raw_pointer_cast(input.data());
OutputType::value_type* output_ptr = thrust::raw_pointer_cast(output.data());
...
@@ -75,7 +75,7 @@ GatedSpectrometer<HandlerType, InputType, OutputType>::GatedSpectrometer(
{
// Sanity checks
assert(((nbits == 12) || (nbits == 8) || (nbits == 10)));
assert(_naccumulate > 0);
// check for any device errors
...
@@ -160,7 +160,7 @@ GatedStokesSpectrometer<HandlerType>::GatedStokesSpectrometer(
_call_count(0), _nsamps_per_heap(4096), _processing_efficiency(0.){
// Sanity checks
assert(((_nbits == 12) || (_nbits == 8) || (_nbits == 10)));
assert(_naccumulate > 0);
// check for any device errors
...
@@ -198,7 +198,7 @@ int main(int argc, char **argv) {
desc.add_options()("nbits,b", po::value<unsigned int>(&ip.nbits)->required(),
"The number of bits per sample in the "
"packetiser output (8 or 12)"); "packetiser output (8, 10 12)");
desc.add_options()("fft_length,n", po::value<size_t>(&ip.fft_length)->required(), desc.add_options()("fft_length,n", po::value<size_t>(&ip.fft_length)->required(),
"The length of the FFT to perform on the data"); "The length of the FFT to perform on the data");
desc.add_options()("naccumulate,a", desc.add_options()("naccumulate,a",
......
@@ -2,6 +2,7 @@
#include "psrdada_cpp/cuda_utils.hpp"
#define EDD_NTHREADS_UNPACK 512
#define EDD_NTHREADS_UNPACK10 128 // More than 128 threads are silently not launched??
namespace psrdada_cpp {
namespace effelsberg {
@@ -21,6 +22,93 @@ __device__ __forceinline__ uint64_t swap64(uint64_t x)
return result;
}
__global__
void unpack_edd_10bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n)
{
/**
 * Note: This kernel will not work with more than 256 threads.
*/
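// Each thread consumes 5 big-endian 64-bit words (320 bits) and emits 32 signed
// 10-bit samples. A field is isolated with a mask, shifted left so it occupies
// bits 63..54, and arithmetically shifted right by 54 to sign-extend it before
// the conversion to float; samples that straddle a word boundary are reassembled
// via the `rest` carry.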
__shared__ volatile float tmp_out[EDD_NTHREADS_UNPACK10 * 32];
__shared__ volatile uint64_t tmp_in[EDD_NTHREADS_UNPACK10 * 5];
int block_idx = blockIdx.x;
uint64_t val;
uint64_t rest;
volatile float* sout = tmp_out + (32 * threadIdx.x);
for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
(5 * idx + 4) < n;
idx+=gridDim.x*blockDim.x)
{
// Read to shared memory
int block_read_start = block_idx * EDD_NTHREADS_UNPACK10 * 5;
tmp_in[threadIdx.x] = in[block_read_start + threadIdx.x];
tmp_in[EDD_NTHREADS_UNPACK10 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 + threadIdx.x];
tmp_in[EDD_NTHREADS_UNPACK10 * 2 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 * 2 + threadIdx.x];
tmp_in[EDD_NTHREADS_UNPACK10 * 3 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 * 3 + threadIdx.x];
tmp_in[EDD_NTHREADS_UNPACK10 * 4 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 * 4 + threadIdx.x];
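// Wait until the whole block has staged its 5 * EDD_NTHREADS_UNPACK10 input
// words before any thread starts unpacking them.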
__syncthreads();
val = swap64(tmp_in[5*threadIdx.x]);
sout[0] = (float)((int64_t)(( 0xFFC0000000000000 & val) << 0) >> 54);
sout[1] = (float)((int64_t)(( 0x003FF00000000000 & val) << 10) >> 54);
sout[2] = (float)((int64_t)(( 0x00000FFC00000000 & val) << 20) >> 54);
sout[3] = (float)((int64_t)(( 0x00000003FF000000 & val) << 30) >> 54);
sout[4] = (float)((int64_t)(( 0x0000000000FFC000 & val) << 40) >> 54);
sout[5] = (float)((int64_t)(( 0x0000000000003FF0 & val) << 50) >> 54);
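// The low 4 bits of this word are the high bits of the next sample; carry them
// over and combine them with the top 6 bits of the following word.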
rest = ( 0x000000000000000F & val) << 60;
val = swap64(tmp_in[5*threadIdx.x + 1]);
sout[6] = (float)((int64_t)((( 0xFC00000000000000 & val) >> 4) | rest) >> 54);
sout[7] = (float)((int64_t)(( 0x03FF000000000000 & val) << 6) >> 54);
sout[8] = (float)((int64_t)(( 0x0000FFC000000000 & val) << 16) >> 54);
sout[9] = (float)((int64_t)(( 0x0000003FF0000000 & val) << 26) >> 54);
sout[10] = (float)((int64_t)(( 0x000000000FFC0000 & val) << 36) >> 54);
sout[11] = (float)((int64_t)(( 0x000000000003FF00 & val) << 46) >> 54);
rest = ( 0x00000000000000FF & val) << 56;
val = swap64(tmp_in[5*threadIdx.x + 2]);
sout[12] = (float)((int64_t)(((0xC000000000000000 & val) >> 8) | rest) >> 54);
sout[13] = (float)((int64_t)(( 0x3FF0000000000000 & val) << 2) >> 54);
sout[14] = (float)((int64_t)(( 0x000FFC0000000000 & val) << 12) >> 54);
sout[15] = (float)((int64_t)(( 0x000003FF00000000 & val) << 22) >> 54);
sout[16] = (float)((int64_t)(( 0x00000000FFC00000 & val) << 32) >> 54);
sout[17] = (float)((int64_t)(( 0x00000000003FF000 & val) << 42) >> 54);
sout[18] = (float)((int64_t)(( 0x0000000000000FFC & val) << 52) >> 54);
rest = ( 0x0000000000000003 & val) << 62;
val = swap64(tmp_in[5*threadIdx.x + 3]);
sout[19] = (float)((int64_t)(((0xFF00000000000000 & val) >> 2) | rest) >> 54);
sout[20] = (float)((int64_t)(( 0x00FFC00000000000 & val) << 8) >> 54);
sout[21] = (float)((int64_t)(( 0x00003FF000000000 & val) << 18) >> 54);
sout[22] = (float)((int64_t)(( 0x0000000FFC000000 & val) << 28) >> 54);
sout[23] = (float)((int64_t)(( 0x0000000003FF0000 & val) << 38) >> 54);
sout[24] = (float)((int64_t)(( 0x000000000000FFC0 & val) << 48) >> 54);
rest = ( 0x000000000000003F & val) << 58;
val = swap64(tmp_in[5*threadIdx.x + 4]);
sout[25] = (float)((int64_t)(((0xF000000000000000 & val) >> 6) | rest) >> 54);
sout[26] = (float)((int64_t)(( 0x0FFC000000000000 & val) << 4) >> 54);
sout[27] = (float)((int64_t)(( 0x0003FF0000000000 & val) << 14) >> 54);
sout[28] = (float)((int64_t)(( 0x000000FFC0000000 & val) << 24) >> 54);
sout[29] = (float)((int64_t)(( 0x000000003FF00000 & val) << 34) >> 54);
sout[30] = (float)((int64_t)(( 0x00000000000FFC00 & val) << 44) >> 54);
sout[31] = (float)((int64_t)(( 0x00000000000003FF & val) << 54) >> 54);
rest = 0;
__syncthreads();
size_t block_write_start = block_idx * EDD_NTHREADS_UNPACK10 * 32;
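// Copy the unpacked samples from shared memory to global memory; consecutive
// threads write consecutive addresses, so the stores are coalesced.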
for (size_t ii = threadIdx.x; ii < 32 * EDD_NTHREADS_UNPACK10; ii += blockDim.x)
{
out[block_write_start + ii] = tmp_out[ii];
}
block_idx += gridDim.x;
__syncthreads();
}
}
__global__
void unpack_edd_12bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n)
{
@@ -139,6 +227,17 @@ void Unpacker::unpack<8>(const uint64_t* input, float* output, size_t size)
input, output, size);
}
template <>
void Unpacker::unpack<10>(const uint64_t* input, float* output, size_t size)
{
BOOST_LOG_TRIVIAL(debug) << "Unpacking 10-bit data";
int nblocks = size / EDD_NTHREADS_UNPACK10;
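// Note: each thread consumes 5 input words, so this launches more blocks than
// strictly needed; the bounds check in the kernel lets the surplus blocks
// return without doing any work.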
kernels::unpack_edd_10bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK10, 0, _stream>>>(
input, output, size);
}
} //namespace edd
} //namespace effelsberg
} //namespace psrdada_cpp
@@ -110,7 +110,7 @@ void UnpackerTester::unpacker_10_to_32_c_reference(
static_cast<int64_t>(( 0x0000000000FFC000 & val) << 40) >> 54));
output.push_back(static_cast<float>(
static_cast<int64_t>(( 0x0000000000003FF0 & val) << 50) >> 54));
rest = ( 0x000000000000000F & val) << 60; // 4 bits rest.
val = be64toh(input[ii + 1]);
output.push_back(static_cast<float>(
static_cast<int64_t>(((0xFC00000000000000 & val) >> 4) | rest) >> 54));
@@ -205,10 +205,11 @@ void UnpackerTester::compare_against_host(
OutputType const& host_output)
{
OutputType copy_from_gpu = gpu_output;
cudaDeviceSynchronize();
ASSERT_EQ(host_output.size(), copy_from_gpu.size());
for (std::size_t ii = 0; ii < host_output.size(); ++ii)
{
EXPECT_EQ(host_output[ii], copy_from_gpu[ii]) << " ii = " << ii;
}
}
@@ -240,7 +241,7 @@ TEST_F(UnpackerTester, 8_bit_unpack_test)
InputType host_input(n);
for (size_t ii = 0; ii < n; ++ii)
{
host_input[ii] = distribution(generator);
}
Unpacker::InputType gpu_input = host_input;
Unpacker::OutputType gpu_output;
@@ -253,6 +254,29 @@ TEST_F(UnpackerTester, 8_bit_unpack_test)
}
TEST_F(UnpackerTester, 10_bit_unpack_test)
{
std::size_t n = 640;
std::default_random_engine generator;
std::uniform_int_distribution<uint64_t> distribution(1, 1ULL << 31);
InputType host_input(n);
for (size_t ii = 0; ii < n; ++ii)
{
host_input[ii] = distribution(generator);
}
Unpacker::InputType gpu_input = host_input;
Unpacker::OutputType gpu_output;
gpu_output.resize(host_input.size() * sizeof(host_input[0]) * 8 / 10);
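// 640 input words of 64 bits hold 640 * 64 / 10 = 4096 10-bit samples.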
OutputType host_output;
Unpacker unpacker(_stream);
unpacker.unpack<10>(gpu_input, gpu_output);
unpacker_10_to_32_c_reference(host_input, host_output);
compare_against_host(gpu_output, host_output);
}
} //namespace test
} //namespace edd
} //namespace meerkat
...