Skip to content
Snippets Groups Projects
Commit 64d92803 authored by Tobias Winchen's avatar Tobias Winchen
Browse files

Merge branch 'devel' into 'devel'

Added 10 it unpacker on GPU

See merge request !4
parents b3619f8d f9ab54d5
Branches
No related tags found
1 merge request!4Added 10 it unpacker on GPU
Pipeline #92900 failed
......@@ -12,6 +12,9 @@ namespace kernels {
__global__
void unpack_edd_12bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n);
__global__
void unpack_edd_10bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n);
__global__
void unpack_edd_8bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n);
......@@ -33,7 +36,7 @@ public:
void unpack(const uint64_t* input, float* output, size_t size);
template <int Nbits>
void unpack(InputType const& input, OutputType& output)
void unpack(InputType const& input, OutputType& output)
{
InputType::value_type const* input_ptr = thrust::raw_pointer_cast(input.data());
OutputType::value_type* output_ptr = thrust::raw_pointer_cast(output.data());
......
......@@ -75,7 +75,7 @@ GatedSpectrometer<HandlerType, InputType, OutputType>::GatedSpectrometer(
{
// Sanity checks
assert(((nbits == 12) || (nbits == 8)));
assert(((nbits == 12) || (nbits == 8) || (nbits == 10)));
assert(_naccumulate > 0);
// check for any device errors
......
......@@ -160,7 +160,7 @@ GatedStokesSpectrometer<HandlerType>::GatedStokesSpectrometer(
_call_count(0), _nsamps_per_heap(4096), _processing_efficiency(0.){
// Sanity checks
assert(((_nbits == 12) || (_nbits == 8)));
assert(((_nbits == 12) || (_nbits == 8) || (_nbits == 10)));
assert(_naccumulate > 0);
// check for any device errors
......
......@@ -198,7 +198,7 @@ int main(int argc, char **argv) {
desc.add_options()("nbits,b", po::value<unsigned int>(&ip.nbits)->required(),
"The number of bits per sample in the "
"packetiser output (8 or 12)");
"packetiser output (8, 10 12)");
desc.add_options()("fft_length,n", po::value<size_t>(&ip.fft_length)->required(),
"The length of the FFT to perform on the data");
desc.add_options()("naccumulate,a",
......
......@@ -2,6 +2,7 @@
#include "psrdada_cpp/cuda_utils.hpp"
#define EDD_NTHREADS_UNPACK 512
#define EDD_NTHREADS_UNPACK10 128 // More than 128 threads are silently not launched??
namespace psrdada_cpp {
namespace effelsberg {
......@@ -21,6 +22,93 @@ __device__ __forceinline__ uint64_t swap64(uint64_t x)
return result;
}
__global__
void unpack_edd_10bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n)
{
/**
* Note: This kernels will not work with more than 256 threads.
*/
__shared__ volatile float tmp_out[EDD_NTHREADS_UNPACK10 * 32];
__shared__ volatile uint64_t tmp_in[EDD_NTHREADS_UNPACK10 * 5];
int block_idx = blockIdx.x;
uint64_t val;
uint64_t rest;
volatile float* sout = tmp_out + (32 * threadIdx.x);
for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
(5 * idx + 4) < n;
idx+=gridDim.x*blockDim.x)
{
//Read to shared memeory
int block_read_start = block_idx * EDD_NTHREADS_UNPACK10 * 5;
tmp_in[threadIdx.x] = in[block_read_start + threadIdx.x];
tmp_in[EDD_NTHREADS_UNPACK10 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 + threadIdx.x];
tmp_in[EDD_NTHREADS_UNPACK10 * 2 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 * 2 + threadIdx.x];
tmp_in[EDD_NTHREADS_UNPACK10 * 3 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 * 3 + threadIdx.x];
tmp_in[EDD_NTHREADS_UNPACK10 * 4 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 * 4 + threadIdx.x];
__syncthreads();
val = swap64(tmp_in[5*threadIdx.x]);
sout[0] = (float)((int64_t)(( 0xFFC0000000000000 & val) << 0) >> 54);
sout[1] = (float)((int64_t)(( 0x003FF00000000000 & val) << 10) >> 54);
sout[2] = (float)((int64_t)(( 0x00000FFC00000000 & val) << 20) >> 54);
sout[3] = (float)((int64_t)(( 0x00000003FF000000 & val) << 30) >> 54);
sout[4] = (float)((int64_t)(( 0x0000000000FFC000 & val) << 40) >> 54);
sout[5] = (float)((int64_t)(( 0x0000000000003FF0 & val) << 50) >> 54);
rest = ( 0x000000000000000F & val) << 60;
val = swap64(tmp_in[5*threadIdx.x + 1]);
sout[6] = (float)((int64_t)((( 0xFC00000000000000 & val) >> 4) | rest) >> 54);
sout[7] = (float)((int64_t)(( 0x03FF000000000000 & val) << 6) >> 54);
sout[8] = (float)((int64_t)(( 0x0000FFC000000000 & val) << 16) >> 54);
sout[9] = (float)((int64_t)(( 0x0000003FF0000000 & val) << 26) >> 54);
sout[10] = (float)((int64_t)(( 0x000000000FFC0000 & val) << 36) >> 54);
sout[11] = (float)((int64_t)(( 0x000000000003FF00 & val) << 46) >> 54);
rest = ( 0x00000000000000FF & val) << 56;
val = swap64(tmp_in[5*threadIdx.x + 2]);
sout[12] = (float)((int64_t)(((0xC000000000000000 & val) >> 8) | rest) >> 54);
sout[13] = (float)((int64_t)(( 0x3FF0000000000000 & val) << 2) >> 54);
sout[14] = (float)((int64_t)(( 0x000FFC0000000000 & val) << 12) >> 54);
sout[15] = (float)((int64_t)(( 0x000003FF00000000 & val) << 22) >> 54);
sout[16] = (float)((int64_t)(( 0x00000000FFC00000 & val) << 32) >> 54);
sout[17] = (float)((int64_t)(( 0x00000000003FF000 & val) << 42) >> 54);
sout[18] = (float)((int64_t)(( 0x0000000000000FFC & val) << 52) >> 54);
rest = ( 0x0000000000000003 & val) << 62;
val = swap64(tmp_in[5*threadIdx.x + 3]);
sout[19] = (float)((int64_t)(((0xFF00000000000000 & val) >> 2) | rest) >> 54);
sout[20] = (float)((int64_t)(( 0x00FFC00000000000 & val) << 8) >> 54);
sout[21] = (float)((int64_t)(( 0x00003FF000000000 & val) << 18) >> 54);
sout[22] = (float)((int64_t)(( 0x0000000FFC000000 & val) << 28) >> 54);
sout[23] = (float)((int64_t)(( 0x0000000003FF0000 & val) << 38) >> 54);
sout[24] = (float)((int64_t)(( 0x000000000000FFC0 & val) << 48) >> 54);
rest = ( 0x000000000000003F & val) << 58;
val = swap64(tmp_in[5*threadIdx.x + 4]);
sout[25] = (float)((int64_t)(((0xF000000000000000 & val) >> 6) | rest) >> 54);
sout[26] = (float)((int64_t)(( 0x0FFC000000000000 & val) << 4) >> 54);
sout[27] = (float)((int64_t)(( 0x0003FF0000000000 & val) << 14) >> 54);
sout[28] = (float)((int64_t)(( 0x000000FFC0000000 & val) << 24) >> 54);
sout[29] = (float)((int64_t)(( 0x000000003FF00000 & val) << 34) >> 54);
sout[30] = (float)((int64_t)(( 0x00000000000FFC00 & val) << 44) >> 54);
sout[31] = (float)((int64_t)(( 0x00000000000003FF & val) << 54) >> 54);
rest = 0;
__syncthreads();
size_t block_write_start = block_idx * EDD_NTHREADS_UNPACK10 * 32;
for (size_t ii = threadIdx.x; ii < 32 * EDD_NTHREADS_UNPACK10; ii += blockDim.x)
{
out[block_write_start + ii] = tmp_out[ii];
}
block_idx += gridDim.x;
__syncthreads();
}
}
__global__
void unpack_edd_12bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n)
{
......@@ -139,6 +227,17 @@ void Unpacker::unpack<8>(const uint64_t* input, float* output, size_t size)
input, output, size);
}
template <>
void Unpacker::unpack<10>(const uint64_t* input, float* output, size_t size)
{
BOOST_LOG_TRIVIAL(debug) << "Unpacking 10-bit data";
int nblocks = size / EDD_NTHREADS_UNPACK10;
kernels::unpack_edd_10bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK10, 0, _stream>>>(
input, output, size);
}
} //namespace edd
} //namespace effelsberg
} //namespace psrdada_cpp
......@@ -110,7 +110,7 @@ void UnpackerTester::unpacker_10_to_32_c_reference(
static_cast<int64_t>(( 0x0000000000FFC000 & val) << 40) >> 54));
output.push_back(static_cast<float>(
static_cast<int64_t>(( 0x0000000000003FF0 & val) << 50) >> 54));
rest = ( 0x000000000000000F & val) << 60; // 4 bits rest.
rest = ( 0x000000000000000F & val) << 60; // 4 bits rest.
val = be64toh(input[ii + 1]);
output.push_back(static_cast<float>(
static_cast<int64_t>(((0xFC00000000000000 & val) >> 4) | rest) >> 54));
......@@ -205,10 +205,11 @@ void UnpackerTester::compare_against_host(
OutputType const& host_output)
{
OutputType copy_from_gpu = gpu_output;
cudaDeviceSynchronize();
ASSERT_EQ(host_output.size(), copy_from_gpu.size());
for (std::size_t ii = 0; ii < host_output.size(); ++ii)
{
ASSERT_EQ(host_output[ii], copy_from_gpu[ii]);
EXPECT_EQ(host_output[ii], copy_from_gpu[ii]) << " ii = " << ii;
}
}
......@@ -240,7 +241,7 @@ TEST_F(UnpackerTester, 8_bit_unpack_test)
InputType host_input(n);
for (size_t ii = 0; ii < n; ++ii)
{
host_input[ii] = distribution(generator);
host_input[ii] = distribution(generator);
}
Unpacker::InputType gpu_input = host_input;
Unpacker::OutputType gpu_output;
......@@ -253,6 +254,29 @@ TEST_F(UnpackerTester, 8_bit_unpack_test)
}
TEST_F(UnpackerTester, 10_bit_unpack_test)
{
std::size_t n = 640;
std::default_random_engine generator;
std::uniform_int_distribution<int> distribution(1,1<<31);
InputType host_input(n);
for (size_t ii = 0; ii < n; ++ii)
{
host_input[ii] = distribution(generator);
}
Unpacker::InputType gpu_input = host_input;
Unpacker::OutputType gpu_output;
gpu_output.resize(host_input.size() * sizeof(host_input[0]) * 8 / 10);
OutputType host_output;
Unpacker unpacker(_stream);
unpacker.unpack<10>(gpu_input, gpu_output);
unpacker_10_to_32_c_reference(host_input, host_output);
compare_against_host(gpu_output, host_output);
}
} //namespace test
} //namespace edd
} //namespace meerkat
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment