diff --git a/psrdada_cpp/common/src/unpacker.cu b/psrdada_cpp/common/src/unpacker.cu index ca2aa40102ee67508aec8e0221969c1a103b9b6f..9a84387adf22b7fdb4578ed4e7943850ce5fb199 100644 --- a/psrdada_cpp/common/src/unpacker.cu +++ b/psrdada_cpp/common/src/unpacker.cu @@ -103,8 +103,48 @@ void unpack_edd_10bit_to_float32(uint64_t const* __restrict__ in, float* __restr } - - +__global__ +void unpack_edd_12bit_to_float32_non512(ulong3 const* __restrict__ in, float* __restrict__ out, int n) +{ + /** + * Note: This kernel has significantly worse performance than the other 12-bit unpacking kernel + but has the benefit that it works for arbitrary input sizes + */ + float sout[16]; + uint64_t val; + uint64_t rest; + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += gridDim.x * blockDim.x) + { + ulong3 value = in[idx]; + val = swap64(value.x); + sout[0] = (float)((int64_t)(( 0xFFF0000000000000 & val) << 0) >> 52); + sout[1] = (float)((int64_t)(( 0x000FFF0000000000 & val) << 12) >> 52); + sout[2] = (float)((int64_t)(( 0x000000FFF0000000 & val) << 24) >> 52); + sout[3] = (float)((int64_t)(( 0x000000000FFF0000 & val) << 36) >> 52); + sout[4] = (float)((int64_t)(( 0x000000000000FFF0 & val) << 48) >> 52); + rest = ( 0x000000000000000F & val) << 60; + val = swap64(value.y); + sout[5] = (float)((int64_t)((( 0xFF00000000000000 & val) >> 4) | rest) >> 52); + sout[6] = (float)((int64_t)(( 0x00FFF00000000000 & val) << 8) >> 52); + sout[7] = (float)((int64_t)(( 0x00000FFF00000000 & val) << 20) >> 52); + sout[8] = (float)((int64_t)(( 0x00000000FFF00000 & val) << 32) >> 52); + sout[9] = (float)((int64_t)(( 0x00000000000FFF00 & val) << 44) >> 52); + rest = ( 0x00000000000000FF & val) << 56; + val = swap64(value.z); + sout[10] = (float)((int64_t)((( 0xF000000000000000 & val) >> 8) | rest) >> 52); + sout[11] = (float)((int64_t)(( 0x0FFF000000000000 & val) << 4) >> 52); + sout[12] = (float)((int64_t)(( 0x0000FFF000000000 & val) << 16) >> 52); + sout[13] = (float)((int64_t)(( 0x0000000FFF000000 & val) << 28) >> 52); + sout[14] = (float)((int64_t)(( 0x0000000000FFF000 & val) << 40) >> 52); + sout[15] = (float)((int64_t)(( 0x0000000000000FFF & val) << 52) >> 52); + + const std::size_t output_idx = idx * 16; + for (int ii = 0; ii < 16; ++ii) + { + out[output_idx + ii] = sout[ii]; + } + } +} __global__ void unpack_edd_12bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n) @@ -112,25 +152,26 @@ void unpack_edd_12bit_to_float32(uint64_t const* __restrict__ in, float* __restr /** * Note: This kernels will not work with more than 512 threads. */ - __shared__ volatile float tmp_out[EDD_NTHREADS_UNPACK * 16]; - __shared__ volatile uint64_t tmp_in[EDD_NTHREADS_UNPACK * 3]; + __shared__ volatile float tmp_out[EDD_NTHREADS_UNPACK * 16]; // 8192 floats + __shared__ volatile uint64_t tmp_in[EDD_NTHREADS_UNPACK * 3]; // 1536 uint64_t int block_idx = blockIdx.x; uint64_t val; uint64_t rest; volatile float* sout = tmp_out + (16 * threadIdx.x); // We search for the next 512 multiple of size n to utilize all threads in the outer // loop. If we set (3 * idx + 2) < n, the kernel only works for n % 512 = 0 && n % 3 = 0 - int next_multiple = ((n >> 9) + 1) << 9; + int next_multiple = ((n >> 9) + 1) << 9; // rounds up to next 512 for (int idx = blockIdx.x * blockDim.x + threadIdx.x; (3 * idx + 2) < next_multiple; idx+=gridDim.x*blockDim.x) { + if(3 * idx < n) { //Read to shared memeory int block_read_start = block_idx * EDD_NTHREADS_UNPACK * 3; - tmp_in[threadIdx.x] = in[block_read_start + threadIdx.x]; - tmp_in[EDD_NTHREADS_UNPACK + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK + threadIdx.x]; + tmp_in[EDD_NTHREADS_UNPACK * 0 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK * 0 + threadIdx.x]; + tmp_in[EDD_NTHREADS_UNPACK * 1 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK * 1 + threadIdx.x]; tmp_in[EDD_NTHREADS_UNPACK * 2 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK * 2 + threadIdx.x]; } __syncthreads(); @@ -225,18 +266,28 @@ Unpacker::~Unpacker() template <> void Unpacker::unpack<12>(const uint64_t* input, float* output, size_t size) { - assert(size % 3 == 0); + + if (size % 3 != 0) { + throw std::invalid_argument("Input size must be a multiple of 3: size = " + std::to_string(size)); + } BOOST_LOG_TRIVIAL(debug) << " - Unpacking 12-bit data"; - int nblocks = size / EDD_NTHREADS_UNPACK; - kernels::unpack_edd_12bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK, 0, _stream>>>( - input, output, size); + if (size % EDD_NTHREADS_UNPACK == 0) { + int nblocks = size / EDD_NTHREADS_UNPACK; + kernels::unpack_edd_12bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK, 0, _stream>>>( + input, output, size); + } else { + int nthreads = 1024; + int nblocks = ((size/3) % nthreads == 0) ? (size/3) / nthreads : (size/3) / nthreads + 1; + kernels::unpack_edd_12bit_to_float32_non512<<< nblocks, nthreads, 0, _stream>>>( + reinterpret_cast<const ulong3*>(input), output, size/3); + } } template <> void Unpacker::unpack<8>(const uint64_t* input, float* output, size_t size) { BOOST_LOG_TRIVIAL(debug) << " - Unpacking 8-bit data"; - int nblocks = size / EDD_NTHREADS_UNPACK; + int nblocks = (size % EDD_NTHREADS_UNPACK == 0) ? size / EDD_NTHREADS_UNPACK : size / EDD_NTHREADS_UNPACK + 1; kernels::unpack_edd_8bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK, 0, _stream>>>( input, output, size); } @@ -244,9 +295,11 @@ void Unpacker::unpack<8>(const uint64_t* input, float* output, size_t size) template <> void Unpacker::unpack<10>(const uint64_t* input, float* output, size_t size) { - assert(size % 5 == 0); + if (size % 5 != 0) { + throw std::invalid_argument("Input size must be a multiple of 5: size = " + std::to_string(size)); + } BOOST_LOG_TRIVIAL(debug) << " - Unpacking 10-bit data"; - int nblocks = size / EDD_NTHREADS_UNPACK10; + int nblocks = (size % EDD_NTHREADS_UNPACK10 == 0) ? size / EDD_NTHREADS_UNPACK10 : size / EDD_NTHREADS_UNPACK10 + 1; kernels::unpack_edd_10bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK10, 0, _stream>>>( input, output, size); } diff --git a/psrdada_cpp/common/test/src/unpacker_tester.cu b/psrdada_cpp/common/test/src/unpacker_tester.cu index 72d07e401ac33b5689d1b9affb9129c4dc4a9b1e..8c58db09a18916228512a8afd07672e2d3069b7e 100644 --- a/psrdada_cpp/common/test/src/unpacker_tester.cu +++ b/psrdada_cpp/common/test/src/unpacker_tester.cu @@ -226,6 +226,7 @@ TEST_F(UnpackerTester, test_8bit_unpacking_with_type_casting) Unpacker unpacker(_stream); unpacker.unpack<8>(idata, odata); + CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream)); Unpacker::OutputType h_odata = odata; compare_against_host(h_odata, reference); } @@ -245,6 +246,7 @@ TEST_F(UnpackerTester, test_8bit_unpacking_with_type_casting_non_512_multiple) Unpacker unpacker(_stream); unpacker.unpack<8>(idata, odata); + CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream)); Unpacker::OutputType h_odata = odata; compare_against_host(h_odata, reference); } @@ -259,13 +261,14 @@ TEST_F(UnpackerTester, 12_bit_unpack_test) OutputType host_output; Unpacker unpacker(_stream); unpacker.unpack<12>(gpu_input, gpu_output); + CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream)); unpacker_12_to_32_c_reference(host_input, host_output); compare_against_host(gpu_output, host_output); } TEST_F(UnpackerTester, 12_bit_unpack_test_non_512_multiple) { - std::size_t n = 1017*3; + std::size_t n = 1017 * 3; InputType host_input = testing_tools::random_vector<uint64_t>(1, 1<<31, n); Unpacker::InputType gpu_input = host_input; Unpacker::OutputType gpu_output; @@ -273,6 +276,7 @@ TEST_F(UnpackerTester, 12_bit_unpack_test_non_512_multiple) OutputType host_output; Unpacker unpacker(_stream); unpacker.unpack<12>(gpu_input, gpu_output); + CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream)); unpacker_12_to_32_c_reference(host_input, host_output); compare_against_host(gpu_output, host_output); } @@ -296,6 +300,7 @@ TEST_F(UnpackerTester, 8_bit_unpack_test) OutputType host_output; Unpacker unpacker(_stream); unpacker.unpack<8>(gpu_input, gpu_output); + CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream)); unpacker_8_to_32_c_reference(host_input, host_output); compare_against_host(gpu_output, host_output); } @@ -310,6 +315,7 @@ TEST_F(UnpackerTester, 8_bit_unpack_test_non_512_multiple) OutputType host_output; Unpacker unpacker(_stream); unpacker.unpack<8>(gpu_input, gpu_output); + CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream)); unpacker_8_to_32_c_reference(host_input, host_output); compare_against_host(gpu_output, host_output); } @@ -333,6 +339,7 @@ TEST_F(UnpackerTester, 10_bit_unpack_test) OutputType host_output; Unpacker unpacker(_stream); unpacker.unpack<10>(gpu_input, gpu_output); + CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream)); unpacker_10_to_32_c_reference(host_input, host_output); compare_against_host(gpu_output, host_output); }