Commit 733c24a0 authored by Tobias Winchen

Added a more general packer and restructured the tests

parent 8e3cc058
......@@ -20,20 +20,18 @@ cuda_add_executable(fft_spectrometer src/fft_spectrometer_cli.cu)
target_link_libraries(fft_spectrometer ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES})
install(TARGETS fft_spectrometer DESTINATION bin)
cuda_add_executable(VLBI_prof src/VLBI_prof.cu)
target_link_libraries(VLBI_prof ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES})
install(TARGETS VLBI_prof DESTINATION bin)
#Gated FFT spectrometer interface
cuda_add_executable(gated_spectrometer src/GatedSpectrometer_cli.cu)
target_link_libraries(gated_spectrometer ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} -lcublas)
install(TARGETS gated_spectrometer DESTINATION bin)
cuda_add_executable(VLBI src/VLBI_cli.cu)
target_link_libraries(VLBI ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} -lcublas)
install(TARGETS VLBI DESTINATION bin)
cuda_add_executable(dada_dummy_data src/dummy_data_generator.cu)
target_link_libraries(dada_dummy_data ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES})
install(TARGETS dada_dummy_data DESTINATION bin)
add_subdirectory(test)
endif(ENABLE_CUDA)
#ifndef PSRDADA_CPP_EFFELSBERG_EDD_PACKER_CUH
#define PSRDADA_CPP_EFFELSBERG_EDD_PACKER_CUH
#include "psrdada_cpp/common.hpp"
#include <thrust/device_vector.h>
#include "psrdada_cpp/cuda_utils.hpp"
namespace psrdada_cpp {
namespace effelsberg {
namespace edd {
namespace kernels {
// Pack floats into 2-, 4-, 8- or 16-bit integers
template <unsigned int input_bit_depth>
__global__ void packNbit(const float *__restrict__ input,
uint32_t *__restrict__ output, size_t inputSize,
float minV, float maxV) {
// number of values packed into one output element; 32-bit output words are
// used here to maximize the number of threads
const uint8_t NPACK = 32 / input_bit_depth;
const float l = (maxV - minV) / ((1 << input_bit_depth) - 1);
__shared__ uint32_t tmp[1024]; // one word per thread; matches the 1024-thread blocks launched in pack() below
for (uint32_t i = NPACK * blockIdx.x * blockDim.x + threadIdx.x;
(i < inputSize); i += blockDim.x * gridDim.x * NPACK) {
tmp[threadIdx.x] = 0;
#pragma unroll
for (uint8_t j = 0; j < NPACK; j++) {
// Load new input value, clip and convert to Nbit integer
const float inp = input[i + j * blockDim.x];
uint32_t p = 0;
#pragma unroll
for (int k = 1; k < (1 << input_bit_depth); k++) {
p += (inp > ((k * l) + minV));
} // summing the comparisons is more efficient than clamping with fmin/fmax and casting.
// store in shared memory with linear access
tmp[threadIdx.x] += p << (input_bit_depth * j);
}
__syncthreads();
// load the value from shared memory and rearrange it for output - the read
// value is reused per warp
uint32_t out = 0;
// bit mask: thread 0 always takes the first input_bit_depth bits, thread 1
// the second input_bit_depth bits, ...
const uint32_t mask = ((1 << input_bit_depth) - 1) << (input_bit_depth * (threadIdx.x % NPACK));
#pragma unroll
for (uint32_t j = 0; j < NPACK; j++) {
uint32_t v = tmp[(threadIdx.x / NPACK) * NPACK + j] & mask;
// retrieve correct bits
v = v >> (input_bit_depth * (threadIdx.x % NPACK));
v = v << (input_bit_depth * j);
out += v;
}
size_t oidx = threadIdx.x / NPACK + (threadIdx.x % NPACK) * (blockDim.x / NPACK) + (i - threadIdx.x) / NPACK;
output[oidx] = out;
__syncthreads();
}
}
} // namespace kernels
template <unsigned int input_bit_depth>
void pack(const thrust::device_vector<float> &input, thrust::device_vector<uint32_t> &output, float minV, float maxV, cudaStream_t &stream)
{
BOOST_LOG_TRIVIAL(debug) << "Packing data with bitdepth " << input_bit_depth << " in range " << minV << " - " << maxV;
const uint32_t NPACK = 32 / input_bit_depth;
assert(input.size() % NPACK == 0);
output.resize(input.size() / NPACK);
BOOST_LOG_TRIVIAL(debug) << "Input size: " << input.size() << " elements";
BOOST_LOG_TRIVIAL(debug) << "Resizing output buffer to " << output.size() << " elements";
kernels::packNbit<input_bit_depth><<<128, 1024, 0, stream>>>(thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),
input.size(), minV, maxV);
CUDA_ERROR_CHECK(cudaStreamSynchronize(stream));
}
} //namespace edd
} //namespace effelsberg
} //namespace psrdada_cpp
#endif // PSRDADA_CPP_EFFELSBERG_EDD_PACKER_CUH
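A minimal usage sketch of the new templated interface (the sizes and scaling values below are illustrative assumptions, not taken from the commit). For input_bit_depth = 2 the quantization step is l = (maxV - minV) / 3, a sample is encoded as the number of thresholds minV + l, minV + 2l, minV + 3l it exceeds, and 16 samples land in one 32-bit output word:

```cpp
#include <thrust/device_vector.h>
#include "psrdada_cpp/effelsberg/edd/Packer.cuh"

int main()
{
  cudaStream_t stream;
  CUDA_ERROR_CHECK(cudaStreamCreate(&stream));

  // 16384 samples = one full chunk of the 1024-thread kernel; all set to 0.5
  thrust::device_vector<float> input(16384, 0.5f);
  thrust::device_vector<uint32_t> output; // resized by pack<>

  // With minV = -2 and maxV = 2 the thresholds are -2/3, 2/3 and 2, so the
  // constant 0.5 exceeds only the first one and encodes as level 1 (0b01).
  psrdada_cpp::effelsberg::edd::pack<2>(input, output, -2.0f, 2.0f, stream);

  // output now holds 16384 / 16 = 1024 packed words, each equal to 0x55555555.
  CUDA_ERROR_CHECK(cudaStreamDestroy(stream));
  return 0;
}
```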
......@@ -14,17 +14,6 @@
namespace psrdada_cpp {
namespace effelsberg {
namespace edd {
namespace kernels {
__global__
void pack_edd_float32_to_2bit(const float* __restrict__ in, uint32_t * __restrict__ out, size_t n);
} //namespace kernels
void pack_2bit(thrust::device_vector<float> const& input, thrust::device_vector<uint8_t>& output, float minV, float maxV, cudaStream_t _stream = 0);
// Some helper functions to deal with the bit encoding of the header
......@@ -162,7 +151,7 @@ private:
thrust::device_vector<float> _unpacked_voltage;
// Output data
DoubleDeviceBuffer<uint8_t> _packed_voltage;
DoubleDeviceBuffer<uint32_t> _packed_voltage;
DoublePinnedHostBuffer<uint8_t> _outputBuffer;
VDIFHeader _vdifHeader;
......
#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
#include "psrdada_cpp/effelsberg/edd/Packer.cuh"
//#include "psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh"
#include "psrdada_cpp/cuda_utils.hpp"
......@@ -45,7 +46,7 @@ VLBI<HandlerType>::VLBI(std::size_t buffer_bytes, std::size_t input_bitDepth,
BOOST_LOG_TRIVIAL(debug) << " Input voltages size (in 64-bit words): "
<< _raw_voltage_db.size();
_packed_voltage.resize(n64bit_words * 64 / input_bitDepth / 4);
_packed_voltage.resize(n64bit_words * 64 / input_bitDepth / 16);
_spillOver.reserve(5000);
BOOST_LOG_TRIVIAL(debug) << " Output voltages size: "
......@@ -140,7 +141,7 @@ bool VLBI<HandlerType>::operator()(RawBytes &block) {
float minV = -2;
float maxV = 2;
pack_2bit(_unpacked_voltage, _packed_voltage.b(), minV, maxV, _proc_stream);
pack<2>(_unpacked_voltage, _packed_voltage.b(), minV, maxV, _proc_stream);
CUDA_ERROR_CHECK(cudaStreamSynchronize(_proc_stream));
......
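For reference, the change of the _packed_voltage resize from /4 to /16 above follows directly from the switch to 32-bit output words; a small sketch of the arithmetic, assuming the 2-bit output selected by pack<2> in operator():

```cpp
#include <cstddef>

// Not part of the commit: with 2-bit samples stored in 32-bit words, 16
// samples fit per output element, so the element count is a quarter of the
// old uint8_t buffer while the total byte count stays the same.
constexpr std::size_t packed_words_32bit(std::size_t n64bit_words,
                                         std::size_t input_bitDepth)
{
  // samples = n64bit_words * 64 / input_bitDepth; 16 two-bit samples per word
  return n64bit_words * 64 / input_bitDepth / 16;
}

static_assert(packed_words_32bit(4096, 2) == 8192,
              "4096 64-bit words of 2-bit input give 131072 samples, i.e. 8192 packed words");
```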
#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
#include "psrdada_cpp/cuda_utils.hpp"
#define EDD_NTHREADS_PACK 1024
......@@ -7,90 +8,6 @@
namespace psrdada_cpp {
namespace effelsberg {
namespace edd {
namespace kernels {
__global__
void pack_edd_float32_to_2bit(const float * __restrict__ in, uint32_t* __restrict__ out, size_t n, float minV, float maxV)
{
__shared__ uint32_t tmp_in[EDD_NTHREADS_PACK];
//__shared__ uint32_t tmp_in[EDD_NTHREADS_PACK];
//__shared__ volatile uint8_t tmp_out[EDD_NTHREADS_PACK / 4];
const float s = (maxV - minV) / 3.;
for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n ; idx += gridDim.x * blockDim.x)
{
const float delta = (in[idx] - minV);
tmp_in[threadIdx.x] = 0;
tmp_in[threadIdx.x] += (delta > 1 * s);
tmp_in[threadIdx.x] += (delta > 2 * s);
tmp_in[threadIdx.x] += (delta > 3 * s);
__syncthreads();
// could be improved by distributing the work over more threads in a tree
// structure, but this already reaches 60-70% memory utilization
if (threadIdx.x < EDD_NTHREADS_PACK / NPACK)
{
for (size_t i = 1; i < NPACK; i++)
{
tmp_in[threadIdx.x * NPACK] += (tmp_in[threadIdx.x * NPACK + i] << (i*2));
}
out[(idx - threadIdx.x) / NPACK + threadIdx.x] = tmp_in[threadIdx.x *NPACK];
}
__syncthreads();
}
}
//__global__ void pack_edd_float32_to_2bit(const float* __restrict__ input, uint32_t* __restrict__ output, size_t inputSize, float minV, float maxV)
//{
// float l = (maxV - minV) / 3;
// for (size_t i = blockIdx.x * blockDim.x + 16 * threadIdx.x; (i < inputSize);
// i += blockDim.x * gridDim.x * 16)
// {
// uint32_t out = 0;
// for (size_t j =0; j < 16; j++)
// {
// //out = out << 2;
//
// const float inp = input[i + j];
// const uint32_t tmp = (inp > minV + l) + (inp > minV + 2 * l) + (inp > minV + 3 * l);
// out += (tmp << (2 * j));
// //input[i + j] = i + j;
// }
//
// output[i / 16] = out;
// }
//}
} //namespace kernels
void pack_2bit(thrust::device_vector<float> const& input, thrust::device_vector<uint8_t>& output, float minV, float maxV, cudaStream_t _stream)
{
BOOST_LOG_TRIVIAL(debug) << "Packing 2-bit data";
assert(input.size() % NPACK == 0);
output.resize(input.size() / NPACK * 4);
BOOST_LOG_TRIVIAL(debug) << "Input size: " << input.size() << " elements";
BOOST_LOG_TRIVIAL(debug) << "Resizing output buffer to " << output.size() << " elements";
size_t nblocks = std::min(input.size() / EDD_NTHREADS_PACK, 4096uL);
BOOST_LOG_TRIVIAL(debug) << " using " << nblocks << " blocks of " << EDD_NTHREADS_PACK << " threads";
float const* input_ptr = thrust::raw_pointer_cast(input.data());
uint32_t* output_ptr = (uint32_t*) thrust::raw_pointer_cast(output.data());
kernels::pack_edd_float32_to_2bit<<< nblocks, EDD_NTHREADS_PACK, 0, _stream>>>(
input_ptr, output_ptr, input.size(), minV, maxV);
CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream));
}
// Create a bit mask with ones between firstBit and lastBit (inclusive) and
// zeros otherwise.
......
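The truncated hunk continues with the header bit-field helpers described by the comment above. As a rough illustration only, a mask helper along those lines might look like this (the actual helper's name and signature are not visible in the diff):

```cpp
#include <cstdint>

// Hypothetical sketch: ones from firstBit to lastBit (inclusive), zeros
// elsewhere; assumes 0 <= firstBit <= lastBit <= 63.
uint64_t bit_mask_sketch(unsigned int firstBit, unsigned int lastBit)
{
  // e.g. bit_mask_sketch(2, 4) == 0b11100
  return (~0ULL >> (63u - lastBit + firstBit)) << firstBit;
}
```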
#include <thrust/device_vector.h>
#include <cuda_profiler_api.h>
#include <thrust/random.h>
#include <thrust/execution_policy.h>
#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
struct GenRand
{
__device__
float operator () (int idx)
{
thrust::default_random_engine randEng;
thrust::uniform_real_distribution<float> uniDist;
randEng.discard(idx);
return uniDist(randEng);
}
};
int main()
{
const size_t n = 1024 * 1024 * 1024 / 4;
thrust::device_vector<float> input(n);
thrust::device_vector<uint8_t> output(n * 4);
thrust::fill(input.begin(), input.end(), .5);
thrust::fill(output.begin(), output.end(), 5);
cudaDeviceSynchronize();
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 1, 0);
cudaDeviceSynchronize();
std::cout << input[0] << std::endl;
std::cout << input[1] << std::endl;
std::cout << input[2] << std::endl;
std::cout << input[3] << std::endl;
std::cout << (int) output[0] << std::endl;
for (size_t i = 0; i<10; i++)
std::cout << i << ": " << (int) output[i] << std::endl;
cudaProfilerStop();
}
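The profiling binary above relies on an external profiler (cudaProfilerStop only marks the end of the region of interest). If stand-alone timings are wanted, the packing call could be bracketed with CUDA events; a sketch under that assumption, mirroring the call made in main():

```cpp
// Sketch only, not part of the commit: wall-clock timing of the packing call
// with CUDA events instead of nvprof/cudaProfiler.
float time_pack_ms(thrust::device_vector<float> const &input,
                   thrust::device_vector<uint8_t> &output)
{
  cudaEvent_t start, stop;
  CUDA_ERROR_CHECK(cudaEventCreate(&start));
  CUDA_ERROR_CHECK(cudaEventCreate(&stop));

  CUDA_ERROR_CHECK(cudaEventRecord(start));
  psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 1, 0);
  CUDA_ERROR_CHECK(cudaEventRecord(stop));
  CUDA_ERROR_CHECK(cudaEventSynchronize(stop));

  float ms = 0.0f;
  CUDA_ERROR_CHECK(cudaEventElapsedTime(&ms, start, stop));
  CUDA_ERROR_CHECK(cudaEventDestroy(start));
  CUDA_ERROR_CHECK(cudaEventDestroy(stop));
  return ms;
}
```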
......@@ -7,10 +7,11 @@ set(
src/ChanneliserTester.cu
src/DetectorAccumulatorTester.cu
src/FftSpectrometerTester.cu
src/GatedSpectrometerTest.cu
src/PackerTest.cu
src/UnpackerTester.cu
src/ScaledTransposeTFtoTFTTester.cu
src/VLBITest.cu
src/GatedSpectrometerTest.cu
)
cuda_add_executable(gtest_edd ${gtest_edd_src} )
target_link_libraries(gtest_edd ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} -lcublas)
......
......@@ -8,51 +8,6 @@
#include "psrdada_cpp/cuda_utils.hpp"
#include "thrust/extrema.h"
TEST(VLBITest, 2_bit_pack_test)
{
std::size_t n = 1024;
thrust::device_vector<float> input(n);
thrust::device_vector<uint8_t> output(n / 4);
{
float minV = -2;
float maxV = 2;
srand (time(NULL));
for (int i =0; i < input.size(); i++)
{
input[i] = ((float(rand()) / RAND_MAX) - 0.5) * 2.5 * (maxV-minV) + maxV + minV;
}
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, minV, maxV);
float step = (maxV - minV) / 3;
float L2 = minV + step;
float L3 = minV + 2 * step;
float L4 = minV + 3 * step;
const size_t nbp = 4; // 4 samples per output value
for(int i = 0; i < input.size() / nbp; i++)
{
uint64_t of = output[i];
for (size_t j =0; j< nbp; j++)
{
int a = ((of >> (j *2)) & 3);
int k = i * nbp + j;
if (input[k] >= L4)
EXPECT_EQ(a, 3);
else if (input[k] >= L3)
EXPECT_EQ(a, 2);
else if (input[k] >= L2)
EXPECT_EQ(a, 1);
else
EXPECT_EQ(a, 0);
}
}
}
}
using namespace psrdada_cpp::effelsberg::edd;
......
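The hand-written 2-bit test above is dropped in favour of the new PackerTest.cu registered in the test CMakeLists, whose contents are not part of this diff. A minimal sketch of what a gtest for the templated pack<> could look like (assumed, not the actual PackerTest.cu):

```cpp
#include <gtest/gtest.h>
#include <thrust/device_vector.h>
#include "psrdada_cpp/effelsberg/edd/Packer.cuh"

// Sketch only: a constant input of 0.5 with minV = -2, maxV = 2 exceeds only
// the first of the three 2-bit thresholds, so every packed word is 0x55555555.
TEST(PackerSketch, constant_input_packs_to_level_one)
{
  cudaStream_t stream;
  CUDA_ERROR_CHECK(cudaStreamCreate(&stream));

  thrust::device_vector<float> input(16384, 0.5f);
  thrust::device_vector<uint32_t> output;
  psrdada_cpp::effelsberg::edd::pack<2>(input, output, -2.0f, 2.0f, stream);
  CUDA_ERROR_CHECK(cudaStreamDestroy(stream));

  EXPECT_EQ(output.size(), 16384u / 16u);
  for (size_t i = 0; i < output.size(); ++i)
  {
    EXPECT_EQ(static_cast<uint32_t>(output[i]), 0x55555555u);
  }
}
```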