Commit 656eacb6 authored by Tobias Winchen's avatar Tobias Winchen
Browse files

Added prototype for 2bit pack

parent e9c608a6
......@@ -14,7 +14,7 @@ if(ENABLE_CUDA)
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Pass options to NVCC ( -ccbin /path --compiler-options -lfftw3f --compiler-options -lm --verbose)
list(APPEND CUDA_NVCC_FLAGS -DENABLE_CUDA --std c++11 -Wno-deprecated-gpu-targets)
list(APPEND CUDA_NVCC_FLAGS -DENABLE_CUDA --std c++11 -Wno-deprecated-gpu-targets --ptxas-options=-v)
list(APPEND CUDA_NVCC_FLAGS_DEBUG --debug; --device-debug; --generate-line-info -Xcompiler "-Werror")
#list(APPEND CUDA_NVCC_FLAGS -arch compute_35) # minumum compute level (Sps restriction)
string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
......
......@@ -9,6 +9,7 @@ set(psrdada_cpp_effelsberg_edd_src
src/Unpacker.cu
src/DetectorAccumulator.cu
src/ScaledTransposeTFtoTFT.cu
src/VLBI.cu
)
cuda_add_library(${CMAKE_PROJECT_NAME}_effelsberg_edd ${psrdada_cpp_effelsberg_edd_src})
......@@ -18,5 +19,9 @@ cuda_add_executable(fft_spectrometer src/fft_spectrometer_cli.cu)
target_link_libraries(fft_spectrometer ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES})
install(TARGETS fft_spectrometer DESTINATION bin)
cuda_add_executable(VLBI_prof src/VLBI_prof.cu)
target_link_libraries(VLBI_prof ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES})
install(TARGETS VLBI_prof DESTINATION bin)
add_subdirectory(test)
endif(ENABLE_CUDA)
#ifndef PSRDADA_CPP_EFFELSBERG_EDD_VLBI_CUH
#define PSRDADA_CPP_EFFELSBERG_EDD_VLBI_CUH
#include "psrdada_cpp/common.hpp"
#include <thrust/device_vector.h>
namespace psrdada_cpp {
namespace effelsberg {
namespace edd {
namespace kernels {
__global__
void pack_edd_float32_to_2bit(float* __restrict__ in, uint32_t * __restrict__ out, size_t n);
} //namespace kernels
void pack_2bit(thrust::device_vector<float> const& input, thrust::device_vector<uint32_t>& output, float minV, float maxV, cudaStream_t _stream = 0);
} //namespace edd
} //namespace effelsberg
} //namespace psrdada_cpp
#endif // PSRDADA_CPP_EFFELSBERG_EDD_VLBI_CUH
#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
#include "psrdada_cpp/cuda_utils.hpp"
#define EDD_NTHREADS_PACK 1024
#define NPACK 16
namespace psrdada_cpp {
namespace effelsberg {
namespace edd {
namespace kernels {
__global__
void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict__ out, size_t n, float minV, float maxV)
{
__shared__ uint32_t tmp_in[EDD_NTHREADS_PACK];
//__shared__ uint32_t tmp_in[EDD_NTHREADS_PACK];
//__shared__ volatile uint8_t tmp_out[EDD_NTHREADS_PACK / 4];
const float s = (maxV - minV) / 3.;
int odx = blockIdx.x * blockDim.x / NPACK + threadIdx.x;
for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n ; idx += gridDim.x * blockDim.x)
{
const float delta = (in[idx] - minV);
tmp_in[threadIdx.x] = 0;
tmp_in[threadIdx.x] += (delta >= 1 * s);
tmp_in[threadIdx.x] += (delta >= 2 * s);
tmp_in[threadIdx.x] += (delta >= 3 * s);
__syncthreads();
// can be improved by distributing work on more threads in tree
// structure, but already at 60-70% memory utilization
if (threadIdx.x < EDD_NTHREADS_PACK / NPACK)
{
for (size_t i = 1; i < NPACK; i++)
{
tmp_in[threadIdx.x] |= tmp_in[threadIdx.x * NPACK + i] << i*2;
}
//out[odx] = tmp_out;
out[odx] = tmp_in[threadIdx.x];
}
odx += gridDim.x * blockDim.x / NPACK;
__syncthreads();
}
}
} //namespace kernels
void pack_2bit(thrust::device_vector<float> const& input, thrust::device_vector<uint32_t>& output, float minV, float maxV, cudaStream_t _stream)
{
BOOST_LOG_TRIVIAL(debug) << "Packing 2-bit data";
assert(input.size() % NPACK == 0);
output.resize(input.size() / NPACK);
BOOST_LOG_TRIVIAL(debug) << "INput size: " << input.size() << " elements";
BOOST_LOG_TRIVIAL(debug) << "Resizing output buffer to " << output.size() << " elements";
size_t nblocks = std::min(input.size() / EDD_NTHREADS_PACK, 4096uL);
BOOST_LOG_TRIVIAL(debug) << " using " << nblocks << " blocks of " << EDD_NTHREADS_PACK << " threads";
float const* input_ptr = thrust::raw_pointer_cast(input.data());
uint32_t* output_ptr = thrust::raw_pointer_cast(output.data());
kernels::pack_edd_float32_to_2bit<<< nblocks, EDD_NTHREADS_PACK, 0, _stream>>>(
input_ptr, output_ptr, input.size(), minV, maxV);
CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream));
}
} //namespace edd
} //namespace effelsberg
} //namespace psrdada_cpp
#include <thrust/device_vector.h>
#include <cuda_profiler_api.h>
#include <thrust/random.h>
#include <thrust/execution_policy.h>
#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
struct GenRand
{
__device__
float operator () (int idx)
{
thrust::default_random_engine randEng;
thrust::uniform_real_distribution<float> uniDist;
randEng.discard(idx);
return uniDist(randEng);
}
};
int main()
{
const size_t n = 1024 * 1024 * 1024 / 4;
thrust::device_vector<float> input(n);
thrust::device_vector<uint32_t> output(n);
thrust::fill(input.begin(), input.end(), .5);
thrust::fill(output.begin(), output.end(), 5);
cudaDeviceSynchronize();
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 1, 0);
cudaDeviceSynchronize();
std::cout << input[0] << std::endl;
std::cout << input[1] << std::endl;
std::cout << input[2] << std::endl;
std::cout << input[3] << std::endl;
std::cout << (int) output[0] << std::endl;
for (size_t i = 0; i<10; i++)
std::cout << i <<": " << output[i] << std::endl;
cudaProfilerStop();
}
......@@ -9,6 +9,7 @@ set(
src/FftSpectrometerTester.cu
src/UnpackerTester.cu
src/ScaledTransposeTFtoTFTTester.cu
src/VLBITest.cu
)
cuda_add_executable(gtest_edd ${gtest_edd_src} )
target_link_libraries(gtest_edd ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES})
......
#include "gtest/gtest.h"
#include "psrdada_cpp/cuda_utils.hpp"
#include <random>
#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
#include "thrust/extrema.h"
TEST(VLBITest, 2_bit_pack_test)
{
std::size_t n = 1024;
thrust::device_vector<float> input(n);
thrust::device_vector<uint32_t> output(n);
{
thrust::fill(input.begin(), input.end(), 0);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
EXPECT_EQ(output.size(), input.size() / 16);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ(0, *minmax.first);
EXPECT_EQ(0, *minmax.second);
}
{
thrust::fill(input.begin(), input.end(), 1);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.first);
EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.second);
}
{
thrust::fill(input.begin(), input.end(), 2);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.first);
EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.second);
}
{
thrust::fill(input.begin(), input.end(), 3);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first);
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second);
}
{
thrust::fill(input.begin(), input.end(), 4);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first);
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second);
}
}
//int main(int argc, char **argv) {
// ::testing::InitGoogleTest(&argc, argv);
//
// return RUN_ALL_TESTS();
//}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment