Skip to content
Snippets Groups Projects
Commit 656eacb6 authored by Tobias Winchen's avatar Tobias Winchen
Browse files

Added prototype for 2bit pack

parent e9c608a6
No related branches found
No related tags found
No related merge requests found
...@@ -14,7 +14,7 @@ if(ENABLE_CUDA) ...@@ -14,7 +14,7 @@ if(ENABLE_CUDA)
set(CUDA_PROPAGATE_HOST_FLAGS OFF) set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Pass options to NVCC ( -ccbin /path --compiler-options -lfftw3f --compiler-options -lm --verbose) # Pass options to NVCC ( -ccbin /path --compiler-options -lfftw3f --compiler-options -lm --verbose)
list(APPEND CUDA_NVCC_FLAGS -DENABLE_CUDA --std c++11 -Wno-deprecated-gpu-targets) list(APPEND CUDA_NVCC_FLAGS -DENABLE_CUDA --std c++11 -Wno-deprecated-gpu-targets --ptxas-options=-v)
list(APPEND CUDA_NVCC_FLAGS_DEBUG --debug; --device-debug; --generate-line-info -Xcompiler "-Werror") list(APPEND CUDA_NVCC_FLAGS_DEBUG --debug; --device-debug; --generate-line-info -Xcompiler "-Werror")
#list(APPEND CUDA_NVCC_FLAGS -arch compute_35) # minumum compute level (Sps restriction) #list(APPEND CUDA_NVCC_FLAGS -arch compute_35) # minumum compute level (Sps restriction)
string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
......
...@@ -9,6 +9,7 @@ set(psrdada_cpp_effelsberg_edd_src ...@@ -9,6 +9,7 @@ set(psrdada_cpp_effelsberg_edd_src
src/Unpacker.cu src/Unpacker.cu
src/DetectorAccumulator.cu src/DetectorAccumulator.cu
src/ScaledTransposeTFtoTFT.cu src/ScaledTransposeTFtoTFT.cu
src/VLBI.cu
) )
cuda_add_library(${CMAKE_PROJECT_NAME}_effelsberg_edd ${psrdada_cpp_effelsberg_edd_src}) cuda_add_library(${CMAKE_PROJECT_NAME}_effelsberg_edd ${psrdada_cpp_effelsberg_edd_src})
...@@ -18,5 +19,9 @@ cuda_add_executable(fft_spectrometer src/fft_spectrometer_cli.cu) ...@@ -18,5 +19,9 @@ cuda_add_executable(fft_spectrometer src/fft_spectrometer_cli.cu)
target_link_libraries(fft_spectrometer ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}) target_link_libraries(fft_spectrometer ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES})
install(TARGETS fft_spectrometer DESTINATION bin) install(TARGETS fft_spectrometer DESTINATION bin)
cuda_add_executable(VLBI_prof src/VLBI_prof.cu)
target_link_libraries(VLBI_prof ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES})
install(TARGETS VLBI_prof DESTINATION bin)
add_subdirectory(test) add_subdirectory(test)
endif(ENABLE_CUDA) endif(ENABLE_CUDA)
#ifndef PSRDADA_CPP_EFFELSBERG_EDD_VLBI_CUH
#define PSRDADA_CPP_EFFELSBERG_EDD_VLBI_CUH
#include "psrdada_cpp/common.hpp"
#include <thrust/device_vector.h>
namespace psrdada_cpp {
namespace effelsberg {
namespace edd {
namespace kernels {
__global__
void pack_edd_float32_to_2bit(float* __restrict__ in, uint32_t * __restrict__ out, size_t n);
} //namespace kernels
void pack_2bit(thrust::device_vector<float> const& input, thrust::device_vector<uint32_t>& output, float minV, float maxV, cudaStream_t _stream = 0);
} //namespace edd
} //namespace effelsberg
} //namespace psrdada_cpp
#endif // PSRDADA_CPP_EFFELSBERG_EDD_VLBI_CUH
#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
#include "psrdada_cpp/cuda_utils.hpp"
#define EDD_NTHREADS_PACK 1024
#define NPACK 16
namespace psrdada_cpp {
namespace effelsberg {
namespace edd {
namespace kernels {
__global__
void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict__ out, size_t n, float minV, float maxV)
{
__shared__ uint32_t tmp_in[EDD_NTHREADS_PACK];
//__shared__ uint32_t tmp_in[EDD_NTHREADS_PACK];
//__shared__ volatile uint8_t tmp_out[EDD_NTHREADS_PACK / 4];
const float s = (maxV - minV) / 3.;
int odx = blockIdx.x * blockDim.x / NPACK + threadIdx.x;
for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n ; idx += gridDim.x * blockDim.x)
{
const float delta = (in[idx] - minV);
tmp_in[threadIdx.x] = 0;
tmp_in[threadIdx.x] += (delta >= 1 * s);
tmp_in[threadIdx.x] += (delta >= 2 * s);
tmp_in[threadIdx.x] += (delta >= 3 * s);
__syncthreads();
// can be improved by distributing work on more threads in tree
// structure, but already at 60-70% memory utilization
if (threadIdx.x < EDD_NTHREADS_PACK / NPACK)
{
for (size_t i = 1; i < NPACK; i++)
{
tmp_in[threadIdx.x] |= tmp_in[threadIdx.x * NPACK + i] << i*2;
}
//out[odx] = tmp_out;
out[odx] = tmp_in[threadIdx.x];
}
odx += gridDim.x * blockDim.x / NPACK;
__syncthreads();
}
}
} //namespace kernels
void pack_2bit(thrust::device_vector<float> const& input, thrust::device_vector<uint32_t>& output, float minV, float maxV, cudaStream_t _stream)
{
BOOST_LOG_TRIVIAL(debug) << "Packing 2-bit data";
assert(input.size() % NPACK == 0);
output.resize(input.size() / NPACK);
BOOST_LOG_TRIVIAL(debug) << "INput size: " << input.size() << " elements";
BOOST_LOG_TRIVIAL(debug) << "Resizing output buffer to " << output.size() << " elements";
size_t nblocks = std::min(input.size() / EDD_NTHREADS_PACK, 4096uL);
BOOST_LOG_TRIVIAL(debug) << " using " << nblocks << " blocks of " << EDD_NTHREADS_PACK << " threads";
float const* input_ptr = thrust::raw_pointer_cast(input.data());
uint32_t* output_ptr = thrust::raw_pointer_cast(output.data());
kernels::pack_edd_float32_to_2bit<<< nblocks, EDD_NTHREADS_PACK, 0, _stream>>>(
input_ptr, output_ptr, input.size(), minV, maxV);
CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream));
}
} //namespace edd
} //namespace effelsberg
} //namespace psrdada_cpp
#include <thrust/device_vector.h>
#include <cuda_profiler_api.h>
#include <thrust/random.h>
#include <thrust/execution_policy.h>
#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
struct GenRand
{
__device__
float operator () (int idx)
{
thrust::default_random_engine randEng;
thrust::uniform_real_distribution<float> uniDist;
randEng.discard(idx);
return uniDist(randEng);
}
};
int main()
{
const size_t n = 1024 * 1024 * 1024 / 4;
thrust::device_vector<float> input(n);
thrust::device_vector<uint32_t> output(n);
thrust::fill(input.begin(), input.end(), .5);
thrust::fill(output.begin(), output.end(), 5);
cudaDeviceSynchronize();
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 1, 0);
cudaDeviceSynchronize();
std::cout << input[0] << std::endl;
std::cout << input[1] << std::endl;
std::cout << input[2] << std::endl;
std::cout << input[3] << std::endl;
std::cout << (int) output[0] << std::endl;
for (size_t i = 0; i<10; i++)
std::cout << i <<": " << output[i] << std::endl;
cudaProfilerStop();
}
...@@ -9,6 +9,7 @@ set( ...@@ -9,6 +9,7 @@ set(
src/FftSpectrometerTester.cu src/FftSpectrometerTester.cu
src/UnpackerTester.cu src/UnpackerTester.cu
src/ScaledTransposeTFtoTFTTester.cu src/ScaledTransposeTFtoTFTTester.cu
src/VLBITest.cu
) )
cuda_add_executable(gtest_edd ${gtest_edd_src} ) cuda_add_executable(gtest_edd ${gtest_edd_src} )
target_link_libraries(gtest_edd ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}) target_link_libraries(gtest_edd ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES})
......
#include "gtest/gtest.h"
#include "psrdada_cpp/cuda_utils.hpp"
#include <random>
#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
#include "thrust/extrema.h"
TEST(VLBITest, 2_bit_pack_test)
{
std::size_t n = 1024;
thrust::device_vector<float> input(n);
thrust::device_vector<uint32_t> output(n);
{
thrust::fill(input.begin(), input.end(), 0);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
EXPECT_EQ(output.size(), input.size() / 16);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ(0, *minmax.first);
EXPECT_EQ(0, *minmax.second);
}
{
thrust::fill(input.begin(), input.end(), 1);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.first);
EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.second);
}
{
thrust::fill(input.begin(), input.end(), 2);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.first);
EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.second);
}
{
thrust::fill(input.begin(), input.end(), 3);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first);
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second);
}
{
thrust::fill(input.begin(), input.end(), 4);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first);
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second);
}
}
//int main(int argc, char **argv) {
// ::testing::InitGoogleTest(&argc, argv);
//
// return RUN_ALL_TESTS();
//}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment