diff --git a/psrdada_cpp/effelsberg/edd/CMakeLists.txt b/psrdada_cpp/effelsberg/edd/CMakeLists.txt index 681dde81d220c288b0dfa8a89742f6d4ee633004..210d1d673d002b84763c56649124ee59bfe23343 100644 --- a/psrdada_cpp/effelsberg/edd/CMakeLists.txt +++ b/psrdada_cpp/effelsberg/edd/CMakeLists.txt @@ -8,6 +8,7 @@ set(PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES set(psrdada_cpp_effelsberg_edd_src src/Unpacker.cu src/DetectorAccumulator.cu + src/ScaledTransposeTFtoTFT.cu ) cuda_add_library(${CMAKE_PROJECT_NAME}_effelsberg_edd ${psrdada_cpp_effelsberg_edd_src}) diff --git a/psrdada_cpp/effelsberg/edd/src/ScaledTransposeTFtoTFT.cu b/psrdada_cpp/effelsberg/edd/src/ScaledTransposeTFtoTFT.cu index f2193df4194c14821794ba7fc923beb526db10e0..effbcc2b28b9d60f01b06047ba1b0e29054757d3 100644 --- a/psrdada_cpp/effelsberg/edd/src/ScaledTransposeTFtoTFT.cu +++ b/psrdada_cpp/effelsberg/edd/src/ScaledTransposeTFtoTFT.cu @@ -17,7 +17,7 @@ void tf_to_tft_transpose( const float scale, const float offset) { - extern __shared__ char2* temp; //nbytes = sizeof(char2) * nsamps_per_load * nchans; + extern __shared__ char2 temp[]; //nbytes = sizeof(char2) * nsamps_per_load * nchans; const int load_offset = nsamps_per_packet * blockIdx.x * nchans; for (int sub_samp_load_idx = 0; sub_samp_load_idx < nsamps_per_packet/nsamps_per_load; @@ -41,17 +41,16 @@ void tf_to_tft_transpose( } } __syncthreads(); - int store_offset = load_offset + nsamps_per_load * sub_samp_load_idx; for (int chan_store_idx = threadIdx.y; chan_store_idx < nchans; chan_store_idx += blockDim.y) { for (int samp_store_idx = threadIdx.x; samp_store_idx < nsamps_per_load; - samp_store_idx += blockDix.x) + samp_store_idx += blockIdx.x) { int store_idx = (load_offset + chan_store_idx * nsamps_per_packet - + samps_per_load * sub_samp_load_idx + samp_store_idx); + + nsamps_per_load * sub_samp_load_idx + samp_store_idx); output[store_idx] = temp[samp_store_idx * nsamps_per_load + chan_store_idx]; } } @@ -88,7 +87,7 @@ void ScaledTransposeTFtoTFT::transpose(InputType const& input, OutputType& outpu assert(input.size() % (_nchans * _nsamps_per_packet) == 0 /* Input is not a multiple of _nchans * _nsamps_per_packet*/); output.resize(input.size()); const int nsamps_per_load = 16; - assert(_nsamps_per_packet % nsamps_per_load) == 0; + assert((_nsamps_per_packet % nsamps_per_load) == 0); const int nsamps = input.size() / _nchans; int shared_mem_bytes = sizeof(OutputType::value_type) * _nchans * nsamps_per_load; int nblocks = nsamps / _nsamps_per_packet; diff --git a/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt b/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt index e16d105afc30b3957d92def77de33d8b86b75d91..541c4c6e6efd7836008c4a48c55c003162719add 100644 --- a/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt +++ b/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt @@ -7,7 +7,7 @@ set( src/DetectorAccumulatorTester.cu src/FftSpectrometerTester.cu src/UnpackerTester.cu - src/ScaledTransposeTFtoTFT.cu + src/ScaledTransposeTFtoTFTTester.cu ) cuda_add_executable(gtest_edd ${gtest_edd_src} ) target_link_libraries(gtest_edd ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}) diff --git a/psrdada_cpp/effelsberg/edd/test/src/ScaledTransposeTFtoTFTTester.cu b/psrdada_cpp/effelsberg/edd/test/src/ScaledTransposeTFtoTFTTester.cu index e75b229148be739cd1c0954cf4143013a616319c..ceb46ed46b5d432447af1e6308e71ab471e69eb2 100644 --- a/psrdada_cpp/effelsberg/edd/test/src/ScaledTransposeTFtoTFTTester.cu +++ b/psrdada_cpp/effelsberg/edd/test/src/ScaledTransposeTFtoTFTTester.cu @@ -37,11 +37,11 @@ void ScaledTransposeTFtoTFTTester::transpose_c_reference( const int nsamps, const int nsamps_per_packet, const float scale, - const float offset); + const float offset) { int nsamples = input.size() / nchans; int outer_t_dim = nsamps / nsamps_per_packet; - output.size(input.size()); + output.resize(input.size()); for (int outer_t_idx = 0; outer_t_idx < outer_t_dim; ++outer_t_idx) { for (int chan_idx = 0; chan_idx < nchans; ++chan_idx) @@ -68,7 +68,8 @@ void ScaledTransposeTFtoTFTTester::compare_against_host( ASSERT_EQ(host_output.size(), copy_from_gpu.size()); for (std::size_t ii = 0; ii < host_output.size(); ++ii) { - ASSERT_EQ(host_output[ii], copy_from_gpu[ii]); + ASSERT_EQ(host_output[ii].x, copy_from_gpu[ii].x); + ASSERT_EQ(host_output[ii].y, copy_from_gpu[ii].y); } } @@ -78,7 +79,7 @@ TEST_F(ScaledTransposeTFtoTFTTester, counter_test) int nsamps_per_packet = 8192/nchans; float stdev = 64.0f; float scale = 4.0f; - int nsamps = nsamps_per_packet * 1024 + int nsamps = nsamps_per_packet * 1024; int n = nchans * nsamps; std::default_random_engine generator; std::normal_distribution<float> distribution(0.0, stdev); @@ -91,9 +92,9 @@ TEST_F(ScaledTransposeTFtoTFTTester, counter_test) ScaledTransposeTFtoTFT::InputType gpu_input = host_input; ScaledTransposeTFtoTFT::OutputType gpu_output; OutputType host_output; - ScaledTransposeTFtoTFT transposer(nchans, nsamps_per_packet, nscale, 0.0, _stream); + ScaledTransposeTFtoTFT transposer(nchans, nsamps_per_packet, scale, 0.0, _stream); transposer.transpose(gpu_input, gpu_output); - detect_c_reference(host_input, host_output, nchans, nsamps, nsamps_per_packet, scale, 0.0); + transpose_c_reference(host_input, host_output, nchans, nsamps, nsamps_per_packet, scale, 0.0); compare_against_host(gpu_output, host_output); }