diff --git a/psrdada_cpp/effelsberg/edd/src/VLBI.cu b/psrdada_cpp/effelsberg/edd/src/VLBI.cu index a3ce4c4149fa9e97ca4db6185c38e73579cd6275..a08a7300bf2230404f6f801c3f154ef215ab468b 100644 --- a/psrdada_cpp/effelsberg/edd/src/VLBI.cu +++ b/psrdada_cpp/effelsberg/edd/src/VLBI.cu @@ -12,7 +12,7 @@ namespace kernels { __global__ -void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict__ out, size_t n, float minV, float maxV) +void pack_edd_float32_to_2bit(const float * __restrict__ in, uint32_t* __restrict__ out, size_t n, float minV, float maxV) { __shared__ uint32_t tmp_in[EDD_NTHREADS_PACK]; @@ -20,14 +20,13 @@ void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict //__shared__ volatile uint8_t tmp_out[EDD_NTHREADS_PACK / 4]; const float s = (maxV - minV) / 3.; - int odx = blockIdx.x * blockDim.x / NPACK + threadIdx.x; for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n ; idx += gridDim.x * blockDim.x) { const float delta = (in[idx] - minV); tmp_in[threadIdx.x] = 0; - tmp_in[threadIdx.x] += (delta >= 1 * s); - tmp_in[threadIdx.x] += (delta >= 2 * s); - tmp_in[threadIdx.x] += (delta >= 3 * s); + tmp_in[threadIdx.x] += (delta > 1 * s); + tmp_in[threadIdx.x] += (delta > 2 * s); + tmp_in[threadIdx.x] += (delta > 3 * s); __syncthreads(); // can be improved by distributing work on more threads in tree @@ -36,17 +35,39 @@ void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict { for (size_t i = 1; i < NPACK; i++) { - tmp_in[threadIdx.x] |= tmp_in[threadIdx.x * NPACK + i] << i*2; + tmp_in[threadIdx.x * NPACK] += (tmp_in[threadIdx.x * NPACK + i] << (i*2)); } - //out[odx] = tmp_out; - out[odx] = tmp_in[threadIdx.x]; + out[(idx - threadIdx.x) / NPACK + threadIdx.x] = tmp_in[threadIdx.x *NPACK]; } - odx += gridDim.x * blockDim.x / NPACK; __syncthreads(); } } + +//__global__ void pack_edd_float32_to_2bit(const float* __restrict__ input, uint32_t* __restrict__ output, size_t inputSize, float minV, float maxV) +//{ +// float l = (maxV - minV) / 3; +// for (size_t i = blockIdx.x * blockDim.x + 16 * threadIdx.x; (i < inputSize); +// i += blockDim.x * gridDim.x * 16) +// { +// uint32_t out = 0; +// for (size_t j =0; j < 16; j++) +// { +// //out = out << 2; +// +// const float inp = input[i + j]; +// const uint32_t tmp = (inp > minV + l) + (inp > minV + 2 * l) + (inp > minV + 3 * l); +// out += (tmp << (2 * j)); +// //input[i + j] = i + j; +// } +// +// output[i / 16] = out; +// } +//} + + + } //namespace kernels diff --git a/psrdada_cpp/effelsberg/edd/test/src/VLBITest.cu b/psrdada_cpp/effelsberg/edd/test/src/VLBITest.cu index 54dc7e3244aeee50a33535a89cf9c97042d31f17..815c76a721debc855b0a7a6ea111d13046569146 100644 --- a/psrdada_cpp/effelsberg/edd/test/src/VLBITest.cu +++ b/psrdada_cpp/effelsberg/edd/test/src/VLBITest.cu @@ -1,76 +1,55 @@ #include "gtest/gtest.h" -#include "psrdada_cpp/cuda_utils.hpp" -#include <random> +#include <time.h> +#include <stdlib.h> + #include "psrdada_cpp/effelsberg/edd/VLBI.cuh" + +#include "psrdada_cpp/cuda_utils.hpp" #include "thrust/extrema.h" TEST(VLBITest, 2_bit_pack_test) { std::size_t n = 1024; thrust::device_vector<float> input(n); - thrust::device_vector<uint32_t> output(n); - { - thrust::fill(input.begin(), input.end(), 0); - thrust::fill(output.begin(), output.end(), 5); - - psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3); - - EXPECT_EQ(output.size(), input.size() / 16); - thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax; - minmax = thrust::minmax_element(output.begin(), output.end()); - EXPECT_EQ(0, *minmax.first); - EXPECT_EQ(0, *minmax.second); - } - - { - thrust::fill(input.begin(), input.end(), 1); - thrust::fill(output.begin(), output.end(), 5); - - psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3); - thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax; - minmax = thrust::minmax_element(output.begin(), output.end()); - - EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.first); - EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.second); - } - - { - thrust::fill(input.begin(), input.end(), 2); - thrust::fill(output.begin(), output.end(), 5); - - psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3); - thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax; - minmax = thrust::minmax_element(output.begin(), output.end()); - - EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.first); - EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.second); - } + thrust::device_vector<uint32_t> output(n / 16); { - thrust::fill(input.begin(), input.end(), 3); - thrust::fill(output.begin(), output.end(), 5); - - psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3); - thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax; - minmax = thrust::minmax_element(output.begin(), output.end()); + float minV = -2; + float maxV = 2; - EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first); - EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second); - } + srand (time(NULL)); + for (int i =0; i < input.size(); i++) + { + input[i] = ((float(rand()) / RAND_MAX) - 0.5) * 2.5 * (maxV-minV) + maxV + minV; + } - { - thrust::fill(input.begin(), input.end(), 4); thrust::fill(output.begin(), output.end(), 5); - - psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3); - thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax; - minmax = thrust::minmax_element(output.begin(), output.end()); - - EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first); - EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second); + psrdada_cpp::effelsberg::edd::pack_2bit(input, output, minV, maxV); + + float step = (maxV - minV) / 3; + float L2 = minV + step; + float L3 = minV + 2 * step; + float L4 = minV + 3 * step; + + for(int i = 0; i < input.size() / 16; i++) + { + uint64_t of = output[i]; + for (size_t j =0; j< 16; j++) + { + int a = ((of >> (j *2)) & 3); + int k = i * 16 + j; + if (input[k] >= L4) + EXPECT_EQ(a, 3); + else if (input[k] >= L3) + EXPECT_EQ(a, 2); + else if (input[k] >= L2) + EXPECT_EQ(a, 1); + else + EXPECT_EQ(a, 0); + } + } } - } //int main(int argc, char **argv) {