Skip to content
Snippets Groups Projects
Commit ca7b19a0 authored by Tobias Winchen's avatar Tobias Winchen
Browse files

Fixed pack output index

parent a5fe3846
No related branches found
No related tags found
No related merge requests found
......@@ -12,7 +12,7 @@ namespace kernels {
__global__
void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict__ out, size_t n, float minV, float maxV)
void pack_edd_float32_to_2bit(const float * __restrict__ in, uint32_t* __restrict__ out, size_t n, float minV, float maxV)
{
__shared__ uint32_t tmp_in[EDD_NTHREADS_PACK];
......@@ -20,14 +20,13 @@ void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict
//__shared__ volatile uint8_t tmp_out[EDD_NTHREADS_PACK / 4];
const float s = (maxV - minV) / 3.;
int odx = blockIdx.x * blockDim.x / NPACK + threadIdx.x;
for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n ; idx += gridDim.x * blockDim.x)
{
const float delta = (in[idx] - minV);
tmp_in[threadIdx.x] = 0;
tmp_in[threadIdx.x] += (delta >= 1 * s);
tmp_in[threadIdx.x] += (delta >= 2 * s);
tmp_in[threadIdx.x] += (delta >= 3 * s);
tmp_in[threadIdx.x] += (delta > 1 * s);
tmp_in[threadIdx.x] += (delta > 2 * s);
tmp_in[threadIdx.x] += (delta > 3 * s);
__syncthreads();
// can be improved by distributing work on more threads in tree
......@@ -36,17 +35,39 @@ void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict
{
for (size_t i = 1; i < NPACK; i++)
{
tmp_in[threadIdx.x] |= tmp_in[threadIdx.x * NPACK + i] << i*2;
tmp_in[threadIdx.x * NPACK] += (tmp_in[threadIdx.x * NPACK + i] << (i*2));
}
//out[odx] = tmp_out;
out[odx] = tmp_in[threadIdx.x];
out[(idx - threadIdx.x) / NPACK + threadIdx.x] = tmp_in[threadIdx.x *NPACK];
}
odx += gridDim.x * blockDim.x / NPACK;
__syncthreads();
}
}
//__global__ void pack_edd_float32_to_2bit(const float* __restrict__ input, uint32_t* __restrict__ output, size_t inputSize, float minV, float maxV)
//{
// float l = (maxV - minV) / 3;
// for (size_t i = blockIdx.x * blockDim.x + 16 * threadIdx.x; (i < inputSize);
// i += blockDim.x * gridDim.x * 16)
// {
// uint32_t out = 0;
// for (size_t j =0; j < 16; j++)
// {
// //out = out << 2;
//
// const float inp = input[i + j];
// const uint32_t tmp = (inp > minV + l) + (inp > minV + 2 * l) + (inp > minV + 3 * l);
// out += (tmp << (2 * j));
// //input[i + j] = i + j;
// }
//
// output[i / 16] = out;
// }
//}
} //namespace kernels
......
#include "gtest/gtest.h"
#include "psrdada_cpp/cuda_utils.hpp"
#include <random>
#include <time.h>
#include <stdlib.h>
#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
#include "psrdada_cpp/cuda_utils.hpp"
#include "thrust/extrema.h"
TEST(VLBITest, 2_bit_pack_test)
{
std::size_t n = 1024;
thrust::device_vector<float> input(n);
thrust::device_vector<uint32_t> output(n);
{
thrust::fill(input.begin(), input.end(), 0);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
EXPECT_EQ(output.size(), input.size() / 16);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ(0, *minmax.first);
EXPECT_EQ(0, *minmax.second);
}
{
thrust::fill(input.begin(), input.end(), 1);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.first);
EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.second);
}
{
thrust::fill(input.begin(), input.end(), 2);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.first);
EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.second);
}
thrust::device_vector<uint32_t> output(n / 16);
{
thrust::fill(input.begin(), input.end(), 3);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
float minV = -2;
float maxV = 2;
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first);
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second);
}
srand (time(NULL));
for (int i =0; i < input.size(); i++)
{
input[i] = ((float(rand()) / RAND_MAX) - 0.5) * 2.5 * (maxV-minV) + maxV + minV;
}
{
thrust::fill(input.begin(), input.end(), 4);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first);
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, minV, maxV);
float step = (maxV - minV) / 3;
float L2 = minV + step;
float L3 = minV + 2 * step;
float L4 = minV + 3 * step;
for(int i = 0; i < input.size() / 16; i++)
{
uint64_t of = output[i];
for (size_t j =0; j< 16; j++)
{
int a = ((of >> (j *2)) & 3);
int k = i * 16 + j;
if (input[k] >= L4)
EXPECT_EQ(a, 3);
else if (input[k] >= L3)
EXPECT_EQ(a, 2);
else if (input[k] >= L2)
EXPECT_EQ(a, 1);
else
EXPECT_EQ(a, 0);
}
}
}
}
//int main(int argc, char **argv) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment