Commit ca7b19a0 authored by Tobias Winchen's avatar Tobias Winchen
Browse files

Fixed pack output index

parent a5fe3846
......@@ -12,7 +12,7 @@ namespace kernels {
__global__
void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict__ out, size_t n, float minV, float maxV)
void pack_edd_float32_to_2bit(const float * __restrict__ in, uint32_t* __restrict__ out, size_t n, float minV, float maxV)
{
__shared__ uint32_t tmp_in[EDD_NTHREADS_PACK];
......@@ -20,14 +20,13 @@ void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict
//__shared__ volatile uint8_t tmp_out[EDD_NTHREADS_PACK / 4];
const float s = (maxV - minV) / 3.;
int odx = blockIdx.x * blockDim.x / NPACK + threadIdx.x;
for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n ; idx += gridDim.x * blockDim.x)
{
const float delta = (in[idx] - minV);
tmp_in[threadIdx.x] = 0;
tmp_in[threadIdx.x] += (delta >= 1 * s);
tmp_in[threadIdx.x] += (delta >= 2 * s);
tmp_in[threadIdx.x] += (delta >= 3 * s);
tmp_in[threadIdx.x] += (delta > 1 * s);
tmp_in[threadIdx.x] += (delta > 2 * s);
tmp_in[threadIdx.x] += (delta > 3 * s);
__syncthreads();
// can be improved by distributing work on more threads in tree
......@@ -36,17 +35,39 @@ void pack_edd_float32_to_2bit(float const* __restrict__ in, uint32_t* __restrict
{
for (size_t i = 1; i < NPACK; i++)
{
tmp_in[threadIdx.x] |= tmp_in[threadIdx.x * NPACK + i] << i*2;
tmp_in[threadIdx.x * NPACK] += (tmp_in[threadIdx.x * NPACK + i] << (i*2));
}
//out[odx] = tmp_out;
out[odx] = tmp_in[threadIdx.x];
out[(idx - threadIdx.x) / NPACK + threadIdx.x] = tmp_in[threadIdx.x *NPACK];
}
odx += gridDim.x * blockDim.x / NPACK;
__syncthreads();
}
}
//__global__ void pack_edd_float32_to_2bit(const float* __restrict__ input, uint32_t* __restrict__ output, size_t inputSize, float minV, float maxV)
//{
// float l = (maxV - minV) / 3;
// for (size_t i = blockIdx.x * blockDim.x + 16 * threadIdx.x; (i < inputSize);
// i += blockDim.x * gridDim.x * 16)
// {
// uint32_t out = 0;
// for (size_t j =0; j < 16; j++)
// {
// //out = out << 2;
//
// const float inp = input[i + j];
// const uint32_t tmp = (inp > minV + l) + (inp > minV + 2 * l) + (inp > minV + 3 * l);
// out += (tmp << (2 * j));
// //input[i + j] = i + j;
// }
//
// output[i / 16] = out;
// }
//}
} //namespace kernels
......
#include "gtest/gtest.h"
#include "psrdada_cpp/cuda_utils.hpp"
#include <random>
#include <time.h>
#include <stdlib.h>
#include "psrdada_cpp/effelsberg/edd/VLBI.cuh"
#include "psrdada_cpp/cuda_utils.hpp"
#include "thrust/extrema.h"
TEST(VLBITest, 2_bit_pack_test)
{
std::size_t n = 1024;
thrust::device_vector<float> input(n);
thrust::device_vector<uint32_t> output(n);
{
thrust::fill(input.begin(), input.end(), 0);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
EXPECT_EQ(output.size(), input.size() / 16);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ(0, *minmax.first);
EXPECT_EQ(0, *minmax.second);
}
{
thrust::fill(input.begin(), input.end(), 1);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.first);
EXPECT_EQ((uint32_t)0b0101010101010101010101010101010101010101, *minmax.second);
}
{
thrust::fill(input.begin(), input.end(), 2);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.first);
EXPECT_EQ((uint32_t)0b1010101010101010101010101010101010101010, *minmax.second);
}
thrust::device_vector<uint32_t> output(n / 16);
{
thrust::fill(input.begin(), input.end(), 3);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
float minV = -2;
float maxV = 2;
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first);
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second);
}
srand (time(NULL));
for (int i =0; i < input.size(); i++)
{
input[i] = ((float(rand()) / RAND_MAX) - 0.5) * 2.5 * (maxV-minV) + maxV + minV;
}
{
thrust::fill(input.begin(), input.end(), 4);
thrust::fill(output.begin(), output.end(), 5);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, 0, 3);
thrust::pair<thrust::device_vector<uint32_t>::iterator, thrust::device_vector<uint32_t>::iterator> minmax;
minmax = thrust::minmax_element(output.begin(), output.end());
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.first);
EXPECT_EQ((uint32_t)0b1111111111111111111111111111111111111111, *minmax.second);
psrdada_cpp::effelsberg::edd::pack_2bit(input, output, minV, maxV);
float step = (maxV - minV) / 3;
float L2 = minV + step;
float L3 = minV + 2 * step;
float L4 = minV + 3 * step;
for(int i = 0; i < input.size() / 16; i++)
{
uint64_t of = output[i];
for (size_t j =0; j< 16; j++)
{
int a = ((of >> (j *2)) & 3);
int k = i * 16 + j;
if (input[k] >= L4)
EXPECT_EQ(a, 3);
else if (input[k] >= L3)
EXPECT_EQ(a, 2);
else if (input[k] >= L2)
EXPECT_EQ(a, 1);
else
EXPECT_EQ(a, 0);
}
}
}
}
//int main(int argc, char **argv) {
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment