Unpacker.cu 5.65 KB
Newer Older
Ewan Barr's avatar
Ewan Barr committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#include "psrdada_cpp/effelsberg/edd/Unpacker.cuh"
#include "psrdada_cpp/cuda_utils.hpp"

#define EDD_NTHREADS_UNPACK 512

namespace psrdada_cpp {
namespace effelsberg {
namespace edd {
namespace kernels {

__device__ __forceinline__ uint64_t swap64(uint64_t x)
{
    uint64_t result;
    uint2 t;
    asm("mov.b64 {%0,%1},%2; \n\t"
        : "=r"(t.x), "=r"(t.y) : "l"(x));
    t.x = __byte_perm(t.x, 0, 0x0123);
    t.y = __byte_perm(t.y, 0, 0x0123);
    asm("mov.b64 %0,{%1,%2}; \n\t"
        : "=l"(result) : "r"(t.y), "r"(t.x));
    return result;
}

__global__
Ewan Barr's avatar
Ewan Barr committed
25
void unpack_edd_12bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n)
Ewan Barr's avatar
Ewan Barr committed
26
27
28
29
{
    /**
     * Note: This kernels will not work with more than 512 threads.
     */
Ewan Barr's avatar
Ewan Barr committed
30
31
    __shared__ volatile float tmp_out[EDD_NTHREADS_UNPACK * 16];
    __shared__ volatile uint64_t tmp_in[EDD_NTHREADS_UNPACK * 3];
Ewan Barr's avatar
Ewan Barr committed
32
33
34
35
36
37
38
39
40
    int block_idx = blockIdx.x;
    uint64_t val;
    uint64_t rest;
    volatile float* sout = tmp_out + (16 * threadIdx.x);
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
        (3 * idx + 2) < n;
        idx+=gridDim.x*blockDim.x)
    {
        //Read to shared memeory
Ewan Barr's avatar
Ewan Barr committed
41
        int block_read_start = block_idx * EDD_NTHREADS_UNPACK * 3;
Ewan Barr's avatar
Ewan Barr committed
42
        tmp_in[threadIdx.x]                = in[block_read_start + threadIdx.x];
Ewan Barr's avatar
Ewan Barr committed
43
44
        tmp_in[EDD_NTHREADS_UNPACK + threadIdx.x]     = in[block_read_start + EDD_NTHREADS_UNPACK + threadIdx.x];
        tmp_in[EDD_NTHREADS_UNPACK * 2 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK * 2 + threadIdx.x];
Ewan Barr's avatar
Ewan Barr committed
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
        __syncthreads();
        val  = swap64(tmp_in[3*threadIdx.x]);
        sout[0] = (float)((int64_t)(( 0xFFF0000000000000 & val) <<  0) >> 52);
        sout[1] = (float)((int64_t)(( 0x000FFF0000000000 & val) << 12) >> 52);
        sout[2] = (float)((int64_t)(( 0x000000FFF0000000 & val) << 24) >> 52);
        sout[3] = (float)((int64_t)(( 0x000000000FFF0000 & val) << 36) >> 52);
        sout[4] = (float)((int64_t)(( 0x000000000000FFF0 & val) << 48) >> 52);
        rest    =                   ( 0x000000000000000F & val) << 60;
        val  = swap64(tmp_in[3*threadIdx.x + 1]);
        sout[5] = (float)((int64_t)((( 0xFF00000000000000 & val) >> 4) | rest) >> 52);
        sout[6] = (float)((int64_t)((  0x00FFF00000000000 & val) << 8)  >> 52);
        sout[7] = (float)((int64_t)((  0x00000FFF00000000 & val) << 20) >> 52);
        sout[8] = (float)((int64_t)((  0x00000000FFF00000 & val) << 32) >> 52);
        sout[9] = (float)((int64_t)((  0x00000000000FFF00 & val) << 44) >> 52);
        rest    =                   (  0x00000000000000FF & val) << 56;
        val  = swap64(tmp_in[3*threadIdx.x + 2]);
        sout[10] = (float)((int64_t)((( 0xF000000000000000 & val) >>  8) | rest) >> 52);
        sout[11] = (float)((int64_t)((  0x0FFF000000000000 & val) <<  4) >> 52);
        sout[12] = (float)((int64_t)((  0x0000FFF000000000 & val) << 16) >> 52);
        sout[13] = (float)((int64_t)((  0x0000000FFF000000 & val) << 28) >> 52);
        sout[14] = (float)((int64_t)((  0x0000000000FFF000 & val) << 40) >> 52);
Tobias Winchen's avatar
Tobias Winchen committed
66
        sout[15] = (float)((int64_t)((  0x0000000000000FFF & val) << 52) >> 52);
Ewan Barr's avatar
Ewan Barr committed
67
        __syncthreads();
Ewan Barr's avatar
Ewan Barr committed
68
69
        int block_write_start = block_idx * EDD_NTHREADS_UNPACK * 16;
        for (int ii = threadIdx.x; ii < 16 * EDD_NTHREADS_UNPACK; ii += blockDim.x)
Ewan Barr's avatar
Ewan Barr committed
70
71
72
73
74
75
76
77
        {
            out[block_write_start + ii] = tmp_out[ii];
        }
        block_idx += gridDim.x;
    }
}

__global__
Ewan Barr's avatar
Ewan Barr committed
78
void unpack_edd_8bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n)
Ewan Barr's avatar
Ewan Barr committed
79
80
81
82
{
    /**
     * Note: This kernels will not work with more than 512 threads.
     */
Ewan Barr's avatar
Ewan Barr committed
83
    __shared__ volatile float tmp_out[EDD_NTHREADS_UNPACK * 8];
Ewan Barr's avatar
Ewan Barr committed
84
85
86
87
88
89
    int block_idx = blockIdx.x;
    uint64_t val;
    volatile float* sout = tmp_out + (8 * threadIdx.x);

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x ; idx < n ; idx+=gridDim.x*blockDim.x)
    {
Ewan Barr's avatar
Ewan Barr committed
90
        int block_read_start = block_idx * EDD_NTHREADS_UNPACK;
Ewan Barr's avatar
Ewan Barr committed
91
92
93
94
95
96
97
98
99
100
        val = swap64(in[block_read_start + threadIdx.x]);
        sout[0] = (float)((int64_t)(( 0xFF00000000000000 & val) <<  0) >> 56);
        sout[1] = (float)((int64_t)(( 0x00FF000000000000 & val) <<  8) >> 56);
        sout[2] = (float)((int64_t)(( 0x0000FF0000000000 & val) << 16) >> 56);
        sout[3] = (float)((int64_t)(( 0x000000FF00000000 & val) << 24) >> 56);
        sout[4] = (float)((int64_t)(( 0x00000000FF000000 & val) << 32) >> 56);
        sout[5] = (float)((int64_t)(( 0x0000000000FF0000 & val) << 40) >> 56);
        sout[6] = (float)((int64_t)(( 0x000000000000FF00 & val) << 48) >> 56);
        sout[7] = (float)((int64_t)(( 0x00000000000000FF & val) << 56) >> 56);
        __syncthreads();
Ewan Barr's avatar
Ewan Barr committed
101
102
        int block_write_start = block_idx * EDD_NTHREADS_UNPACK * 8;
        for (int ii = threadIdx.x; ii < 8 * EDD_NTHREADS_UNPACK; ii+=blockDim.x)
Ewan Barr's avatar
Ewan Barr committed
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
        {
            out[block_write_start+ii] = tmp_out[ii];
        }
        block_idx += gridDim.x;
    }
}

} //namespace kernels


Unpacker::Unpacker(cudaStream_t stream)
    : _stream(stream)
{

}

Unpacker::~Unpacker()
{

}

template <>
125
void Unpacker::unpack<12>(const uint64_t* input, float* output, size_t size)
Ewan Barr's avatar
Ewan Barr committed
126
127
{
    BOOST_LOG_TRIVIAL(debug) << "Unpacking 12-bit data";
128
    int nblocks = size / EDD_NTHREADS_UNPACK;
Ewan Barr's avatar
Ewan Barr committed
129
    kernels::unpack_edd_12bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK, 0, _stream>>>(
130
            input, output, size);
Ewan Barr's avatar
Ewan Barr committed
131
132
133
}

template <>
134
void Unpacker::unpack<8>(const uint64_t* input, float* output, size_t size)
Ewan Barr's avatar
Ewan Barr committed
135
{
root's avatar
root committed
136
    BOOST_LOG_TRIVIAL(debug) << "Unpacking 8-bit data";
137
    int nblocks = size / EDD_NTHREADS_UNPACK;
Ewan Barr's avatar
Ewan Barr committed
138
    kernels::unpack_edd_8bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK, 0, _stream>>>(
139
            input, output, size);
Ewan Barr's avatar
Ewan Barr committed
140
141
142
143
}

} //namespace edd
} //namespace effelsberg
root's avatar
root committed
144
} //namespace psrdada_cpp