Unpacker.cu 10.7 KB
Newer Older
Ewan Barr's avatar
Ewan Barr committed
1
2
3
4
#include "psrdada_cpp/effelsberg/edd/Unpacker.cuh"
#include "psrdada_cpp/cuda_utils.hpp"

#define EDD_NTHREADS_UNPACK 512
Tobias Winchen's avatar
Tobias Winchen committed
5
#define EDD_NTHREADS_UNPACK10 128   // More than 128 threads are silently not launched??
Ewan Barr's avatar
Ewan Barr committed
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24

namespace psrdada_cpp {
namespace effelsberg {
namespace edd {
namespace kernels {

__device__ __forceinline__ uint64_t swap64(uint64_t x)
{
    uint64_t result;
    uint2 t;
    asm("mov.b64 {%0,%1},%2; \n\t"
        : "=r"(t.x), "=r"(t.y) : "l"(x));
    t.x = __byte_perm(t.x, 0, 0x0123);
    t.y = __byte_perm(t.y, 0, 0x0123);
    asm("mov.b64 %0,{%1,%2}; \n\t"
        : "=l"(result) : "r"(t.y), "r"(t.x));
    return result;
}

Tobias Winchen's avatar
Tobias Winchen committed
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
__global__
void unpack_edd_10bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n)
{
    /**
     * Note: This kernels will not work with more than 256 threads.
     */
    __shared__ volatile float tmp_out[EDD_NTHREADS_UNPACK10 * 32];
    __shared__ volatile uint64_t tmp_in[EDD_NTHREADS_UNPACK10 * 5];
    int block_idx = blockIdx.x;
    uint64_t val;
    uint64_t rest;
    volatile float* sout = tmp_out + (32 * threadIdx.x);
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
        (5 * idx + 4) < n;
        idx+=gridDim.x*blockDim.x)
    {
        //Read to shared memeory
        int block_read_start = block_idx * EDD_NTHREADS_UNPACK10 * 5;
        tmp_in[threadIdx.x]                             = in[block_read_start + threadIdx.x];
        tmp_in[EDD_NTHREADS_UNPACK10 + threadIdx.x]     = in[block_read_start + EDD_NTHREADS_UNPACK10 + threadIdx.x];
        tmp_in[EDD_NTHREADS_UNPACK10 * 2 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 * 2 + threadIdx.x];
        tmp_in[EDD_NTHREADS_UNPACK10 * 3 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 * 3 + threadIdx.x];
        tmp_in[EDD_NTHREADS_UNPACK10 * 4 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK10 * 4 + threadIdx.x];

        __syncthreads();
        val  = swap64(tmp_in[5*threadIdx.x]);
        sout[0] = (float)((int64_t)(( 0xFFC0000000000000 & val) <<  0) >> 54);
        sout[1] = (float)((int64_t)(( 0x003FF00000000000 & val) << 10) >> 54);
        sout[2] = (float)((int64_t)(( 0x00000FFC00000000 & val) << 20) >> 54);
        sout[3] = (float)((int64_t)(( 0x00000003FF000000 & val) << 30) >> 54);
        sout[4] = (float)((int64_t)(( 0x0000000000FFC000 & val) << 40) >> 54);
        sout[5] = (float)((int64_t)(( 0x0000000000003FF0 & val) << 50) >> 54);
        rest    =                   ( 0x000000000000000F & val) << 60;

        val  = swap64(tmp_in[5*threadIdx.x + 1]);
        sout[6] = (float)((int64_t)((( 0xFC00000000000000 & val) >> 4) | rest) >> 54);
        sout[7] = (float)((int64_t)((  0x03FF000000000000 & val) <<  6) >> 54);
        sout[8] = (float)((int64_t)((  0x0000FFC000000000 & val) << 16) >> 54);
        sout[9] = (float)((int64_t)((  0x0000003FF0000000 & val) << 26) >> 54);
        sout[10] = (float)((int64_t)(( 0x000000000FFC0000 & val) << 36) >> 54);
        sout[11] = (float)((int64_t)(( 0x000000000003FF00 & val) << 46) >> 54);
        rest    =                    ( 0x00000000000000FF & val) << 56;

        val  = swap64(tmp_in[5*threadIdx.x + 2]);
        sout[12] = (float)((int64_t)(((0xC000000000000000 & val) >> 8) | rest) >> 54);
        sout[13] = (float)((int64_t)(( 0x3FF0000000000000 & val) <<  2) >> 54);
        sout[14] = (float)((int64_t)(( 0x000FFC0000000000 & val) << 12) >> 54);
        sout[15] = (float)((int64_t)(( 0x000003FF00000000 & val) << 22) >> 54);
        sout[16] = (float)((int64_t)(( 0x00000000FFC00000 & val) << 32) >> 54);
        sout[17] = (float)((int64_t)(( 0x00000000003FF000 & val) << 42) >> 54);
        sout[18] = (float)((int64_t)(( 0x0000000000000FFC & val) << 52) >> 54);
        rest    =                    ( 0x0000000000000003 & val) << 62;

        val  = swap64(tmp_in[5*threadIdx.x + 3]);
        sout[19] = (float)((int64_t)(((0xFF00000000000000 & val) >> 2) | rest) >> 54);
        sout[20] = (float)((int64_t)(( 0x00FFC00000000000 & val) <<  8) >> 54);
        sout[21] = (float)((int64_t)(( 0x00003FF000000000 & val) << 18) >> 54);
        sout[22] = (float)((int64_t)(( 0x0000000FFC000000 & val) << 28) >> 54);
        sout[23] = (float)((int64_t)(( 0x0000000003FF0000 & val) << 38) >> 54);
        sout[24] = (float)((int64_t)(( 0x000000000000FFC0 & val) << 48) >> 54);
        rest    =                    ( 0x000000000000003F & val) << 58;

        val  = swap64(tmp_in[5*threadIdx.x + 4]);
        sout[25] = (float)((int64_t)(((0xF000000000000000 & val) >> 6) | rest) >> 54);
        sout[26] = (float)((int64_t)(( 0x0FFC000000000000 & val) <<  4) >> 54);
        sout[27] = (float)((int64_t)(( 0x0003FF0000000000 & val) << 14) >> 54);
        sout[28] = (float)((int64_t)(( 0x000000FFC0000000 & val) << 24) >> 54);
        sout[29] = (float)((int64_t)(( 0x000000003FF00000 & val) << 34) >> 54);
        sout[30] = (float)((int64_t)(( 0x00000000000FFC00 & val) << 44) >> 54);
        sout[31] = (float)((int64_t)(( 0x00000000000003FF & val) << 54) >> 54);
        rest = 0;

        __syncthreads();
        size_t block_write_start = block_idx * EDD_NTHREADS_UNPACK10 * 32;
        for (size_t ii = threadIdx.x; ii < 32 * EDD_NTHREADS_UNPACK10; ii += blockDim.x)
        {
            out[block_write_start + ii] = tmp_out[ii];
        }
        block_idx += gridDim.x;
        __syncthreads();
    }
}





Ewan Barr's avatar
Ewan Barr committed
112
__global__
Ewan Barr's avatar
Ewan Barr committed
113
void unpack_edd_12bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n)
Ewan Barr's avatar
Ewan Barr committed
114
115
116
117
{
    /**
     * Note: This kernels will not work with more than 512 threads.
     */
Ewan Barr's avatar
Ewan Barr committed
118
119
    __shared__ volatile float tmp_out[EDD_NTHREADS_UNPACK * 16];
    __shared__ volatile uint64_t tmp_in[EDD_NTHREADS_UNPACK * 3];
Ewan Barr's avatar
Ewan Barr committed
120
121
122
123
124
125
126
127
128
    int block_idx = blockIdx.x;
    uint64_t val;
    uint64_t rest;
    volatile float* sout = tmp_out + (16 * threadIdx.x);
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
        (3 * idx + 2) < n;
        idx+=gridDim.x*blockDim.x)
    {
        //Read to shared memeory
Ewan Barr's avatar
Ewan Barr committed
129
        int block_read_start = block_idx * EDD_NTHREADS_UNPACK * 3;
Ewan Barr's avatar
Ewan Barr committed
130
        tmp_in[threadIdx.x]                = in[block_read_start + threadIdx.x];
Ewan Barr's avatar
Ewan Barr committed
131
132
        tmp_in[EDD_NTHREADS_UNPACK + threadIdx.x]     = in[block_read_start + EDD_NTHREADS_UNPACK + threadIdx.x];
        tmp_in[EDD_NTHREADS_UNPACK * 2 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK * 2 + threadIdx.x];
Ewan Barr's avatar
Ewan Barr committed
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
        __syncthreads();
        val  = swap64(tmp_in[3*threadIdx.x]);
        sout[0] = (float)((int64_t)(( 0xFFF0000000000000 & val) <<  0) >> 52);
        sout[1] = (float)((int64_t)(( 0x000FFF0000000000 & val) << 12) >> 52);
        sout[2] = (float)((int64_t)(( 0x000000FFF0000000 & val) << 24) >> 52);
        sout[3] = (float)((int64_t)(( 0x000000000FFF0000 & val) << 36) >> 52);
        sout[4] = (float)((int64_t)(( 0x000000000000FFF0 & val) << 48) >> 52);
        rest    =                   ( 0x000000000000000F & val) << 60;
        val  = swap64(tmp_in[3*threadIdx.x + 1]);
        sout[5] = (float)((int64_t)((( 0xFF00000000000000 & val) >> 4) | rest) >> 52);
        sout[6] = (float)((int64_t)((  0x00FFF00000000000 & val) << 8)  >> 52);
        sout[7] = (float)((int64_t)((  0x00000FFF00000000 & val) << 20) >> 52);
        sout[8] = (float)((int64_t)((  0x00000000FFF00000 & val) << 32) >> 52);
        sout[9] = (float)((int64_t)((  0x00000000000FFF00 & val) << 44) >> 52);
        rest    =                   (  0x00000000000000FF & val) << 56;
        val  = swap64(tmp_in[3*threadIdx.x + 2]);
        sout[10] = (float)((int64_t)((( 0xF000000000000000 & val) >>  8) | rest) >> 52);
        sout[11] = (float)((int64_t)((  0x0FFF000000000000 & val) <<  4) >> 52);
        sout[12] = (float)((int64_t)((  0x0000FFF000000000 & val) << 16) >> 52);
        sout[13] = (float)((int64_t)((  0x0000000FFF000000 & val) << 28) >> 52);
        sout[14] = (float)((int64_t)((  0x0000000000FFF000 & val) << 40) >> 52);
Tobias Winchen's avatar
Tobias Winchen committed
154
        sout[15] = (float)((int64_t)((  0x0000000000000FFF & val) << 52) >> 52);
Ewan Barr's avatar
Ewan Barr committed
155
        __syncthreads();
Ewan Barr's avatar
Ewan Barr committed
156
157
        int block_write_start = block_idx * EDD_NTHREADS_UNPACK * 16;
        for (int ii = threadIdx.x; ii < 16 * EDD_NTHREADS_UNPACK; ii += blockDim.x)
Ewan Barr's avatar
Ewan Barr committed
158
159
160
161
162
163
164
165
        {
            out[block_write_start + ii] = tmp_out[ii];
        }
        block_idx += gridDim.x;
    }
}

__global__
Ewan Barr's avatar
Ewan Barr committed
166
void unpack_edd_8bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n)
Ewan Barr's avatar
Ewan Barr committed
167
168
169
170
{
    /**
     * Note: This kernels will not work with more than 512 threads.
     */
Ewan Barr's avatar
Ewan Barr committed
171
    __shared__ volatile float tmp_out[EDD_NTHREADS_UNPACK * 8];
Ewan Barr's avatar
Ewan Barr committed
172
173
174
175
176
177
    int block_idx = blockIdx.x;
    uint64_t val;
    volatile float* sout = tmp_out + (8 * threadIdx.x);

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x ; idx < n ; idx+=gridDim.x*blockDim.x)
    {
Ewan Barr's avatar
Ewan Barr committed
178
        int block_read_start = block_idx * EDD_NTHREADS_UNPACK;
Ewan Barr's avatar
Ewan Barr committed
179
180
181
182
183
184
185
186
187
188
        val = swap64(in[block_read_start + threadIdx.x]);
        sout[0] = (float)((int64_t)(( 0xFF00000000000000 & val) <<  0) >> 56);
        sout[1] = (float)((int64_t)(( 0x00FF000000000000 & val) <<  8) >> 56);
        sout[2] = (float)((int64_t)(( 0x0000FF0000000000 & val) << 16) >> 56);
        sout[3] = (float)((int64_t)(( 0x000000FF00000000 & val) << 24) >> 56);
        sout[4] = (float)((int64_t)(( 0x00000000FF000000 & val) << 32) >> 56);
        sout[5] = (float)((int64_t)(( 0x0000000000FF0000 & val) << 40) >> 56);
        sout[6] = (float)((int64_t)(( 0x000000000000FF00 & val) << 48) >> 56);
        sout[7] = (float)((int64_t)(( 0x00000000000000FF & val) << 56) >> 56);
        __syncthreads();
Ewan Barr's avatar
Ewan Barr committed
189
190
        int block_write_start = block_idx * EDD_NTHREADS_UNPACK * 8;
        for (int ii = threadIdx.x; ii < 8 * EDD_NTHREADS_UNPACK; ii+=blockDim.x)
Ewan Barr's avatar
Ewan Barr committed
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
        {
            out[block_write_start+ii] = tmp_out[ii];
        }
        block_idx += gridDim.x;
    }
}

} //namespace kernels


Unpacker::Unpacker(cudaStream_t stream)
    : _stream(stream)
{

}

Unpacker::~Unpacker()
{

}

template <>
213
void Unpacker::unpack<12>(const uint64_t* input, float* output, size_t size)
Ewan Barr's avatar
Ewan Barr committed
214
215
{
    BOOST_LOG_TRIVIAL(debug) << "Unpacking 12-bit data";
216
    int nblocks = size / EDD_NTHREADS_UNPACK;
Ewan Barr's avatar
Ewan Barr committed
217
    kernels::unpack_edd_12bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK, 0, _stream>>>(
218
            input, output, size);
Ewan Barr's avatar
Ewan Barr committed
219
220
221
}

template <>
222
void Unpacker::unpack<8>(const uint64_t* input, float* output, size_t size)
Ewan Barr's avatar
Ewan Barr committed
223
{
root's avatar
root committed
224
    BOOST_LOG_TRIVIAL(debug) << "Unpacking 8-bit data";
225
    int nblocks = size / EDD_NTHREADS_UNPACK;
Ewan Barr's avatar
Ewan Barr committed
226
    kernels::unpack_edd_8bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK, 0, _stream>>>(
227
            input, output, size);
Ewan Barr's avatar
Ewan Barr committed
228
229
}

Tobias Winchen's avatar
Tobias Winchen committed
230
231
232
233
234
235
236
237
238
239
240

template <>
void Unpacker::unpack<10>(const uint64_t* input, float* output, size_t size)
{
    BOOST_LOG_TRIVIAL(debug) << "Unpacking 10-bit data";
    int nblocks = size / EDD_NTHREADS_UNPACK10;
    kernels::unpack_edd_10bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK10, 0, _stream>>>(
            input, output, size);
}


Ewan Barr's avatar
Ewan Barr committed
241
242
} //namespace edd
} //namespace effelsberg
root's avatar
root committed
243
} //namespace psrdada_cpp