Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Tobias Winchen
psrdada_cpp
Commits
f9ab54d5
Commit
f9ab54d5
authored
Feb 05, 2021
by
Tobias Winchen
Browse files
Added 10 it unpacker on GPU
parent
b3619f8d
Pipeline
#92895
failed with stages
in 5 minutes and 48 seconds
Changes
6
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
psrdada_cpp/effelsberg/edd/Unpacker.cuh
View file @
f9ab54d5
...
...
@@ -12,6 +12,9 @@ namespace kernels {
__global__
void
unpack_edd_12bit_to_float32
(
uint64_t
const
*
__restrict__
in
,
float
*
__restrict__
out
,
int
n
);
__global__
void
unpack_edd_10bit_to_float32
(
uint64_t
const
*
__restrict__
in
,
float
*
__restrict__
out
,
int
n
);
__global__
void
unpack_edd_8bit_to_float32
(
uint64_t
const
*
__restrict__
in
,
float
*
__restrict__
out
,
int
n
);
...
...
@@ -33,7 +36,7 @@ public:
void
unpack
(
const
uint64_t
*
input
,
float
*
output
,
size_t
size
);
template
<
int
Nbits
>
void
unpack
(
InputType
const
&
input
,
OutputType
&
output
)
void
unpack
(
InputType
const
&
input
,
OutputType
&
output
)
{
InputType
::
value_type
const
*
input_ptr
=
thrust
::
raw_pointer_cast
(
input
.
data
());
OutputType
::
value_type
*
output_ptr
=
thrust
::
raw_pointer_cast
(
output
.
data
());
...
...
psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
View file @
f9ab54d5
...
...
@@ -75,7 +75,7 @@ GatedSpectrometer<HandlerType, InputType, OutputType>::GatedSpectrometer(
{
// Sanity checks
assert
(((
nbits
==
12
)
||
(
nbits
==
8
)));
assert
(((
nbits
==
12
)
||
(
nbits
==
8
)
||
(
nbits
==
10
)
));
assert
(
_naccumulate
>
0
);
// check for any device errors
...
...
psrdada_cpp/effelsberg/edd/detail/GatedStokesSpectrometer.cu
View file @
f9ab54d5
...
...
@@ -160,7 +160,7 @@ GatedStokesSpectrometer<HandlerType>::GatedStokesSpectrometer(
_call_count
(
0
),
_nsamps_per_heap
(
4096
),
_processing_efficiency
(
0.
){
// Sanity checks
assert
(((
_nbits
==
12
)
||
(
_nbits
==
8
)));
assert
(((
_nbits
==
12
)
||
(
_nbits
==
8
)
||
(
_nbits
==
10
)
));
assert
(
_naccumulate
>
0
);
// check for any device errors
...
...
psrdada_cpp/effelsberg/edd/src/GatedSpectrometer_cli.cu
View file @
f9ab54d5
...
...
@@ -198,7 +198,7 @@ int main(int argc, char **argv) {
desc
.
add_options
()(
"nbits,b"
,
po
::
value
<
unsigned
int
>
(
&
ip
.
nbits
)
->
required
(),
"The number of bits per sample in the "
"packetiser output (8
or
12)"
);
"packetiser output (8
, 10
12)"
);
desc
.
add_options
()(
"fft_length,n"
,
po
::
value
<
size_t
>
(
&
ip
.
fft_length
)
->
required
(),
"The length of the FFT to perform on the data"
);
desc
.
add_options
()(
"naccumulate,a"
,
...
...
psrdada_cpp/effelsberg/edd/src/Unpacker.cu
View file @
f9ab54d5
...
...
@@ -2,6 +2,7 @@
#include
"psrdada_cpp/cuda_utils.hpp"
#define EDD_NTHREADS_UNPACK 512
#define EDD_NTHREADS_UNPACK10 128 // More than 128 threads are silently not launched??
namespace
psrdada_cpp
{
namespace
effelsberg
{
...
...
@@ -21,6 +22,93 @@ __device__ __forceinline__ uint64_t swap64(uint64_t x)
return
result
;
}
__global__
void
unpack_edd_10bit_to_float32
(
uint64_t
const
*
__restrict__
in
,
float
*
__restrict__
out
,
int
n
)
{
/**
* Note: This kernels will not work with more than 256 threads.
*/
__shared__
volatile
float
tmp_out
[
EDD_NTHREADS_UNPACK10
*
32
];
__shared__
volatile
uint64_t
tmp_in
[
EDD_NTHREADS_UNPACK10
*
5
];
int
block_idx
=
blockIdx
.
x
;
uint64_t
val
;
uint64_t
rest
;
volatile
float
*
sout
=
tmp_out
+
(
32
*
threadIdx
.
x
);
for
(
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
(
5
*
idx
+
4
)
<
n
;
idx
+=
gridDim
.
x
*
blockDim
.
x
)
{
//Read to shared memeory
int
block_read_start
=
block_idx
*
EDD_NTHREADS_UNPACK10
*
5
;
tmp_in
[
threadIdx
.
x
]
=
in
[
block_read_start
+
threadIdx
.
x
];
tmp_in
[
EDD_NTHREADS_UNPACK10
+
threadIdx
.
x
]
=
in
[
block_read_start
+
EDD_NTHREADS_UNPACK10
+
threadIdx
.
x
];
tmp_in
[
EDD_NTHREADS_UNPACK10
*
2
+
threadIdx
.
x
]
=
in
[
block_read_start
+
EDD_NTHREADS_UNPACK10
*
2
+
threadIdx
.
x
];
tmp_in
[
EDD_NTHREADS_UNPACK10
*
3
+
threadIdx
.
x
]
=
in
[
block_read_start
+
EDD_NTHREADS_UNPACK10
*
3
+
threadIdx
.
x
];
tmp_in
[
EDD_NTHREADS_UNPACK10
*
4
+
threadIdx
.
x
]
=
in
[
block_read_start
+
EDD_NTHREADS_UNPACK10
*
4
+
threadIdx
.
x
];
__syncthreads
();
val
=
swap64
(
tmp_in
[
5
*
threadIdx
.
x
]);
sout
[
0
]
=
(
float
)((
int64_t
)((
0xFFC0000000000000
&
val
)
<<
0
)
>>
54
);
sout
[
1
]
=
(
float
)((
int64_t
)((
0x003FF00000000000
&
val
)
<<
10
)
>>
54
);
sout
[
2
]
=
(
float
)((
int64_t
)((
0x00000FFC00000000
&
val
)
<<
20
)
>>
54
);
sout
[
3
]
=
(
float
)((
int64_t
)((
0x00000003FF000000
&
val
)
<<
30
)
>>
54
);
sout
[
4
]
=
(
float
)((
int64_t
)((
0x0000000000FFC000
&
val
)
<<
40
)
>>
54
);
sout
[
5
]
=
(
float
)((
int64_t
)((
0x0000000000003FF0
&
val
)
<<
50
)
>>
54
);
rest
=
(
0x000000000000000F
&
val
)
<<
60
;
val
=
swap64
(
tmp_in
[
5
*
threadIdx
.
x
+
1
]);
sout
[
6
]
=
(
float
)((
int64_t
)(((
0xFC00000000000000
&
val
)
>>
4
)
|
rest
)
>>
54
);
sout
[
7
]
=
(
float
)((
int64_t
)((
0x03FF000000000000
&
val
)
<<
6
)
>>
54
);
sout
[
8
]
=
(
float
)((
int64_t
)((
0x0000FFC000000000
&
val
)
<<
16
)
>>
54
);
sout
[
9
]
=
(
float
)((
int64_t
)((
0x0000003FF0000000
&
val
)
<<
26
)
>>
54
);
sout
[
10
]
=
(
float
)((
int64_t
)((
0x000000000FFC0000
&
val
)
<<
36
)
>>
54
);
sout
[
11
]
=
(
float
)((
int64_t
)((
0x000000000003FF00
&
val
)
<<
46
)
>>
54
);
rest
=
(
0x00000000000000FF
&
val
)
<<
56
;
val
=
swap64
(
tmp_in
[
5
*
threadIdx
.
x
+
2
]);
sout
[
12
]
=
(
float
)((
int64_t
)(((
0xC000000000000000
&
val
)
>>
8
)
|
rest
)
>>
54
);
sout
[
13
]
=
(
float
)((
int64_t
)((
0x3FF0000000000000
&
val
)
<<
2
)
>>
54
);
sout
[
14
]
=
(
float
)((
int64_t
)((
0x000FFC0000000000
&
val
)
<<
12
)
>>
54
);
sout
[
15
]
=
(
float
)((
int64_t
)((
0x000003FF00000000
&
val
)
<<
22
)
>>
54
);
sout
[
16
]
=
(
float
)((
int64_t
)((
0x00000000FFC00000
&
val
)
<<
32
)
>>
54
);
sout
[
17
]
=
(
float
)((
int64_t
)((
0x00000000003FF000
&
val
)
<<
42
)
>>
54
);
sout
[
18
]
=
(
float
)((
int64_t
)((
0x0000000000000FFC
&
val
)
<<
52
)
>>
54
);
rest
=
(
0x0000000000000003
&
val
)
<<
62
;
val
=
swap64
(
tmp_in
[
5
*
threadIdx
.
x
+
3
]);
sout
[
19
]
=
(
float
)((
int64_t
)(((
0xFF00000000000000
&
val
)
>>
2
)
|
rest
)
>>
54
);
sout
[
20
]
=
(
float
)((
int64_t
)((
0x00FFC00000000000
&
val
)
<<
8
)
>>
54
);
sout
[
21
]
=
(
float
)((
int64_t
)((
0x00003FF000000000
&
val
)
<<
18
)
>>
54
);
sout
[
22
]
=
(
float
)((
int64_t
)((
0x0000000FFC000000
&
val
)
<<
28
)
>>
54
);
sout
[
23
]
=
(
float
)((
int64_t
)((
0x0000000003FF0000
&
val
)
<<
38
)
>>
54
);
sout
[
24
]
=
(
float
)((
int64_t
)((
0x000000000000FFC0
&
val
)
<<
48
)
>>
54
);
rest
=
(
0x000000000000003F
&
val
)
<<
58
;
val
=
swap64
(
tmp_in
[
5
*
threadIdx
.
x
+
4
]);
sout
[
25
]
=
(
float
)((
int64_t
)(((
0xF000000000000000
&
val
)
>>
6
)
|
rest
)
>>
54
);
sout
[
26
]
=
(
float
)((
int64_t
)((
0x0FFC000000000000
&
val
)
<<
4
)
>>
54
);
sout
[
27
]
=
(
float
)((
int64_t
)((
0x0003FF0000000000
&
val
)
<<
14
)
>>
54
);
sout
[
28
]
=
(
float
)((
int64_t
)((
0x000000FFC0000000
&
val
)
<<
24
)
>>
54
);
sout
[
29
]
=
(
float
)((
int64_t
)((
0x000000003FF00000
&
val
)
<<
34
)
>>
54
);
sout
[
30
]
=
(
float
)((
int64_t
)((
0x00000000000FFC00
&
val
)
<<
44
)
>>
54
);
sout
[
31
]
=
(
float
)((
int64_t
)((
0x00000000000003FF
&
val
)
<<
54
)
>>
54
);
rest
=
0
;
__syncthreads
();
size_t
block_write_start
=
block_idx
*
EDD_NTHREADS_UNPACK10
*
32
;
for
(
size_t
ii
=
threadIdx
.
x
;
ii
<
32
*
EDD_NTHREADS_UNPACK10
;
ii
+=
blockDim
.
x
)
{
out
[
block_write_start
+
ii
]
=
tmp_out
[
ii
];
}
block_idx
+=
gridDim
.
x
;
__syncthreads
();
}
}
__global__
void
unpack_edd_12bit_to_float32
(
uint64_t
const
*
__restrict__
in
,
float
*
__restrict__
out
,
int
n
)
{
...
...
@@ -139,6 +227,17 @@ void Unpacker::unpack<8>(const uint64_t* input, float* output, size_t size)
input
,
output
,
size
);
}
template
<
>
void
Unpacker
::
unpack
<
10
>
(
const
uint64_t
*
input
,
float
*
output
,
size_t
size
)
{
BOOST_LOG_TRIVIAL
(
debug
)
<<
"Unpacking 10-bit data"
;
int
nblocks
=
size
/
EDD_NTHREADS_UNPACK10
;
kernels
::
unpack_edd_10bit_to_float32
<<<
nblocks
,
EDD_NTHREADS_UNPACK10
,
0
,
_stream
>>>
(
input
,
output
,
size
);
}
}
//namespace edd
}
//namespace effelsberg
}
//namespace psrdada_cpp
psrdada_cpp/effelsberg/edd/test/src/UnpackerTester.cu
View file @
f9ab54d5
...
...
@@ -110,7 +110,7 @@ void UnpackerTester::unpacker_10_to_32_c_reference(
static_cast
<
int64_t
>
((
0x0000000000FFC000
&
val
)
<<
40
)
>>
54
));
output
.
push_back
(
static_cast
<
float
>
(
static_cast
<
int64_t
>
((
0x0000000000003FF0
&
val
)
<<
50
)
>>
54
));
rest
=
(
0x000000000000000F
&
val
)
<<
60
;
// 4 bits rest.
rest
=
(
0x000000000000000F
&
val
)
<<
60
;
// 4 bits rest.
val
=
be64toh
(
input
[
ii
+
1
]);
output
.
push_back
(
static_cast
<
float
>
(
static_cast
<
int64_t
>
(((
0xFC00000000000000
&
val
)
>>
4
)
|
rest
)
>>
54
));
...
...
@@ -205,10 +205,11 @@ void UnpackerTester::compare_against_host(
OutputType
const
&
host_output
)
{
OutputType
copy_from_gpu
=
gpu_output
;
cudaDeviceSynchronize
();
ASSERT_EQ
(
host_output
.
size
(),
copy_from_gpu
.
size
());
for
(
std
::
size_t
ii
=
0
;
ii
<
host_output
.
size
();
++
ii
)
{
ASSER
T_EQ
(
host_output
[
ii
],
copy_from_gpu
[
ii
]);
EXPEC
T_EQ
(
host_output
[
ii
],
copy_from_gpu
[
ii
])
<<
" ii = "
<<
ii
;
}
}
...
...
@@ -240,7 +241,7 @@ TEST_F(UnpackerTester, 8_bit_unpack_test)
InputType
host_input
(
n
);
for
(
size_t
ii
=
0
;
ii
<
n
;
++
ii
)
{
host_input
[
ii
]
=
distribution
(
generator
);
host_input
[
ii
]
=
distribution
(
generator
);
}
Unpacker
::
InputType
gpu_input
=
host_input
;
Unpacker
::
OutputType
gpu_output
;
...
...
@@ -253,6 +254,29 @@ TEST_F(UnpackerTester, 8_bit_unpack_test)
}
TEST_F
(
UnpackerTester
,
10
_bit_unpack_test
)
{
std
::
size_t
n
=
640
;
std
::
default_random_engine
generator
;
std
::
uniform_int_distribution
<
int
>
distribution
(
1
,
1
<<
31
);
InputType
host_input
(
n
);
for
(
size_t
ii
=
0
;
ii
<
n
;
++
ii
)
{
host_input
[
ii
]
=
distribution
(
generator
);
}
Unpacker
::
InputType
gpu_input
=
host_input
;
Unpacker
::
OutputType
gpu_output
;
gpu_output
.
resize
(
host_input
.
size
()
*
sizeof
(
host_input
[
0
])
*
8
/
10
);
OutputType
host_output
;
Unpacker
unpacker
(
_stream
);
unpacker
.
unpack
<
10
>
(
gpu_input
,
gpu_output
);
unpacker_10_to_32_c_reference
(
host_input
,
host_output
);
compare_against_host
(
gpu_output
,
host_output
);
}
}
//namespace test
}
//namespace edd
}
//namespace meerkat
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment